apple/xnu.git: osfmk/i386/locks_i386.c (blob at commit 5f693ff515312a7cfad3019412468b2de8082e63)
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64 #define ATOMIC_PRIVATE 1
65 #define LOCK_PRIVATE 1
66
67 #include <mach_ldebug.h>
68
69 #include <kern/lock_stat.h>
70 #include <kern/locks.h>
71 #include <kern/kalloc.h>
72 #include <kern/misc_protos.h>
73 #include <kern/thread.h>
74 #include <kern/processor.h>
75 #include <kern/cpu_data.h>
76 #include <kern/cpu_number.h>
77 #include <kern/sched_prim.h>
78 #include <kern/xpr.h>
79 #include <kern/debug.h>
80 #include <string.h>
81
82 #include <i386/machine_routines.h> /* machine_timeout_suspended() */
83 #include <machine/atomic.h>
84 #include <machine/machine_cpu.h>
85 #include <i386/mp.h>
87 #include <sys/kdebug.h>
88 #include <i386/locks_i386_inlines.h>
89
90 #if CONFIG_DTRACE
91 #define DTRACE_RW_SHARED 0x0 //reader
92 #define DTRACE_RW_EXCL 0x1 //writer
93 #define DTRACE_NO_FLAG 0x0 //not applicable
94 #endif /* CONFIG_DTRACE */
95
96 #define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97 #define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98 #define LCK_RW_LCK_SHARED_CODE 0x102
99 #define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100 #define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101 #define LCK_RW_LCK_EX_TO_SH_CODE 0x105
102
103 #define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104 #define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105 #define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106 #define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107 #define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108 #define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109 #define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110 #define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
111
112
113 #define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
114
115 unsigned int LcksOpts = 0;
116
117 #if DEVELOPMENT || DEBUG
118 unsigned int LckDisablePreemptCheck = 0;
119 #endif
120
121 /* Forwards */
122
123 #if USLOCK_DEBUG
124 /*
125 * Perform simple lock checks.
126 */
127 int uslock_check = 1;
128 int max_lock_loops = 100000000;
129 decl_simple_lock_data(extern, printf_lock)
130 decl_simple_lock_data(extern, panic_lock)
131 #endif /* USLOCK_DEBUG */
132
133 extern unsigned int not_in_kdp;
134
135 /*
136 * We often want to know the addresses of the callers
137 * of the various lock routines. However, this information
138 * is only used for debugging and statistics.
139 */
140 typedef void *pc_t;
141 #define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
142 #define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
143 #if ANY_LOCK_DEBUG
144 #define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
145 #define DECL_PC(pc) pc_t pc;
146 #else /* ANY_LOCK_DEBUG */
147 #define DECL_PC(pc)
148 #ifdef lint
149 /*
150 * Eliminate lint complaints about unused local pc variables.
151 */
152 #define OBTAIN_PC(pc) ++pc
153 #else /* lint */
154 #define OBTAIN_PC(pc)
155 #endif /* lint */
156 #endif /* ANY_LOCK_DEBUG */
157
158 /*
159 * atomic exchange API is a low level abstraction of the operations
160 * to atomically read, modify, and write a pointer. This abstraction works
161 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
162 * well as the ARM exclusive instructions.
163 *
164 * atomic_exchange_begin() - begin exchange and retrieve current value
165 * atomic_exchange_complete() - conclude an exchange
166 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
167 */
168 static uint32_t
169 atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
170 {
171 uint32_t val;
172
173 (void)ord; // Memory order not used
174 val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
175 *previous = val;
176 return val;
177 }
178
179 static boolean_t
180 atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
181 {
182 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
183 }
184
185 static void
186 atomic_exchange_abort(void)
187 {
188 }
189
190 static boolean_t
191 atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
192 {
193 uint32_t value, prev;
194
195 for (;;) {
196 value = atomic_exchange_begin32(target, &prev, ord);
197 if (value & test_mask) {
198 if (wait) {
199 cpu_pause();
200 } else {
201 atomic_exchange_abort();
202 }
203 return FALSE;
204 }
205 value |= set_mask;
206 if (atomic_exchange_complete32(target, prev, value, ord)) {
207 return TRUE;
208 }
209 }
210 }
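
/*
 * Illustrative sketch (not part of this file's build): a hypothetical helper
 * showing how a caller uses the begin/complete/abort triple described above.
 * Read the current value, decide whether to proceed, abort if there is
 * nothing to do, and retry only when the compare-and-exchange loses a race.
 */
#if 0 /* example only */
static boolean_t
atomic_clear_bits32_example(uint32_t *target, uint32_t clear_mask, enum memory_order ord)
{
	uint32_t value, prev;

	for (;;) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if ((value & clear_mask) == 0) {
			atomic_exchange_abort();	/* nothing to clear, give up the exchange */
			return FALSE;
		}
		value &= ~clear_mask;
		if (atomic_exchange_complete32(target, prev, value, ord)) {
			return TRUE;			/* exchange committed */
		}
		cpu_pause();				/* lost the race, re-read and retry */
	}
}
#endif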
211
212 /*
213 * Portable lock package implementation of usimple_locks.
214 */
215
216 #if USLOCK_DEBUG
217 #define USLDBG(stmt) stmt
218 void usld_lock_init(usimple_lock_t, unsigned short);
219 void usld_lock_pre(usimple_lock_t, pc_t);
220 void usld_lock_post(usimple_lock_t, pc_t);
221 void usld_unlock(usimple_lock_t, pc_t);
222 void usld_lock_try_pre(usimple_lock_t, pc_t);
223 void usld_lock_try_post(usimple_lock_t, pc_t);
224 int usld_lock_common_checks(usimple_lock_t, char *);
225 #else /* USLOCK_DEBUG */
226 #define USLDBG(stmt)
227 #endif /* USLOCK_DEBUG */
228
229 /*
230 * Forward definitions
231 */
232
233 static void lck_rw_lock_shared_gen(lck_rw_t *lck);
234 static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
235 static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
236 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
237 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
238 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
239 void lck_rw_clear_promotions_x86(thread_t thread);
240 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
241 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
242 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
243 static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
244 static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
245 static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
246 static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
247 static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
248 static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
249 static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
250
251
252 /*
253 * Routine: lck_spin_alloc_init
254 */
255 lck_spin_t *
256 lck_spin_alloc_init(
257 lck_grp_t *grp,
258 lck_attr_t *attr)
259 {
260 lck_spin_t *lck;
261
262 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
263 lck_spin_init(lck, grp, attr);
264 }
265
266 return lck;
267 }
268
269 /*
270 * Routine: lck_spin_free
271 */
272 void
273 lck_spin_free(
274 lck_spin_t *lck,
275 lck_grp_t *grp)
276 {
277 lck_spin_destroy(lck, grp);
278 kfree(lck, sizeof(lck_spin_t));
279 }
280
281 /*
282 * Routine: lck_spin_init
283 */
284 void
285 lck_spin_init(
286 lck_spin_t *lck,
287 lck_grp_t *grp,
288 __unused lck_attr_t *attr)
289 {
290 usimple_lock_init((usimple_lock_t) lck, 0);
291 lck_grp_reference(grp);
292 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
293 }
294
295 /*
296 * Routine: lck_spin_destroy
297 */
298 void
299 lck_spin_destroy(
300 lck_spin_t *lck,
301 lck_grp_t *grp)
302 {
303 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
304 return;
305 }
306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
307 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
308 lck_grp_deallocate(grp);
309 return;
310 }
311
312 /*
313 * Routine: lck_spin_lock
314 */
315 void
316 lck_spin_lock_grp(
317 lck_spin_t *lck,
318 lck_grp_t *grp)
319 {
320 #pragma unused(grp)
321 usimple_lock((usimple_lock_t) lck, grp);
322 }
323
324 void
325 lck_spin_lock(
326 lck_spin_t *lck)
327 {
328 usimple_lock((usimple_lock_t) lck, NULL);
329 }
330
331 /*
332 * Routine: lck_spin_unlock
333 */
334 void
335 lck_spin_unlock(
336 lck_spin_t *lck)
337 {
338 usimple_unlock((usimple_lock_t) lck);
339 }
340
341 boolean_t
342 lck_spin_try_lock_grp(
343 lck_spin_t *lck,
344 lck_grp_t *grp)
345 {
346 #pragma unused(grp)
347 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
348 #if DEVELOPMENT || DEBUG
349 if (lrval) {
350 pltrace(FALSE);
351 }
352 #endif
353 return lrval;
354 }
355
356
357 /*
358 * Routine: lck_spin_try_lock
359 */
360 boolean_t
361 lck_spin_try_lock(
362 lck_spin_t *lck)
363 {
364 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
365 #if DEVELOPMENT || DEBUG
366 if (lrval) {
367 pltrace(FALSE);
368 }
369 #endif
370 return lrval;
371 }
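
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * walking the spin-lock lifecycle exported above. The function and group
 * names are assumptions for the example.
 */
#if 0 /* example only */
static void
spinlock_usage_example(void)
{
	lck_grp_t  *grp  = lck_grp_alloc_init("example.spin", LCK_GRP_ATTR_NULL);
	lck_spin_t *lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);			/* returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	lck_spin_unlock(lock);

	if (lck_spin_try_lock(lock)) {		/* non-blocking attempt */
		lck_spin_unlock(lock);
	}

	lck_spin_free(lock, grp);
	lck_grp_free(grp);
}
#endif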
372
373 /*
374 * Routine: lck_spin_assert
375 */
376 void
377 lck_spin_assert(lck_spin_t *lock, unsigned int type)
378 {
379 thread_t thread, holder;
380 uintptr_t state;
381
382 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
383 panic("lck_spin_assert(): invalid arg (%u)", type);
384 }
385
386 state = lock->interlock;
387 holder = (thread_t)state;
388 thread = current_thread();
389 if (type == LCK_ASSERT_OWNED) {
390 if (__improbable(holder == THREAD_NULL)) {
391 panic("Lock not owned %p = %lx", lock, state);
392 }
393 if (__improbable(holder != thread)) {
394 panic("Lock not owned by current thread %p = %lx", lock, state);
395 }
396 } else if (type == LCK_ASSERT_NOTOWNED) {
397 if (__improbable(holder != THREAD_NULL)) {
398 if (holder == thread) {
399 panic("Lock owned by current thread %p = %lx", lock, state);
400 } else {
401 panic("Lock %p owned by thread %p", lock, holder);
402 }
403 }
404 }
405 }
406
407 /*
408 * Routine: kdp_lck_spin_is_acquired
409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
410 * Returns: TRUE if lock is acquired.
411 */
412 boolean_t
413 kdp_lck_spin_is_acquired(lck_spin_t *lck)
414 {
415 if (not_in_kdp) {
416 panic("panic: spinlock acquired check done outside of kernel debugger");
417 }
418 return (lck->interlock != 0)? TRUE : FALSE;
419 }
420
421 /*
422 * Initialize a usimple_lock.
423 *
424 * No change in preemption state.
425 */
426 void
427 usimple_lock_init(
428 usimple_lock_t l,
429 __unused unsigned short tag)
430 {
431 #ifndef MACHINE_SIMPLE_LOCK
432 USLDBG(usld_lock_init(l, tag));
433 hw_lock_init(&l->interlock);
434 #else
435 simple_lock_init((simple_lock_t)l, tag);
436 #endif
437 }
438
439 volatile uint32_t spinlock_owner_cpu = ~0;
440 volatile usimple_lock_t spinlock_timed_out;
441
442 uint32_t
443 spinlock_timeout_NMI(uintptr_t thread_addr)
444 {
445 uint32_t i;
446
447 for (i = 0; i < real_ncpus; i++) {
448 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
449 spinlock_owner_cpu = i;
450 if ((uint32_t) cpu_number() != i) {
451 /* Cause NMI and panic on the owner's cpu */
452 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
453 }
454 break;
455 }
456 }
457
458 return spinlock_owner_cpu;
459 }
460
461 /*
462 * Acquire a usimple_lock.
463 *
464 * Returns with preemption disabled. Note
465 * that the hw_lock routines are responsible for
466 * maintaining preemption state.
467 */
468 void
469 (usimple_lock)(
470 usimple_lock_t l
471 LCK_GRP_ARG(lck_grp_t *grp))
472 {
473 #ifndef MACHINE_SIMPLE_LOCK
474 DECL_PC(pc);
475
476 OBTAIN_PC(pc);
477 USLDBG(usld_lock_pre(l, pc));
478
479 if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
480 boolean_t uslock_acquired = FALSE;
481 while (machine_timeout_suspended()) {
482 enable_preemption();
483 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
484 break;
485 }
486 }
487
488 if (uslock_acquired == FALSE) {
489 uint32_t lock_cpu;
490 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
491 spinlock_timed_out = l;
492 lock_cpu = spinlock_timeout_NMI(lowner);
493 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
494 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
495 }
496 }
497 #if DEVELOPMENT || DEBUG
498 pltrace(FALSE);
499 #endif
500
501 USLDBG(usld_lock_post(l, pc));
502 #else
503 simple_lock((simple_lock_t)l, grp);
504 #endif
505 #if CONFIG_DTRACE
506 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
507 #endif
508 }
509
510
511 /*
512 * Release a usimple_lock.
513 *
514 * Returns with preemption enabled. Note
515 * that the hw_lock routines are responsible for
516 * maintaining preemption state.
517 */
518 void
519 usimple_unlock(
520 usimple_lock_t l)
521 {
522 #ifndef MACHINE_SIMPLE_LOCK
523 DECL_PC(pc);
524
525 OBTAIN_PC(pc);
526 USLDBG(usld_unlock(l, pc));
527 #if DEVELOPMENT || DEBUG
528 pltrace(TRUE);
529 #endif
530 hw_lock_unlock(&l->interlock);
531 #else
532 simple_unlock_rwmb((simple_lock_t)l);
533 #endif
534 }
535
536
537 /*
538 * Conditionally acquire a usimple_lock.
539 *
540 * On success, returns with preemption disabled.
541 * On failure, returns with preemption in the same state
542 * as when first invoked. Note that the hw_lock routines
543 * are responsible for maintaining preemption state.
544 *
545 * XXX No stats are gathered on a miss; I preserved this
546 * behavior from the original assembly-language code, but
547 * doesn't it make sense to log misses? XXX
548 */
549 unsigned int
550 usimple_lock_try(
551 usimple_lock_t l,
552 lck_grp_t *grp)
553 {
554 #ifndef MACHINE_SIMPLE_LOCK
555 unsigned int success;
556 DECL_PC(pc);
557
558 OBTAIN_PC(pc);
559 USLDBG(usld_lock_try_pre(l, pc));
560 if ((success = hw_lock_try(&l->interlock, grp))) {
561 #if DEVELOPMENT || DEBUG
562 pltrace(FALSE);
563 #endif
564 USLDBG(usld_lock_try_post(l, pc));
565 }
566 return success;
567 #else
568 return simple_lock_try((simple_lock_t)l, grp);
569 #endif
570 }
571
572 /*
573 * Acquire a usimple_lock while polling for pending TLB flushes
574 * and spinning on a lock.
575 *
576 */
577 void
578 usimple_lock_try_lock_loop(usimple_lock_t l, lck_grp_t *grp)
579 {
580 boolean_t istate = ml_get_interrupts_enabled();
581 while (!simple_lock_try(l, grp)) {
582 if (!istate) {
583 handle_pending_TLB_flushes();
584 }
585 cpu_pause();
586 }
587 }
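
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * pairing usimple_lock()/usimple_unlock() as described above. Acquire returns
 * with preemption disabled and release re-enables it, so the critical section
 * must stay short and must not block. Names here are assumptions.
 */
#if 0 /* example only */
decl_simple_lock_data(static, example_usl)

static void
usimple_lock_usage_example(lck_grp_t *grp)
{
	usimple_lock_init(&example_usl, 0);

	usimple_lock(&example_usl, grp);	/* spins; returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	usimple_unlock(&example_usl);		/* preemption re-enabled */

	if (usimple_lock_try(&example_usl, grp)) {
		usimple_unlock(&example_usl);
	}
	/* on a failed try, preemption is left in its original state */
}
#endif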
588
589 #if USLOCK_DEBUG
590 /*
591 * States of a usimple_lock. The default when initializing
592 * a usimple_lock is setting it up for debug checking.
593 */
594 #define USLOCK_CHECKED 0x0001 /* lock is being checked */
595 #define USLOCK_TAKEN 0x0002 /* lock has been taken */
596 #define USLOCK_INIT 0xBAA0 /* lock has been initialized */
597 #define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
598 #define USLOCK_CHECKING(l) (uslock_check && \
599 ((l)->debug.state & USLOCK_CHECKED))
600
601 /*
602 * Trace activities of a particularly interesting lock.
603 */
604 void usl_trace(usimple_lock_t, int, pc_t, const char *);
605
606
607 /*
608 * Initialize the debugging information contained
609 * in a usimple_lock.
610 */
611 void
612 usld_lock_init(
613 usimple_lock_t l,
614 __unused unsigned short tag)
615 {
616 if (l == USIMPLE_LOCK_NULL) {
617 panic("lock initialization: null lock pointer");
618 }
619 l->lock_type = USLOCK_TAG;
620 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
621 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
622 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
623 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
624 l->debug.duration[0] = l->debug.duration[1] = 0;
628 }
629
630
631 /*
632 * These checks apply to all usimple_locks, not just
633 * those with USLOCK_CHECKED turned on.
634 */
635 int
636 usld_lock_common_checks(
637 usimple_lock_t l,
638 char *caller)
639 {
640 if (l == USIMPLE_LOCK_NULL) {
641 panic("%s: null lock pointer", caller);
642 }
643 if (l->lock_type != USLOCK_TAG) {
644 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
645 }
646 if (!(l->debug.state & USLOCK_INIT)) {
647 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
648 }
649 return USLOCK_CHECKING(l);
650 }
651
652
653 /*
654 * Debug checks on a usimple_lock just before attempting
655 * to acquire it.
656 */
657 /* ARGSUSED */
658 void
659 usld_lock_pre(
660 usimple_lock_t l,
661 pc_t pc)
662 {
663 char caller[] = "usimple_lock";
664
665
666 if (!usld_lock_common_checks(l, caller)) {
667 return;
668 }
669
670 /*
671 * Note that we have a weird case where we are getting a lock when we are
672 * in the process of putting the system to sleep. We are running with no
673 * current threads, therefore we can't tell if we are trying to retake a lock
674 * we already hold or whether another processor holds it. Therefore we just
675 * ignore this test if the locking thread is 0.
676 */
677
678 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
679 l->debug.lock_thread == (void *) current_thread()) {
680 printf("%s: lock %p already locked (at %p) by",
681 caller, l, l->debug.lock_pc);
682 printf(" current thread %p (new attempt at pc %p)\n",
683 l->debug.lock_thread, pc);
684 panic("%s", caller);
685 }
686 mp_disable_preemption();
687 usl_trace(l, cpu_number(), pc, caller);
688 mp_enable_preemption();
689 }
690
691
692 /*
693 * Debug checks on a usimple_lock just after acquiring it.
694 *
695 * Pre-emption has been disabled at this point,
696 * so we are safe in using cpu_number.
697 */
698 void
699 usld_lock_post(
700 usimple_lock_t l,
701 pc_t pc)
702 {
703 int mycpu;
704 char caller[] = "successful usimple_lock";
705
706
707 if (!usld_lock_common_checks(l, caller)) {
708 return;
709 }
710
711 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
712 panic("%s: lock %p became uninitialized",
713 caller, l);
714 }
715 if ((l->debug.state & USLOCK_TAKEN)) {
716 panic("%s: lock 0x%p became TAKEN by someone else",
717 caller, l);
718 }
719
720 mycpu = cpu_number();
721 l->debug.lock_thread = (void *)current_thread();
722 l->debug.state |= USLOCK_TAKEN;
723 l->debug.lock_pc = pc;
724 l->debug.lock_cpu = mycpu;
725
726 usl_trace(l, mycpu, pc, caller);
727 }
728
729
730 /*
731 * Debug checks on a usimple_lock just before
732 * releasing it. Note that the caller has not
733 * yet released the hardware lock.
734 *
735 * Preemption is still disabled, so there's
736 * no problem using cpu_number.
737 */
738 void
739 usld_unlock(
740 usimple_lock_t l,
741 pc_t pc)
742 {
743 int mycpu;
744 char caller[] = "usimple_unlock";
745
746
747 if (!usld_lock_common_checks(l, caller)) {
748 return;
749 }
750
751 mycpu = cpu_number();
752
753 if (!(l->debug.state & USLOCK_TAKEN)) {
754 panic("%s: lock 0x%p hasn't been taken",
755 caller, l);
756 }
757 if (l->debug.lock_thread != (void *) current_thread()) {
758 panic("%s: unlocking lock 0x%p, owned by thread %p",
759 caller, l, l->debug.lock_thread);
760 }
761 if (l->debug.lock_cpu != mycpu) {
762 printf("%s: unlocking lock 0x%p on cpu 0x%x",
763 caller, l, mycpu);
764 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
765 panic("%s", caller);
766 }
767 usl_trace(l, mycpu, pc, caller);
768
769 l->debug.unlock_thread = l->debug.lock_thread;
770 l->debug.lock_thread = INVALID_THREAD;
771 l->debug.state &= ~USLOCK_TAKEN;
772 l->debug.unlock_pc = pc;
773 l->debug.unlock_cpu = mycpu;
774 }
775
776
777 /*
778 * Debug checks on a usimple_lock just before
779 * attempting to acquire it.
780 *
781 * Preemption isn't guaranteed to be disabled.
782 */
783 void
784 usld_lock_try_pre(
785 usimple_lock_t l,
786 pc_t pc)
787 {
788 char caller[] = "usimple_lock_try";
789
790 if (!usld_lock_common_checks(l, caller)) {
791 return;
792 }
793 mp_disable_preemption();
794 usl_trace(l, cpu_number(), pc, caller);
795 mp_enable_preemption();
796 }
797
798
799 /*
800 * Debug checks on a usimple_lock just after
801 * successfully attempting to acquire it.
802 *
803 * Preemption has been disabled by the
804 * lock acquisition attempt, so it's safe
805 * to use cpu_number.
806 */
807 void
808 usld_lock_try_post(
809 usimple_lock_t l,
810 pc_t pc)
811 {
812 int mycpu;
813 char caller[] = "successful usimple_lock_try";
814
815 if (!usld_lock_common_checks(l, caller)) {
816 return;
817 }
818
819 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
820 panic("%s: lock 0x%p became uninitialized",
821 caller, l);
822 }
823 if ((l->debug.state & USLOCK_TAKEN)) {
824 panic("%s: lock 0x%p became TAKEN by someone else",
825 caller, l);
826 }
827
828 mycpu = cpu_number();
829 l->debug.lock_thread = (void *) current_thread();
830 l->debug.state |= USLOCK_TAKEN;
831 l->debug.lock_pc = pc;
832 l->debug.lock_cpu = mycpu;
833
834 usl_trace(l, mycpu, pc, caller);
835 }
836
837
838 /*
839 * For very special cases, set traced_lock to point to a
840 * specific lock of interest. The result is a series of
841 * XPRs showing lock operations on that lock. The lock_seq
842 * value is used to show the order of those operations.
843 */
844 usimple_lock_t traced_lock;
845 unsigned int lock_seq;
846
847 void
848 usl_trace(
849 usimple_lock_t l,
850 int mycpu,
851 pc_t pc,
852 const char * op_name)
853 {
854 if (traced_lock == l) {
855 XPR(XPR_SLOCK,
856 "seq %d, cpu %d, %s @ %x\n",
857 (uintptr_t) lock_seq, (uintptr_t) mycpu,
858 (uintptr_t) op_name, (uintptr_t) pc, 0);
859 lock_seq++;
860 }
861 }
862
863
864 #endif /* USLOCK_DEBUG */
865
866 /*
867 * Routine: lck_rw_alloc_init
868 */
869 lck_rw_t *
870 lck_rw_alloc_init(
871 lck_grp_t *grp,
872 lck_attr_t *attr)
873 {
874 lck_rw_t *lck;
875
876 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
877 bzero(lck, sizeof(lck_rw_t));
878 lck_rw_init(lck, grp, attr);
879 }
880
881 return lck;
882 }
883
884 /*
885 * Routine: lck_rw_free
886 */
887 void
888 lck_rw_free(
889 lck_rw_t *lck,
890 lck_grp_t *grp)
891 {
892 lck_rw_destroy(lck, grp);
893 kfree(lck, sizeof(lck_rw_t));
894 }
895
896 /*
897 * Routine: lck_rw_init
898 */
899 void
900 lck_rw_init(
901 lck_rw_t *lck,
902 lck_grp_t *grp,
903 lck_attr_t *attr)
904 {
905 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
906 attr : &LockDefaultLckAttr;
907
908 hw_lock_byte_init(&lck->lck_rw_interlock);
909 lck->lck_rw_want_write = FALSE;
910 lck->lck_rw_want_upgrade = FALSE;
911 lck->lck_rw_shared_count = 0;
912 lck->lck_rw_can_sleep = TRUE;
913 lck->lck_r_waiting = lck->lck_w_waiting = 0;
914 lck->lck_rw_tag = 0;
915 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
916 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
917
918 lck_grp_reference(grp);
919 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
920 }
921
922 /*
923 * Routine: lck_rw_destroy
924 */
925 void
926 lck_rw_destroy(
927 lck_rw_t *lck,
928 lck_grp_t *grp)
929 {
930 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
931 return;
932 }
933 #if MACH_LDEBUG
934 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
935 #endif
936 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
937 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
938 lck_grp_deallocate(grp);
939 return;
940 }
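
/*
 * Illustrative sketch (not part of this file's build): a hypothetical caller
 * walking the rw-lock lifecycle: allocate against a group, take the lock
 * shared for readers and exclusive for writers, then free it. The group name
 * is an assumption for the example.
 */
#if 0 /* example only */
static void
rwlock_usage_example(void)
{
	lck_grp_t *grp  = lck_grp_alloc_init("example.rw", LCK_GRP_ATTR_NULL);
	lck_rw_t  *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lock);		/* many readers may hold this concurrently */
	/* ... read-only access ... */
	lck_rw_unlock_shared(lock);

	lck_rw_lock_exclusive(lock);		/* single writer, no readers */
	/* ... modify the protected state ... */
	lck_rw_unlock_exclusive(lock);

	lck_rw_free(lock, grp);
	lck_grp_free(grp);
}
#endif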
941
942 /*
943 * Sleep locks. These use the same data structure and algorithm
944 * as the spin locks, but the process sleeps while it is waiting
945 * for the lock. These work on uniprocessor systems.
946 */
947
948 #define DECREMENTER_TIMEOUT 1000000
949
950 /*
951 * We disable interrupts while holding the RW interlock to prevent an
952 * interrupt from exacerbating hold time.
953 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
954 */
955 static inline boolean_t
956 lck_interlock_lock(lck_rw_t *lck)
957 {
958 boolean_t istate;
959
960 istate = ml_set_interrupts_enabled(FALSE);
961 hw_lock_byte_lock(&lck->lck_rw_interlock);
962 return istate;
963 }
964
965 static inline void
966 lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
967 {
968 hw_lock_byte_unlock(&lck->lck_rw_interlock);
969 ml_set_interrupts_enabled(istate);
970 }
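
/*
 * Illustrative sketch (not part of this file's build): the pattern the slow
 * paths below follow when mutating lock state behind the interlock. Take the
 * interlock with interrupts disabled, make the change, then drop the
 * interlock and restore the saved interrupt state.
 */
#if 0 /* example only */
static void
rw_set_writer_waiting_example(lck_rw_t *lck)
{
	boolean_t istate;

	istate = lck_interlock_lock(lck);	/* interrupts off, interlock held */
	lck->lck_w_waiting = TRUE;		/* state change observed atomically by other CPUs */
	lck_interlock_unlock(lck, istate);	/* interlock dropped, interrupts restored */
}
#endif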
971
972 /*
973 * This inline is used when busy-waiting for an rw lock.
974 * If interrupts were disabled when the lock primitive was called,
975 * we poll the IPI handler for pending tlb flushes.
976 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
977 */
978 static inline void
979 lck_rw_lock_pause(boolean_t interrupts_enabled)
980 {
981 if (!interrupts_enabled) {
982 handle_pending_TLB_flushes();
983 }
984 cpu_pause();
985 }
986
987 static inline boolean_t
988 lck_rw_held_read_or_upgrade(lck_rw_t *lock)
989 {
990 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
991 return TRUE;
992 }
993 return FALSE;
994 }
995
996 /*
997 * compute the deadline to spin against when
998 * waiting for a change of state on a lck_rw_t
999 */
1000 static inline uint64_t
1001 lck_rw_deadline_for_spin(lck_rw_t *lck)
1002 {
1003 if (lck->lck_rw_can_sleep) {
1004 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1005 /*
1006 * there are already threads waiting on this lock... this
1007 * implies that they have spun beyond their deadlines waiting for
1008 * the desired state to show up so we will not bother spinning at this time...
1009 * or
1010 * the current number of threads sharing this lock exceeds our capacity to run them
1011 * concurrently and since all states we're going to spin for require the rw_shared_count
1012 * to be at 0, we'll not bother spinning since the latency for this to happen is
1013 * unpredictable...
1014 */
1015 return mach_absolute_time();
1016 }
1017 return mach_absolute_time() + MutexSpin;
1018 } else {
1019 return mach_absolute_time() + (1LL * 1000000000LL);
1020 }
1021 }
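
/*
 * Illustrative sketch (not part of this file's build): how the slow paths
 * below use the deadline. Spin (polling for TLB flushes if interrupts are
 * off) until the desired state shows up or the deadline expires, and only
 * then consider blocking.
 */
#if 0 /* example only */
static boolean_t
rw_spin_until_unheld_example(lck_rw_t *lck, boolean_t istate)
{
	uint64_t deadline = lck_rw_deadline_for_spin(lck);

	while (lck_rw_held_read_or_upgrade(lck)) {
		if (mach_absolute_time() >= deadline) {
			return FALSE;		/* give up spinning; caller may block */
		}
		lck_rw_lock_pause(istate);	/* cpu_pause() plus TLB-flush polling */
	}
	return TRUE;				/* desired state showed up while spinning */
}
#endif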
1022
1023
1024 /*
1025 * Spin while interlock is held.
1026 */
1027
1028 static inline void
1029 lck_rw_interlock_spin(lck_rw_t *lock)
1030 {
1031 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1032 cpu_pause();
1033 }
1034 }
1035
1036 static boolean_t
1037 lck_rw_grab_want(lck_rw_t *lock)
1038 {
1039 uint32_t data, prev;
1040
1041 for (;;) {
1042 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
1043 if ((data & LCK_RW_INTERLOCK) == 0) {
1044 break;
1045 }
1046 atomic_exchange_abort();
1047 lck_rw_interlock_spin(lock);
1048 }
1049 if (data & LCK_RW_WANT_WRITE) {
1050 atomic_exchange_abort();
1051 return FALSE;
1052 }
1053 data |= LCK_RW_WANT_WRITE;
1054 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1055 }
1056
1057 static boolean_t
1058 lck_rw_grab_shared(lck_rw_t *lock)
1059 {
1060 uint32_t data, prev;
1061
1062 for (;;) {
1063 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1064 if ((data & LCK_RW_INTERLOCK) == 0) {
1065 break;
1066 }
1067 atomic_exchange_abort();
1068 lck_rw_interlock_spin(lock);
1069 }
1070 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1071 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1072 atomic_exchange_abort();
1073 return FALSE;
1074 }
1075 }
1076 data += LCK_RW_SHARED_READER;
1077 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1078 }
1079
1080 /*
1081 * Routine: lck_rw_lock_exclusive
1082 */
1083 static void
1084 lck_rw_lock_exclusive_gen(
1085 lck_rw_t *lck)
1086 {
1087 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1088 uint64_t deadline = 0;
1089 int slept = 0;
1090 int gotlock = 0;
1091 int lockheld = 0;
1092 wait_result_t res = 0;
1093 boolean_t istate = -1;
1094
1095 #if CONFIG_DTRACE
1096 boolean_t dtrace_ls_initialized = FALSE;
1097 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1098 uint64_t wait_interval = 0;
1099 int readers_at_sleep = 0;
1100 #endif
1101
1102 /*
1103 * Try to acquire the lck_rw_want_write bit.
1104 */
1105 while (!lck_rw_grab_want(lck)) {
1106 #if CONFIG_DTRACE
1107 if (dtrace_ls_initialized == FALSE) {
1108 dtrace_ls_initialized = TRUE;
1109 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1110 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1111 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1112 if (dtrace_ls_enabled) {
1113 /*
1114 * Either sleeping or spinning is happening,
1115 * start a timing of our delay interval now.
1116 */
1117 readers_at_sleep = lck->lck_rw_shared_count;
1118 wait_interval = mach_absolute_time();
1119 }
1120 }
1121 #endif
1122 if (istate == -1) {
1123 istate = ml_get_interrupts_enabled();
1124 }
1125
1126 deadline = lck_rw_deadline_for_spin(lck);
1127
1128 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1129
1130 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
1131 lck_rw_lock_pause(istate);
1132 }
1133
1134 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
1135
1136 if (gotlock) {
1137 break;
1138 }
1139 /*
1140 * if we get here, the deadline has expired w/o us
1141 * being able to grab the lock exclusively
1142 * check to see if we're allowed to do a thread_block
1143 */
1144 if (lck->lck_rw_can_sleep) {
1145 istate = lck_interlock_lock(lck);
1146
1147 if (lck->lck_rw_want_write) {
1148 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1149
1150 lck->lck_w_waiting = TRUE;
1151
1152 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1153 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1154 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1155 lck_interlock_unlock(lck, istate);
1156
1157 if (res == THREAD_WAITING) {
1158 res = thread_block(THREAD_CONTINUE_NULL);
1159 slept++;
1160 }
1161 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1162 } else {
1163 lck->lck_rw_want_write = TRUE;
1164 lck_interlock_unlock(lck, istate);
1165 break;
1166 }
1167 }
1168 }
1169 /*
1170 * Wait for readers (and upgrades) to finish...
1171 * the test for these conditions must be done simultaneously with
1172 * a check of the interlock not being held since
1173 * the rw_shared_count will drop to 0 first and then want_upgrade
1174 * will be set to 1 in the shared_to_exclusive scenario... those
1175 * adjustments are done behind the interlock and represent an
1176 * atomic change in state and must be considered as such...
1177 * however, once we see the read count at 0, the want_upgrade not set
1178 * and the interlock not held, we are safe to proceed
1179 */
1180 while (lck_rw_held_read_or_upgrade(lck)) {
1181 #if CONFIG_DTRACE
1182 /*
1183 * Either sleeping or spinning is happening, start
1184 * a timing of our delay interval now. If we set it
1185 * to -1 we don't have accurate data so we cannot later
1186 * decide to record a dtrace spin or sleep event.
1187 */
1188 if (dtrace_ls_initialized == FALSE) {
1189 dtrace_ls_initialized = TRUE;
1190 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1191 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1192 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1193 if (dtrace_ls_enabled) {
1194 /*
1195 * Either sleeping or spinning is happening,
1196 * start a timing of our delay interval now.
1197 */
1198 readers_at_sleep = lck->lck_rw_shared_count;
1199 wait_interval = mach_absolute_time();
1200 }
1201 }
1202 #endif
1203 if (istate == -1) {
1204 istate = ml_get_interrupts_enabled();
1205 }
1206
1207 deadline = lck_rw_deadline_for_spin(lck);
1208
1209 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1210
1211 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
1212 lck_rw_lock_pause(istate);
1213 }
1214
1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1216
1217 if (!lockheld) {
1218 break;
1219 }
1220 /*
1221 * if we get here, the deadline has expired w/o us
1222 * being able to grab the lock exclusively
1223 * check to see if we're allowed to do a thread_block
1224 */
1225 if (lck->lck_rw_can_sleep) {
1226 istate = lck_interlock_lock(lck);
1227
1228 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1229 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1230
1231 lck->lck_w_waiting = TRUE;
1232
1233 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1234 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1235 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1236 lck_interlock_unlock(lck, istate);
1237
1238 if (res == THREAD_WAITING) {
1239 res = thread_block(THREAD_CONTINUE_NULL);
1240 slept++;
1241 }
1242 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1243 } else {
1244 lck_interlock_unlock(lck, istate);
1245 /*
1246 * must own the lock now, since we checked for
1247 * readers or upgrade owner behind the interlock
1248 * no need for a call to 'lck_rw_held_read_or_upgrade'
1249 */
1250 break;
1251 }
1252 }
1253 }
1254
1255 #if CONFIG_DTRACE
1256 /*
1257 * Decide what latencies we suffered that are Dtrace events.
1258 * If we have set wait_interval, then we either spun or slept.
1259 * At least we get out from under the interlock before we record
1260 * which is the best we can do here to minimize the impact
1261 * of the tracing.
1262 * If we have set wait_interval to -1, then dtrace was not enabled when we
1263 * started sleeping/spinning so we don't record this event.
1264 */
1265 if (dtrace_ls_enabled == TRUE) {
1266 if (slept == 0) {
1267 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1268 mach_absolute_time() - wait_interval, 1);
1269 } else {
1270 /*
1271 * For the blocking case, we also record if when we blocked
1272 * it was held for read or write, and how many readers.
1273 * Notice that above we recorded this before we dropped
1274 * the interlock so the count is accurate.
1275 */
1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1282 #endif
1283 }
1284
1285 /*
1286 * Routine: lck_rw_done
1287 */
1288
1289 lck_rw_type_t
1290 lck_rw_done(lck_rw_t *lock)
1291 {
1292 uint32_t data, prev;
1293
1294 for (;;) {
1295 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1296 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1297 atomic_exchange_abort();
1298 lck_rw_interlock_spin(lock);
1299 continue;
1300 }
1301 if (data & LCK_RW_SHARED_MASK) {
1302 data -= LCK_RW_SHARED_READER;
1303 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
1304 goto check_waiters;
1305 }
1306 } else { /* if reader count == 0, must be exclusive lock */
1307 if (data & LCK_RW_WANT_UPGRADE) {
1308 data &= ~(LCK_RW_WANT_UPGRADE);
1309 } else {
1310 if (data & LCK_RW_WANT_WRITE) {
1311 data &= ~(LCK_RW_WANT_EXCL);
1312 } else { /* lock is not 'owned', panic */
1313 panic("Releasing non-exclusive RW lock without a reader refcount!");
1314 }
1315 }
1316 check_waiters:
1317 if (prev & LCK_RW_W_WAITING) {
1318 data &= ~(LCK_RW_W_WAITING);
1319 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
1320 data &= ~(LCK_RW_R_WAITING);
1321 }
1322 } else {
1323 data &= ~(LCK_RW_R_WAITING);
1324 }
1325 }
1326 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1327 break;
1328 }
1329 cpu_pause();
1330 }
1331 return lck_rw_done_gen(lock, prev);
1332 }
1333
1334 /*
1335 * Routine: lck_rw_done_gen
1336 *
1337 * called from lck_rw_done()
1338 * prior_lock_state is the value in the 1st
1339 * word of the lock at the time of a successful
1340 * atomic compare and exchange with the new value...
1341 * it represents the state of the lock before we
1342 * decremented the rw_shared_count or cleared either
1343 * rw_want_upgrade or rw_want_write and
1344 * the lck_x_waiting bits... since the wrapper
1345 * routine has already changed the state atomically,
1346 * we just need to decide if we should
1347 * wake up anyone and what value to return... we do
1348 * this by examining the state of the lock before
1349 * we changed it
1350 */
1351 static lck_rw_type_t
1352 lck_rw_done_gen(
1353 lck_rw_t *lck,
1354 uint32_t prior_lock_state)
1355 {
1356 lck_rw_t *fake_lck;
1357 lck_rw_type_t lock_type;
1358 thread_t thread;
1359 uint32_t rwlock_count;
1360
1361 thread = current_thread();
1362 rwlock_count = thread->rwlock_count--;
1363 fake_lck = (lck_rw_t *)&prior_lock_state;
1364
1365 if (lck->lck_rw_can_sleep) {
1366 /*
1367 * prior_lock state is a snapshot of the 1st word of the
1368 * lock in question... we'll fake up a pointer to it
1369 * and carefully not access anything beyond whats defined
1370 * in the first word of a lck_rw_t
1371 */
1372
1373 if (fake_lck->lck_rw_shared_count <= 1) {
1374 if (fake_lck->lck_w_waiting) {
1375 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1376 }
1377
1378 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1379 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1380 }
1381 }
1382 #if MACH_LDEBUG
1383 if (rwlock_count == 0) {
1384 panic("rw lock count underflow for thread %p", thread);
1385 }
1386 #endif
1387 /* Check if dropping the lock means that we need to unpromote */
1388
1389 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1390 /* sched_flags checked without lock, but will be rechecked while clearing */
1391 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1392 }
1393 }
1394 if (fake_lck->lck_rw_shared_count) {
1395 lock_type = LCK_RW_TYPE_SHARED;
1396 } else {
1397 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1398 }
1399
1400 #if CONFIG_DTRACE
1401 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1402 #endif
1403
1404 return lock_type;
1405 }
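
/*
 * Illustrative sketch (not part of this file's build): the "fake_lck"
 * snapshot technique used above. The 32-bit prior_lock_state returned by the
 * compare-and-exchange is reinterpreted as the first word of a lck_rw_t so
 * the bitfield names can be used to inspect the pre-release state; nothing
 * beyond that first word may be touched.
 */
#if 0 /* example only */
static boolean_t
rw_prior_state_had_writer_waiting_example(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	return fake_lck->lck_w_waiting ? TRUE : FALSE;
}
#endif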
1406
1407
1408 /*
1409 * Routine: lck_rw_unlock
1410 */
1411 void
1412 lck_rw_unlock(
1413 lck_rw_t *lck,
1414 lck_rw_type_t lck_rw_type)
1415 {
1416 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1417 lck_rw_unlock_shared(lck);
1418 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1419 lck_rw_unlock_exclusive(lck);
1420 } else {
1421 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1422 }
1423 }
1424
1425
1426 /*
1427 * Routine: lck_rw_unlock_shared
1428 */
1429 void
1430 lck_rw_unlock_shared(
1431 lck_rw_t *lck)
1432 {
1433 lck_rw_type_t ret;
1434
1435 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1436 ret = lck_rw_done(lck);
1437
1438 if (ret != LCK_RW_TYPE_SHARED) {
1439 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1440 }
1441 }
1442
1443
1444 /*
1445 * Routine: lck_rw_unlock_exclusive
1446 */
1447 void
1448 lck_rw_unlock_exclusive(
1449 lck_rw_t *lck)
1450 {
1451 lck_rw_type_t ret;
1452
1453 ret = lck_rw_done(lck);
1454
1455 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
1456 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1457 }
1458 }
1459
1460
1461 /*
1462 * Routine: lck_rw_lock
1463 */
1464 void
1465 lck_rw_lock(
1466 lck_rw_t *lck,
1467 lck_rw_type_t lck_rw_type)
1468 {
1469 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1470 lck_rw_lock_shared(lck);
1471 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1472 lck_rw_lock_exclusive(lck);
1473 } else {
1474 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1475 }
1476 }
1477
1478 /*
1479 * Routine: lck_rw_lock_shared
1480 */
1481 void
1482 lck_rw_lock_shared(lck_rw_t *lock)
1483 {
1484 uint32_t data, prev;
1485
1486 current_thread()->rwlock_count++;
1487 for (;;) {
1488 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1489 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1490 atomic_exchange_abort();
1491 if (lock->lck_rw_can_sleep) {
1492 lck_rw_lock_shared_gen(lock);
1493 } else {
1494 cpu_pause();
1495 continue;
1496 }
1497 break;
1498 }
1499 data += LCK_RW_SHARED_READER;
1500 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1501 break;
1502 }
1503 cpu_pause();
1504 }
1505 #if CONFIG_DTRACE
1506 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1507 #endif /* CONFIG_DTRACE */
1508 return;
1509 }
1510
1511 /*
1512 * Routine: lck_rw_lock_shared_gen
1513 * Function:
1514 * assembly fast path code has determined that this lock
1515 * is held exclusively... this is where we spin/block
1516 * until we can acquire the lock in the shared mode
1517 */
1518 static void
1519 lck_rw_lock_shared_gen(
1520 lck_rw_t *lck)
1521 {
1522 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1523 uint64_t deadline = 0;
1524 int gotlock = 0;
1525 int slept = 0;
1526 wait_result_t res = 0;
1527 boolean_t istate = -1;
1528
1529 #if CONFIG_DTRACE
1530 uint64_t wait_interval = 0;
1531 int readers_at_sleep = 0;
1532 boolean_t dtrace_ls_initialized = FALSE;
1533 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1534 #endif
1535
1536 while (!lck_rw_grab_shared(lck)) {
1537 #if CONFIG_DTRACE
1538 if (dtrace_ls_initialized == FALSE) {
1539 dtrace_ls_initialized = TRUE;
1540 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1541 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1542 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1543 if (dtrace_ls_enabled) {
1544 /*
1545 * Either sleeping or spinning is happening,
1546 * start a timing of our delay interval now.
1547 */
1548 readers_at_sleep = lck->lck_rw_shared_count;
1549 wait_interval = mach_absolute_time();
1550 }
1551 }
1552 #endif
1553 if (istate == -1) {
1554 istate = ml_get_interrupts_enabled();
1555 }
1556
1557 deadline = lck_rw_deadline_for_spin(lck);
1558
1559 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1560 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1561
1562 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
1563 lck_rw_lock_pause(istate);
1564 }
1565
1566 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1567 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1568
1569 if (gotlock) {
1570 break;
1571 }
1572 /*
1573 * if we get here, the deadline has expired w/o us
1574 * being able to grab the lock for read
1575 * check to see if we're allowed to do a thread_block
1576 */
1577 if (lck->lck_rw_can_sleep) {
1578 istate = lck_interlock_lock(lck);
1579
1580 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1581 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1582 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1583 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1584
1585 lck->lck_r_waiting = TRUE;
1586
1587 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1588 res = assert_wait(RW_LOCK_READER_EVENT(lck),
1589 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1590 lck_interlock_unlock(lck, istate);
1591
1592 if (res == THREAD_WAITING) {
1593 res = thread_block(THREAD_CONTINUE_NULL);
1594 slept++;
1595 }
1596 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1597 trace_lck, res, slept, 0, 0);
1598 } else {
1599 lck->lck_rw_shared_count++;
1600 lck_interlock_unlock(lck, istate);
1601 break;
1602 }
1603 }
1604 }
1605
1606 #if CONFIG_DTRACE
1607 if (dtrace_ls_enabled == TRUE) {
1608 if (slept == 0) {
1609 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1610 } else {
1611 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1612 mach_absolute_time() - wait_interval, 0,
1613 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1614 }
1615 }
1616 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1617 #endif
1618 }
1619
1620
1621 /*
1622 * Routine: lck_rw_lock_exclusive
1623 */
1624
1625 void
1626 lck_rw_lock_exclusive(lck_rw_t *lock)
1627 {
1628 current_thread()->rwlock_count++;
1629 if (atomic_test_and_set32(&lock->data,
1630 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1631 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1632 #if CONFIG_DTRACE
1633 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1634 #endif /* CONFIG_DTRACE */
1635 } else {
1636 lck_rw_lock_exclusive_gen(lock);
1637 }
1638 }
1639
1640
1641 /*
1642 * Routine: lck_rw_lock_shared_to_exclusive
1643 */
1644
1645 boolean_t
1646 lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1647 {
1648 uint32_t data, prev;
1649
1650 for (;;) {
1651 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1652 if (data & LCK_RW_INTERLOCK) {
1653 atomic_exchange_abort();
1654 lck_rw_interlock_spin(lock);
1655 continue;
1656 }
1657 if (data & LCK_RW_WANT_UPGRADE) {
1658 data -= LCK_RW_SHARED_READER;
1659 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1660 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1661 }
1662 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1663 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1664 }
1665 } else {
1666 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1667 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1668 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1669 break;
1670 }
1671 }
1672 cpu_pause();
1673 }
1674 /* we now own the WANT_UPGRADE */
1675 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1676 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1677 }
1678 #if CONFIG_DTRACE
1679 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1680 #endif
1681 return TRUE;
1682 }
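
/*
 * Illustrative sketch (not part of this file's build): the caller-side
 * contract of lck_rw_lock_shared_to_exclusive(). On FALSE the read hold has
 * already been dropped, so the caller must re-take the lock exclusively and
 * re-check any state it examined under the shared hold.
 */
#if 0 /* example only */
static void
rw_upgrade_usage_example(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... decide, while shared, that exclusive access is needed ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade lost the race: we no longer hold the lock at all */
		lck_rw_lock_exclusive(lock);
		/* ... re-validate anything observed under the shared hold ... */
	}
	/* ... exclusive work ... */
	lck_rw_unlock_exclusive(lock);
}
#endif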
1683
1684
1685 /*
1686 * Routine: lck_rw_lock_shared_to_exclusive_failure
1687 * Function:
1688 * assembly fast path code has already dropped our read
1689 * count and determined that someone else owns 'lck_rw_want_upgrade'
1690 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1691 * all we need to do here is determine if a wakeup is needed
1692 */
1693 static boolean_t
1694 lck_rw_lock_shared_to_exclusive_failure(
1695 lck_rw_t *lck,
1696 uint32_t prior_lock_state)
1697 {
1698 lck_rw_t *fake_lck;
1699 thread_t thread = current_thread();
1700 uint32_t rwlock_count;
1701
1702 /* Check if dropping the lock means that we need to unpromote */
1703 rwlock_count = thread->rwlock_count--;
1704 #if MACH_LDEBUG
1705 if (rwlock_count == 0) {
1706 panic("rw lock count underflow for thread %p", thread);
1707 }
1708 #endif
1709 fake_lck = (lck_rw_t *)&prior_lock_state;
1710
1711 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1712 /*
1713 * Someone else has requested upgrade.
1714 * Since we've released the read lock, wake
1715 * him up if he's blocked waiting
1716 */
1717 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1718 }
1719
1720 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1721 /* sched_flags checked without lock, but will be rechecked while clearing */
1722 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1723 }
1724
1725 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1726 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1727
1728 return FALSE;
1729 }
1730
1731
1732 /*
1733 * Routine: lck_rw_lock_shared_to_exclusive_success
1734 * Function:
1735 * assembly fast path code has already dropped our read
1736 * count and successfully acquired 'lck_rw_want_upgrade'
1737 * we just need to wait for the rest of the readers to drain
1738 * and then we can return as the exclusive holder of this lock
1739 */
1740 static boolean_t
1741 lck_rw_lock_shared_to_exclusive_success(
1742 lck_rw_t *lck)
1743 {
1744 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1745 uint64_t deadline = 0;
1746 int slept = 0;
1747 int still_shared = 0;
1748 wait_result_t res;
1749 boolean_t istate = -1;
1750
1751 #if CONFIG_DTRACE
1752 uint64_t wait_interval = 0;
1753 int readers_at_sleep = 0;
1754 boolean_t dtrace_ls_initialized = FALSE;
1755 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1756 #endif
1757
1758 while (lck->lck_rw_shared_count != 0) {
1759 #if CONFIG_DTRACE
1760 if (dtrace_ls_initialized == FALSE) {
1761 dtrace_ls_initialized = TRUE;
1762 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1763 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1764 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1765 if (dtrace_ls_enabled) {
1766 /*
1767 * Either sleeping or spinning is happening,
1768 * start a timing of our delay interval now.
1769 */
1770 readers_at_sleep = lck->lck_rw_shared_count;
1771 wait_interval = mach_absolute_time();
1772 }
1773 }
1774 #endif
1775 if (istate == -1) {
1776 istate = ml_get_interrupts_enabled();
1777 }
1778
1779 deadline = lck_rw_deadline_for_spin(lck);
1780
1781 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1782 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1783
1784 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
1785 lck_rw_lock_pause(istate);
1786 }
1787
1788 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1789 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1790
1791 if (!still_shared) {
1792 break;
1793 }
1794 /*
1795 * if we get here, the deadline has expired w/o
1796 * the rw_shared_count having drained to 0
1797 * check to see if we're allowed to do a thread_block
1798 */
1799 if (lck->lck_rw_can_sleep) {
1800 istate = lck_interlock_lock(lck);
1801
1802 if (lck->lck_rw_shared_count != 0) {
1803 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1804 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1805
1806 lck->lck_w_waiting = TRUE;
1807
1808 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1809 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1810 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
1811 lck_interlock_unlock(lck, istate);
1812
1813 if (res == THREAD_WAITING) {
1814 res = thread_block(THREAD_CONTINUE_NULL);
1815 slept++;
1816 }
1817 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1818 trace_lck, res, slept, 0, 0);
1819 } else {
1820 lck_interlock_unlock(lck, istate);
1821 break;
1822 }
1823 }
1824 }
1825 #if CONFIG_DTRACE
1826 /*
1827 * We know whether we took the sleep/spin path above by checking dtrace_ls_enabled.
1828 */
1829 if (dtrace_ls_enabled == TRUE) {
1830 if (slept == 0) {
1831 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1832 } else {
1833 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1834 mach_absolute_time() - wait_interval, 1,
1835 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1836 }
1837 }
1838 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1839 #endif
1840 return TRUE;
1841 }
1842
1843 /*
1844 * Routine: lck_rw_lock_exclusive_to_shared
1845 */
1846
1847 void
1848 lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1849 {
1850 uint32_t data, prev;
1851
1852 for (;;) {
1853 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1854 if (data & LCK_RW_INTERLOCK) {
1855 atomic_exchange_abort();
1856 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1857 continue;
1858 }
1859 data += LCK_RW_SHARED_READER;
1860 if (data & LCK_RW_WANT_UPGRADE) {
1861 data &= ~(LCK_RW_WANT_UPGRADE);
1862 } else {
1863 data &= ~(LCK_RW_WANT_EXCL);
1864 }
1865 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
1866 data &= ~(LCK_RW_W_WAITING);
1867 }
1868 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
1869 break;
1870 }
1871 cpu_pause();
1872 }
1873 lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1874 }
1875
1876
1877 /*
1878 * Routine: lck_rw_lock_exclusive_to_shared_gen
1879 * Function:
1880 * assembly fast path has already dropped
1881 * our exclusive state and bumped lck_rw_shared_count
1882 * all we need to do here is determine if anyone
1883 * needs to be awakened.
1884 */
1885 static void
1886 lck_rw_lock_exclusive_to_shared_gen(
1887 lck_rw_t *lck,
1888 uint32_t prior_lock_state)
1889 {
1890 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1891 lck_rw_t *fake_lck;
1892
1893 fake_lck = (lck_rw_t *)&prior_lock_state;
1894
1895 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1896 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1897
1898 /*
1899 * don't wake up anyone waiting to take the lock exclusively
1900 * since we hold a read count... when the read count drops to 0,
1901 * the writers will be woken.
1902 *
1903 * wake up any waiting readers if we don't have any writers waiting,
1904 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1905 */
1906 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1907 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1908 }
1909
1910 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1911 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1912
1913 #if CONFIG_DTRACE
1914 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1915 #endif
1916 }
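
/*
 * Illustrative sketch (not part of this file's build): a typical
 * write-then-read downgrade using lck_rw_lock_exclusive_to_shared(). The
 * exclusive hold is converted to a shared hold without ever dropping the
 * lock, so other readers can proceed while this thread keeps a consistent
 * view of what it just published.
 */
#if 0 /* example only */
static void
rw_downgrade_usage_example(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);
	/* ... publish an update while exclusive ... */
	lck_rw_lock_exclusive_to_shared(lock);	/* now held shared; readers may enter */
	/* ... continue reading the just-published state ... */
	lck_rw_unlock_shared(lock);
}
#endif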
1917
1918
1919 /*
1920 * Routine: lck_rw_try_lock
1921 */
1922 boolean_t
1923 lck_rw_try_lock(
1924 lck_rw_t *lck,
1925 lck_rw_type_t lck_rw_type)
1926 {
1927 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1928 return lck_rw_try_lock_shared(lck);
1929 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1930 return lck_rw_try_lock_exclusive(lck);
1931 } else {
1932 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1933 }
1934 return FALSE;
1935 }
1936
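/*
 * Example (illustrative sketch, for exposition only): opportunistic use of
 * lck_rw_try_lock() with a blocking fallback.  The example_* names are
 * hypothetical placeholders.
 */
extern lck_rw_t *example_table_lock;            /* hypothetical lock */

static __unused void
example_try_then_block(void)
{
        if (!lck_rw_try_lock(example_table_lock, LCK_RW_TYPE_EXCLUSIVE)) {
                /* somebody else holds it; fall back to the blocking acquire */
                lck_rw_lock_exclusive(example_table_lock);
        }
        /* ... exclusive work ... */
        lck_rw_unlock_exclusive(example_table_lock);
}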
1937 /*
1938 * Routine: lck_rw_try_lock_shared
1939 */
1940
1941 boolean_t
1942 lck_rw_try_lock_shared(lck_rw_t *lock)
1943 {
1944 uint32_t data, prev;
1945
1946 for (;;) {
1947 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1948 if (data & LCK_RW_INTERLOCK) {
1949 atomic_exchange_abort();
1950 lck_rw_interlock_spin(lock);
1951 continue;
1952 }
1953 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1954 atomic_exchange_abort();
1955 return FALSE; /* lock is busy */
1956 }
1957 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1958 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1959 break;
1960 }
1961 cpu_pause();
1962 }
1963 current_thread()->rwlock_count++;
1964 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1965 #if CONFIG_DTRACE
1966 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1967 #endif /* CONFIG_DTRACE */
1968 return TRUE;
1969 }
1970
1971
1972 /*
1973 * Routine: lck_rw_try_lock_exclusive
1974 */
1975
1976 boolean_t
1977 lck_rw_try_lock_exclusive(lck_rw_t *lock)
1978 {
1979 uint32_t data, prev;
1980
1981 for (;;) {
1982 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1983 if (data & LCK_RW_INTERLOCK) {
1984 atomic_exchange_abort();
1985 lck_rw_interlock_spin(lock);
1986 continue;
1987 }
1988 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1989 atomic_exchange_abort();
1990 return FALSE; /* can't get it */
1991 }
1992 data |= LCK_RW_WANT_EXCL;
1993 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
1994 break;
1995 }
1996 cpu_pause();
1997 }
1998
1999 current_thread()->rwlock_count++;
2000 #if CONFIG_DTRACE
2001 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
2002 #endif /* CONFIG_DTRACE */
2003 return TRUE;
2004 }
2005
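/*
 * Sketch (for exposition only): the shape of the atomic_exchange_begin32 /
 * atomic_exchange_complete32 retry loops used by the try-lock paths above,
 * expressed directly as a compare-and-swap loop.  The EXAMPLE_* masks and the
 * single-word layout are simplified assumptions; the real layout is given by
 * the LCK_RW_* masks.
 */
#define EXAMPLE_INTERLOCK       0x1u    /* assumed "interlock held" bit */
#define EXAMPLE_WANT_EXCL       0x2u    /* assumed "writer/upgrader ahead" bit */
#define EXAMPLE_READER_INC      0x4u    /* assumed reader refcount increment */

static __unused boolean_t
example_try_lock_shared(_Atomic uint32_t *word)
{
        uint32_t prev = __c11_atomic_load(word, __ATOMIC_RELAXED);

        for (;;) {
                if (prev & EXAMPLE_INTERLOCK) {         /* interlock held: reload and retry */
                        cpu_pause();
                        prev = __c11_atomic_load(word, __ATOMIC_RELAXED);
                        continue;
                }
                if (prev & EXAMPLE_WANT_EXCL) {         /* a writer is ahead of us: give up */
                        return FALSE;
                }
                /* try to publish prev + one reader; on failure prev is reloaded for us */
                if (__c11_atomic_compare_exchange_weak(word, &prev,
                    prev + EXAMPLE_READER_INC, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
                        return TRUE;
                }
                cpu_pause();
        }
}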
2006
2007 void
2008 lck_rw_assert(
2009 lck_rw_t *lck,
2010 unsigned int type)
2011 {
2012 switch (type) {
2013 case LCK_RW_ASSERT_SHARED:
2014 if (lck->lck_rw_shared_count != 0) {
2015 return;
2016 }
2017 break;
2018 case LCK_RW_ASSERT_EXCLUSIVE:
2019 if ((lck->lck_rw_want_write ||
2020 lck->lck_rw_want_upgrade) &&
2021 lck->lck_rw_shared_count == 0) {
2022 return;
2023 }
2024 break;
2025 case LCK_RW_ASSERT_HELD:
2026 if (lck->lck_rw_want_write ||
2027 lck->lck_rw_want_upgrade ||
2028 lck->lck_rw_shared_count != 0) {
2029 return;
2030 }
2031 break;
2032 case LCK_RW_ASSERT_NOTHELD:
2033 if (!(lck->lck_rw_want_write ||
2034 lck->lck_rw_want_upgrade ||
2035 lck->lck_rw_shared_count != 0)) {
2036 return;
2037 }
2038 break;
2039 default:
2040 break;
2041 }
2042
2043 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2044 }
2045
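/*
 * Example (illustrative sketch, for exposition only): lck_rw_assert() is
 * typically used to document and enforce a routine's locking precondition.
 * The example_* names are hypothetical placeholders.
 */
extern lck_rw_t *example_list_lock;             /* hypothetical lock */
extern int example_list_head;                   /* hypothetical protected state */

static __unused int
example_list_first(void)
{
        /* caller must already hold the lock in shared or exclusive mode */
        lck_rw_assert(example_list_lock, LCK_RW_ASSERT_HELD);
        return example_list_head;
}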
2046 /* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2047 void
2048 lck_rw_clear_promotions_x86(thread_t thread)
2049 {
2050 #if MACH_LDEBUG
2051 /* It's fatal to leave a RW lock locked and return to userspace */
2052 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2053 #else
2054 /* Paper over the issue */
2055 thread->rwlock_count = 0;
2056 lck_rw_clear_promotion(thread, 0);
2057 #endif
2058 }
2059
2060 boolean_t
2061 lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2062 {
2063 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2064
2065 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2066 lck_rw_unlock_shared(lck);
2067 mutex_pause(2);
2068 lck_rw_lock_shared(lck);
2069 return TRUE;
2070 }
2071
2072 return FALSE;
2073 }
2074
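/*
 * Example (illustrative sketch, for exposition only): long shared-lock scans
 * can periodically offer the lock to queued writers via
 * lck_rw_lock_yield_shared().  The example_* names are hypothetical placeholders.
 */
extern lck_rw_t *example_scan_lock;             /* hypothetical lock */
extern void example_scan_one(int i);            /* hypothetical per-item work */

static __unused void
example_long_scan(int nitems)
{
        lck_rw_lock_shared(example_scan_lock);
        for (int i = 0; i < nitems; i++) {
                example_scan_one(i);
                /*
                 * If a writer is waiting, drop and re-take the shared lock.
                 * On return the lock is held shared again, but the scan may
                 * need to cope with state that changed while it was dropped.
                 */
                (void)lck_rw_lock_yield_shared(example_scan_lock, FALSE);
        }
        lck_rw_unlock_shared(example_scan_lock);
}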
2075 /*
2076 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2077 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2078 */
2079 boolean_t
2080 kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2081 {
2082 if (not_in_kdp) {
2083 panic("rw lock exclusive check done outside of kernel debugger");
2084 }
2085 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2086 }
2087
2088 /*
2089 * Slow path routines for lck_mtx locking and unlocking functions.
2090 *
2091 * These functions were previously implemented in x86 assembly,
2092 * and some optimizations are in place in this C code to obtain compiled code
2093 * as performant and compact as the assembly version.
2094 *
2095 * To avoid inlining these functions on the fast path, all functions directly called by
2096 * the fast paths are marked __attribute__((noinline)). They are also implemented
2097 * in such a way that the fast path can tail call into them, so the return address
2098 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2099 *
2100 * Slow path code is structured so that there are no calls to functions that will return
2101 * in the context of the caller function, i.e. all functions called are either tail call functions
2102 * or inline functions. The tail call functions take fewer than six arguments,
2103 * so that they can be passed in registers and do not need to be pushed on the stack.
2104 * This allows the compiler to avoid creating a stack frame for these functions.
2105 *
2106 * __improbable and __probable are used to compile the slow path code in such a way
2107 * that the fast path case sits on a sequence of instructions with as few jumps as possible,
2108 * keeping that case the most optimized even when it falls through to the slow path.
2109 */
2110
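/*
 * Sketch (for exposition only): the fast-path / slow-path structure described
 * above, reduced to a minimal example.  The example_* names are hypothetical;
 * only the noinline + tail-call shape is the point.
 */
__attribute__((noinline))
static __unused void
example_lock_slow(_Atomic uint32_t *word)
{
        /* out-of-line contended path; fewer than six args, so all passed in registers */
        while (__c11_atomic_exchange(word, 1, __ATOMIC_ACQUIRE) != 0) {
                cpu_pause();
        }
}

static inline void
example_lock_fast(_Atomic uint32_t *word)
{
        if (__probable(__c11_atomic_exchange(word, 1, __ATOMIC_ACQUIRE) == 0)) {
                return;                         /* uncontended: straight-line code */
        }
        return example_lock_slow(word);         /* tail call into the noinline slow path */
}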
2111 /*
2112 * Intel lock invariants:
2113 *
2114 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2115 * lck_mtx_pri: contains the max priority of all waiters during a contention period
2116 * not cleared on last unlock, but stomped over on next first contention
2117 * lck_mtx_promoted: set when the current lock owner has been promoted
2118 * cleared when lock owner unlocks, set on acquire or wait.
2119 *
2120 * The lock owner is promoted to the max priority of all its waiters only if it
2121 * was a lower priority when it acquired or was an owner when a waiter waited.
2122 * Max priority is capped at MAXPRI_PROMOTE.
2123 *
2124 * The last waiter will not be promoted as it is woken up, but the last
2125 * lock owner may not have been the last thread to have been woken up depending on the
2126 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2127 * flag set.
2128 *
2129 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2130 * priority from dropping priority in the future without having to take thread lock
2131 * on acquire.
2132 */
2133
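/*
 * Sketch (for exposition only): the waiter priority that feeds the invariants
 * above is the waiter's scheduled/base priority clamped into
 * [BASEPRI_DEFAULT, MAXPRI_PROMOTE], mirroring the computation in
 * lck_mtx_lock_wait_x86() below.
 */
static inline integer_t
example_waiter_promotion_pri(thread_t self)
{
        integer_t pri = self->sched_pri;

        pri = MAX(pri, self->base_pri);
        pri = MAX(pri, BASEPRI_DEFAULT);        /* never promote below the default band */
        return MIN(pri, MAXPRI_PROMOTE);        /* promotion is capped at MAXPRI_PROMOTE */
}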
2134 #ifdef MUTEX_ZONE
2135 extern zone_t lck_mtx_zone;
2136 #endif
2137
2138 /*
2139 * Routine: lck_mtx_alloc_init
2140 */
2141 lck_mtx_t *
2142 lck_mtx_alloc_init(
2143 lck_grp_t *grp,
2144 lck_attr_t *attr)
2145 {
2146 lck_mtx_t *lck;
2147 #ifdef MUTEX_ZONE
2148 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
2149 lck_mtx_init(lck, grp, attr);
2150 }
2151 #else
2152 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
2153 lck_mtx_init(lck, grp, attr);
2154 }
2155 #endif
2156 return lck;
2157 }
2158
2159 /*
2160 * Routine: lck_mtx_free
2161 */
2162 void
2163 lck_mtx_free(
2164 lck_mtx_t *lck,
2165 lck_grp_t *grp)
2166 {
2167 lck_mtx_destroy(lck, grp);
2168 #ifdef MUTEX_ZONE
2169 zfree(lck_mtx_zone, lck);
2170 #else
2171 kfree(lck, sizeof(lck_mtx_t));
2172 #endif
2173 }
2174
2175 /*
2176 * Routine: lck_mtx_ext_init
2177 */
2178 static void
2179 lck_mtx_ext_init(
2180 lck_mtx_ext_t *lck,
2181 lck_grp_t *grp,
2182 lck_attr_t *attr)
2183 {
2184 bzero((void *)lck, sizeof(lck_mtx_ext_t));
2185
2186 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2187 lck->lck_mtx_deb.type = MUTEX_TAG;
2188 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2189 }
2190
2191 lck->lck_mtx_grp = grp;
2192
2193 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
2194 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
2195 }
2196
2197 lck->lck_mtx.lck_mtx_is_ext = 1;
2198 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
2199 }
2200
2201 /*
2202 * Routine: lck_mtx_init
2203 */
2204 void
2205 lck_mtx_init(
2206 lck_mtx_t *lck,
2207 lck_grp_t *grp,
2208 lck_attr_t *attr)
2209 {
2210 lck_mtx_ext_t *lck_ext;
2211 lck_attr_t *lck_attr;
2212
2213 if (attr != LCK_ATTR_NULL) {
2214 lck_attr = attr;
2215 } else {
2216 lck_attr = &LockDefaultLckAttr;
2217 }
2218
2219 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2220 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2221 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2222 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2223 lck->lck_mtx_ptr = lck_ext;
2224 }
2225 } else {
2226 lck->lck_mtx_owner = 0;
2227 lck->lck_mtx_state = 0;
2228 }
2229 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2230 lck_grp_reference(grp);
2231 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2232 }
2233
2234 /*
2235 * Routine: lck_mtx_init_ext
2236 */
2237 void
2238 lck_mtx_init_ext(
2239 lck_mtx_t *lck,
2240 lck_mtx_ext_t *lck_ext,
2241 lck_grp_t *grp,
2242 lck_attr_t *attr)
2243 {
2244 lck_attr_t *lck_attr;
2245
2246 if (attr != LCK_ATTR_NULL) {
2247 lck_attr = attr;
2248 } else {
2249 lck_attr = &LockDefaultLckAttr;
2250 }
2251
2252 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2253 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2254 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2255 lck->lck_mtx_ptr = lck_ext;
2256 } else {
2257 lck->lck_mtx_owner = 0;
2258 lck->lck_mtx_state = 0;
2259 }
2260 lck->lck_mtx_pad32 = 0xFFFFFFFF;
2261
2262 lck_grp_reference(grp);
2263 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2264 }
2265
2266 static void
2267 lck_mtx_lock_mark_destroyed(
2268 lck_mtx_t *mutex,
2269 boolean_t indirect)
2270 {
2271 uint32_t state;
2272
2273 if (indirect) {
2274 /* convert to destroyed state */
2275 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2276 return;
2277 }
2278
2279 state = ordered_load_mtx_state(mutex);
2280 lck_mtx_interlock_lock(mutex, &state);
2281
2282 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2283
2284 enable_preemption();
2285 }
2286
2287 /*
2288 * Routine: lck_mtx_destroy
2289 */
2290 void
2291 lck_mtx_destroy(
2292 lck_mtx_t *lck,
2293 lck_grp_t *grp)
2294 {
2295 boolean_t indirect;
2296
2297 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
2298 return;
2299 }
2300 #if MACH_LDEBUG
2301 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2302 #endif
2303 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2304
2305 lck_mtx_lock_mark_destroyed(lck, indirect);
2306
2307 if (indirect) {
2308 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2309 }
2310 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2311 lck_grp_deallocate(grp);
2312 return;
2313 }
2314
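/*
 * Example (illustrative sketch, for exposition only): the usual
 * alloc/init/lock/destroy lifecycle for the routines above, using the lck_*
 * KPI from <kern/locks.h>.  The group name is hypothetical.
 */
static __unused void
example_mutex_lifecycle(void)
{
        lck_grp_t *grp = lck_grp_alloc_init("example.grp", LCK_GRP_ATTR_NULL);
        lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

        lck_mtx_lock(mtx);
        /* ... protected work ... */
        lck_mtx_unlock(mtx);

        lck_mtx_free(mtx, grp);         /* destroys the mutex, then frees it */
        lck_grp_free(grp);
}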
2315
2316 #if DEVELOPMENT | DEBUG
2317 __attribute__((noinline))
2318 void
2319 lck_mtx_owner_check_panic(
2320 lck_mtx_t *lock)
2321 {
2322 thread_t owner = (thread_t)lock->lck_mtx_owner;
2323 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2324 }
2325 #endif
2326
2327 __attribute__((always_inline))
2328 static boolean_t
2329 get_indirect_mutex(
2330 lck_mtx_t **lock,
2331 uint32_t *state)
2332 {
2333 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2334 *state = ordered_load_mtx_state(*lock);
2335 return TRUE;
2336 }
2337
2338 /*
2339 * Routine: lck_mtx_unlock_slow
2340 *
2341 * Unlocks a mutex held by current thread.
2342 *
2343 * It will wake up waiters if necessary and
2344 * drop promotions.
2345 *
2346 * Interlock can be held.
2347 */
2348 __attribute__((noinline))
2349 void
2350 lck_mtx_unlock_slow(
2351 lck_mtx_t *lock)
2352 {
2353 thread_t thread;
2354 uint32_t state, prev;
2355 boolean_t indirect = FALSE;
2356
2357 state = ordered_load_mtx_state(lock);
2358
2359 /* Is this an indirect mutex? */
2360 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2361 indirect = get_indirect_mutex(&lock, &state);
2362 }
2363
2364 thread = current_thread();
2365
2366 #if DEVELOPMENT | DEBUG
2367 thread_t owner = (thread_t)lock->lck_mtx_owner;
2368 if (__improbable(owner != thread)) {
2369 return lck_mtx_owner_check_panic(lock);
2370 }
2371 #endif
2372
2373 /* check if it is held as a spinlock */
2374 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
2375 goto unlock;
2376 }
2377
2378 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2379
2380 unlock:
2381 /* preemption disabled, interlock held and mutex not held */
2382
2383 /* clear owner */
2384 ordered_store_mtx_owner(lock, 0);
2385 /* keep original state in prev for later evaluation */
2386 prev = state;
2387 /* release interlock, promotion and clear spin flag */
2388 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
2389 if ((state & LCK_MTX_WAITERS_MSK)) {
2390 state -= LCK_MTX_WAITER; /* decrement waiter count */
2391 }
2392 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
2393
2394 #if MACH_LDEBUG
2395 /* perform lock statistics after drop to prevent delay */
2396 if (thread) {
2397 thread->mutex_count--; /* lock statistic */
2398 }
2399 #endif /* MACH_LDEBUG */
2400
2401 /* check if there are waiters to wake up or priority to drop */
2402 if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK))) {
2403 return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
2404 }
2405
2406 /* re-enable preemption */
2407 lck_mtx_unlock_finish_inline(lock, FALSE);
2408
2409 return;
2410 }
2411
2412 #define LCK_MTX_LCK_WAIT_CODE 0x20
2413 #define LCK_MTX_LCK_WAKEUP_CODE 0x21
2414 #define LCK_MTX_LCK_SPIN_CODE 0x22
2415 #define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2416 #define LCK_MTX_LCK_DEMOTE_CODE 0x24
2417
2418 /*
2419 * Routine: lck_mtx_unlock_wakeup_tail
2420 *
2421 * Invoked on unlock when there is
2422 * contention, i.e. the assembly routine sees
2423 * that mutex->lck_mtx_waiters != 0 or
2424 * that mutex->lck_mtx_promoted != 0
2425 *
2426 * neither the mutex nor the interlock is held
2427 *
2428 * Note that this routine might not be called if there are pending
2429 * waiters which have previously been woken up, and they didn't
2430 * end up boosting the old owner.
2431 *
2432 * assembly routine previously did the following to mutex:
2433 * (after saving the state in prior_lock_state)
2434 * cleared lck_mtx_promoted
2435 * decremented lck_mtx_waiters if nonzero
2436 *
2437 * This function needs to be called as a tail call
2438 * to optimize the compiled code.
2439 */
2440 __attribute__((noinline))
2441 static void
2442 lck_mtx_unlock_wakeup_tail(
2443 lck_mtx_t *mutex,
2444 int prior_lock_state,
2445 boolean_t indirect)
2446 {
2447 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2448 lck_mtx_t fake_lck;
2449
2450 /*
2451 * prior_lock_state is a snapshot of the 2nd word of the
2452 * lock in question... we'll fake up a lock with the bits
2453 * copied into place and carefully not access anything
2454 * beyond what's defined in the second word of a lck_mtx_t
2455 */
2456 fake_lck.lck_mtx_state = prior_lock_state;
2457
2458 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
2459 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
2460
2461 if (__probable(fake_lck.lck_mtx_waiters)) {
2462 kern_return_t did_wake;
2463
2464 if (fake_lck.lck_mtx_waiters > 1) {
2465 did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
2466 } else {
2467 did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
2468 }
2469 /*
2470 * The waiters count always precisely matches the number of threads on the waitqueue.
2471 * i.e. we should never see did_wake == KERN_NOT_WAITING.
2472 */
2473 assert(did_wake == KERN_SUCCESS);
2474 }
2475
2476 /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
2477 if (__improbable(fake_lck.lck_mtx_promoted)) {
2478 thread_t thread = current_thread();
2479
2480 spl_t s = splsched();
2481 thread_lock(thread);
2482
2483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
2484 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
2485 assert(thread->was_promoted_on_wakeup == 0);
2486 assert(thread->promotions > 0);
2487
2488 assert_promotions_invariant(thread);
2489
2490 if (--thread->promotions == 0) {
2491 sched_thread_unpromote(thread, trace_lck);
2492 }
2493
2494 assert_promotions_invariant(thread);
2495
2496 thread_unlock(thread);
2497 splx(s);
2498 }
2499
2500 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
2501 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2502
2503 lck_mtx_unlock_finish_inline(mutex, indirect);
2504 }
2505
2506 /*
2507 * Routine: lck_mtx_lock_acquire_x86
2508 *
2509 * Invoked on acquiring the mutex when there is
2510 * contention (i.e. the assembly routine sees that
2511 * mutex->lck_mtx_waiters != 0 or
2512 * thread->was_promoted_on_wakeup != 0)...
2513 *
2514 * mutex is owned... interlock is held... preemption is disabled
2515 */
2516 __attribute__((always_inline))
2517 static void
2518 lck_mtx_lock_acquire_inline(
2519 lck_mtx_t *mutex)
2520 {
2521 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2522 integer_t priority;
2523 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* owner is already set to us; faster than current_thread() */
2524 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
2525 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2526
2527 if (mutex->lck_mtx_waiters) {
2528 priority = mutex->lck_mtx_pri;
2529 } else {
2530 priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
2531 }
2532 /* the priority must have been set correctly by wait */
2533 assert(priority <= MAXPRI_PROMOTE);
2534 assert(priority == 0 || priority >= BASEPRI_DEFAULT);
2535
2536 /* if the mutex wasn't owned, then the owner wasn't promoted */
2537 assert(mutex->lck_mtx_promoted == 0);
2538
2539
2540
2541 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
2542 spl_t s = splsched();
2543 thread_lock(thread);
2544
2545 if (thread->was_promoted_on_wakeup) {
2546 assert(thread->promotions > 0);
2547 }
2548
2549 /* Intel only promotes if priority goes up */
2550 if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2551 /* Remember that I need to drop this promotion on unlock */
2552 mutex->lck_mtx_promoted = 1;
2553
2554 if (thread->promotions++ == 0) {
2555 /* This is the first promotion for the owner */
2556 sched_thread_promote_to_pri(thread, priority, trace_lck);
2557 } else {
2558 /*
2559 * Holder was previously promoted due to a different mutex,
2560 * raise to match this one.
2561 * Or, this thread was promoted on wakeup but someone else
2562 * later contended on mutex at higher priority before we got here
2563 */
2564 sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2565 }
2566 }
2567
2568 if (thread->was_promoted_on_wakeup) {
2569 thread->was_promoted_on_wakeup = 0;
2570 if (--thread->promotions == 0) {
2571 sched_thread_unpromote(thread, trace_lck);
2572 }
2573 }
2574
2575 thread_unlock(thread);
2576 splx(s);
2577 }
2578 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
2579 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
2580 }
2581
2582 void
2583 lck_mtx_lock_acquire_x86(
2584 lck_mtx_t *mutex)
2585 {
2586 return lck_mtx_lock_acquire_inline(mutex);
2587 }
2588
2589 /*
2590 * Tail call helpers for lock functions that perform
2591 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2592 * the caller's compiled code.
2593 */
2594
2595 __attribute__((noinline))
2596 static void
2597 lck_mtx_lock_acquire_tail(
2598 lck_mtx_t *mutex,
2599 boolean_t indirect)
2600 {
2601 lck_mtx_lock_acquire_inline(mutex);
2602 lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2603 }
2604
2605 __attribute__((noinline))
2606 static boolean_t
2607 lck_mtx_try_lock_acquire_tail(
2608 lck_mtx_t *mutex)
2609 {
2610 lck_mtx_lock_acquire_inline(mutex);
2611 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2612
2613 return TRUE;
2614 }
2615
2616 __attribute__((noinline))
2617 static void
2618 lck_mtx_convert_spin_acquire_tail(
2619 lck_mtx_t *mutex)
2620 {
2621 lck_mtx_lock_acquire_inline(mutex);
2622 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2623 }
2624
2625 boolean_t
2626 lck_mtx_ilk_unlock(
2627 lck_mtx_t *mutex)
2628 {
2629 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2630 return TRUE;
2631 }
2632
2633 static inline void
2634 lck_mtx_interlock_lock_set_and_clear_flags(
2635 lck_mtx_t *mutex,
2636 uint32_t xor_flags,
2637 uint32_t and_flags,
2638 uint32_t *new_state)
2639 {
2640 uint32_t state, prev;
2641 state = *new_state;
2642
2643 for (;;) {
2644 /* have to wait for interlock to clear */
2645 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2646 cpu_pause();
2647 state = ordered_load_mtx_state(mutex);
2648 }
2649 prev = state; /* prev contains snapshot for exchange */
2650 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
2651 state &= ~and_flags; /* clear flags */
2652
2653 disable_preemption();
2654 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2655 break;
2656 }
2657 enable_preemption();
2658 cpu_pause();
2659 state = ordered_load_mtx_state(mutex);
2660 }
2661 *new_state = state;
2662 return;
2663 }
2664
2665 static inline void
2666 lck_mtx_interlock_lock_clear_flags(
2667 lck_mtx_t *mutex,
2668 uint32_t and_flags,
2669 uint32_t *new_state)
2670 {
2671 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2672 }
2673
2674 static inline void
2675 lck_mtx_interlock_lock(
2676 lck_mtx_t *mutex,
2677 uint32_t *new_state)
2678 {
2679 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2680 }
2681
2682 static inline int
2683 lck_mtx_interlock_try_lock_set_flags(
2684 lck_mtx_t *mutex,
2685 uint32_t or_flags,
2686 uint32_t *new_state)
2687 {
2688 uint32_t state, prev;
2689 state = *new_state;
2690
2691 /* have to wait for interlock to clear */
2692 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2693 return 0;
2694 }
2695 prev = state; /* prev contains snapshot for exchange */
2696 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
2697 disable_preemption();
2698 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2699 *new_state = state;
2700 return 1;
2701 }
2702
2703 enable_preemption();
2704 return 0;
2705 }
2706
2707 static inline int
2708 lck_mtx_interlock_try_lock(
2709 lck_mtx_t *mutex,
2710 uint32_t *new_state)
2711 {
2712 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2713 }
2714
2715 static inline int
2716 lck_mtx_interlock_try_lock_disable_interrupts(
2717 lck_mtx_t *mutex,
2718 boolean_t *istate)
2719 {
2720 uint32_t state;
2721
2722 *istate = ml_set_interrupts_enabled(FALSE);
2723 state = ordered_load_mtx_state(mutex);
2724
2725 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2726 return 1;
2727 } else {
2728 ml_set_interrupts_enabled(*istate);
2729 return 0;
2730 }
2731 }
2732
2733 static inline void
2734 lck_mtx_interlock_unlock_enable_interrupts(
2735 lck_mtx_t *mutex,
2736 boolean_t istate)
2737 {
2738 lck_mtx_ilk_unlock(mutex);
2739 ml_set_interrupts_enabled(istate);
2740 }
2741
2742 __attribute__((noinline))
2743 static void
2744 lck_mtx_lock_contended(
2745 lck_mtx_t *lock,
2746 boolean_t indirect,
2747 boolean_t *first_miss)
2748 {
2749 lck_mtx_spinwait_ret_type_t ret;
2750 uint32_t state;
2751 thread_t thread;
2752
2753 try_again:
2754
2755 if (indirect) {
2756 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2757 }
2758
2759 ret = lck_mtx_lock_spinwait_x86(lock);
2760 state = ordered_load_mtx_state(lock);
2761 switch (ret) {
2762 case LCK_MTX_SPINWAIT_NO_SPIN:
2763 /*
2764 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2765 * try to spin.
2766 */
2767 if (indirect) {
2768 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2769 }
2770
2771 /* just fall through to case LCK_MTX_SPINWAIT_SPUN */
2772 case LCK_MTX_SPINWAIT_SPUN:
2773 /*
2774 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2775 * interlock not held
2776 */
2777 lck_mtx_interlock_lock(lock, &state);
2778 assert(state & LCK_MTX_ILOCKED_MSK);
2779
2780 if (state & LCK_MTX_MLOCKED_MSK) {
2781 if (indirect) {
2782 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2783 }
2784 lck_mtx_lock_wait_x86(lock);
2785 /*
2786 * interlock is not held here.
2787 */
2788 goto try_again;
2789 } else {
2790 /* grab the mutex */
2791 state |= LCK_MTX_MLOCKED_MSK;
2792 ordered_store_mtx_state_release(lock, state);
2793 thread = current_thread();
2794 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2795 #if MACH_LDEBUG
2796 if (thread) {
2797 thread->mutex_count++;
2798 }
2799 #endif /* MACH_LDEBUG */
2800 }
2801
2802 break;
2803 case LCK_MTX_SPINWAIT_ACQUIRED:
2804 /*
2805 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2806 * interlock is held and preemption disabled
2807 * owner is set and mutex marked as locked
2808 * statistics updated too
2809 */
2810 break;
2811 default:
2812 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2813 }
2814
2815 /*
2816 * interlock is already acquired here
2817 */
2818
2819 /* mutex has been acquired */
2820 thread = (thread_t)lock->lck_mtx_owner;
2821 if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
2822 return lck_mtx_lock_acquire_tail(lock, indirect);
2823 }
2824
2825 /* release the interlock */
2826 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2827 }
2828
2829 /*
2830 * Helper noinline functions for calling
2831 * panic to optimize compiled code.
2832 */
2833
2834 __attribute__((noinline))
2835 static void
2836 lck_mtx_destroyed(
2837 lck_mtx_t *lock)
2838 {
2839 panic("trying to interlock destroyed mutex (%p)", lock);
2840 }
2841
2842 __attribute__((noinline))
2843 static boolean_t
2844 lck_mtx_try_destroyed(
2845 lck_mtx_t *lock)
2846 {
2847 panic("trying to interlock destroyed mutex (%p)", lock);
2848 return FALSE;
2849 }
2850
2851 __attribute__((always_inline))
2852 static boolean_t
2853 lck_mtx_lock_wait_interlock_to_clear(
2854 lck_mtx_t *lock,
2855 uint32_t* new_state)
2856 {
2857 uint32_t state;
2858
2859 for (;;) {
2860 cpu_pause();
2861 state = ordered_load_mtx_state(lock);
2862 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2863 *new_state = state;
2864 return TRUE;
2865 }
2866 if (state & LCK_MTX_MLOCKED_MSK) {
2867 /* if it is held as mutex, just fail */
2868 return FALSE;
2869 }
2870 }
2871 }
2872
2873 __attribute__((always_inline))
2874 static boolean_t
2875 lck_mtx_try_lock_wait_interlock_to_clear(
2876 lck_mtx_t *lock,
2877 uint32_t* new_state)
2878 {
2879 uint32_t state;
2880
2881 for (;;) {
2882 cpu_pause();
2883 state = ordered_load_mtx_state(lock);
2884 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2885 /* if it is held as mutex or spin, just fail */
2886 return FALSE;
2887 }
2888 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2889 *new_state = state;
2890 return TRUE;
2891 }
2892 }
2893 }
2894
2895 /*
2896 * Routine: lck_mtx_lock_slow
2897 *
2898 * Locks a mutex for current thread.
2899 * If the lock is contended this function might
2900 * sleep.
2901 *
2902 * Called with interlock not held.
2903 */
2904 __attribute__((noinline))
2905 void
2906 lck_mtx_lock_slow(
2907 lck_mtx_t *lock)
2908 {
2909 boolean_t indirect = FALSE;
2910 uint32_t state;
2911 int first_miss = 0;
2912
2913 state = ordered_load_mtx_state(lock);
2914
2915 /* is the interlock or mutex held */
2916 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2917 /*
2918 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2919 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2920 * set in state (state == lck_mtx_tag)
2921 */
2922
2923
2924 /* is the mutex already held and not indirect */
2925 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
2926 /* no, must have been the mutex */
2927 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2928 }
2929
2930 /* check to see if it is marked destroyed */
2931 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2932 return lck_mtx_destroyed(lock);
2933 }
2934
2935 /* Is this an indirect mutex? */
2936 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2937 indirect = get_indirect_mutex(&lock, &state);
2938
2939 first_miss = 0;
2940 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
2941
2942 if (state & LCK_MTX_SPIN_MSK) {
2943 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
2944 assert(state & LCK_MTX_ILOCKED_MSK);
2945 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2946 }
2947 }
2948
2949 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2950 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2951 }
2952 }
2953
2954 /* no - can't be INDIRECT, DESTROYED or locked */
2955 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2956 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2957 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2958 }
2959 }
2960
2961 /* lock and interlock acquired */
2962
2963 thread_t thread = current_thread();
2964 /* record owner of mutex */
2965 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2966
2967 #if MACH_LDEBUG
2968 if (thread) {
2969 thread->mutex_count++; /* lock statistic */
2970 }
2971 #endif
2972 /*
2973 * Check if there are waiters to
2974 * inherit their priority.
2975 */
2976 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2977 return lck_mtx_lock_acquire_tail(lock, indirect);
2978 }
2979
2980 /* release the interlock */
2981 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2982
2983 return;
2984 }
2985
2986 __attribute__((noinline))
2987 boolean_t
2988 lck_mtx_try_lock_slow(
2989 lck_mtx_t *lock)
2990 {
2991 boolean_t indirect = FALSE;
2992 uint32_t state;
2993 int first_miss = 0;
2994
2995 state = ordered_load_mtx_state(lock);
2996
2997 /* is the interlock or mutex held */
2998 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2999 /*
3000 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3001 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3002 * set in state (state == lck_mtx_tag)
3003 */
3004
3005 /* is the mutex already held and not indirect */
3006 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3007 return FALSE;
3008 }
3009
3010 /* check to see if it is marked destroyed */
3011 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3012 return lck_mtx_try_destroyed(lock);
3013 }
3014
3015 /* Is this an indirect mutex? */
3016 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3017 indirect = get_indirect_mutex(&lock, &state);
3018
3019 first_miss = 0;
3020 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3021 }
3022
3023 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3024 if (indirect) {
3025 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3026 }
3027 return FALSE;
3028 }
3029 }
3030
3031 /* no - can't be INDIRECT, DESTROYED or locked */
3032 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3033 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3034 if (indirect) {
3035 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3036 }
3037 return FALSE;
3038 }
3039 }
3040
3041 /* lock and interlock acquired */
3042
3043 thread_t thread = current_thread();
3044 /* record owner of mutex */
3045 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3046
3047 #if MACH_LDEBUG
3048 if (thread) {
3049 thread->mutex_count++; /* lock statistic */
3050 }
3051 #endif
3052 /*
3053 * Check if there are waiters to
3054 * inherit their priority.
3055 */
3056 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3057 return lck_mtx_try_lock_acquire_tail(lock);
3058 }
3059
3060 /* release the interlock */
3061 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3062
3063 return TRUE;
3064 }
3065
3066 __attribute__((noinline))
3067 void
3068 lck_mtx_lock_spin_slow(
3069 lck_mtx_t *lock)
3070 {
3071 boolean_t indirect = FALSE;
3072 uint32_t state;
3073 int first_miss = 0;
3074
3075 state = ordered_load_mtx_state(lock);
3076
3077 /* is the interlock or mutex held */
3078 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3079 /*
3080 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3081 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3082 * set in state (state == lck_mtx_tag)
3083 */
3084
3085
3086 /* is the mutex already held and not indirect */
3087 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3088 /* no, must have been the mutex */
3089 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3090 }
3091
3092 /* check to see if it is marked destroyed */
3093 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3094 return lck_mtx_destroyed(lock);
3095 }
3096
3097 /* Is this an indirect mutex? */
3098 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3099 indirect = get_indirect_mutex(&lock, &state);
3100
3101 first_miss = 0;
3102 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3103
3104 if (state & LCK_MTX_SPIN_MSK) {
3105 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
3106 assert(state & LCK_MTX_ILOCKED_MSK);
3107 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3108 }
3109 }
3110
3111 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3112 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3118 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3119 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3120 }
3121 }
3122
3123 /* lock as spinlock and interlock acquired */
3124
3125 thread_t thread = current_thread();
3126 /* record owner of mutex */
3127 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3128
3129 #if MACH_LDEBUG
3130 if (thread) {
3131 thread->mutex_count++; /* lock statistic */
3132 }
3133 #endif
3134
3135 #if CONFIG_DTRACE
3136 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3137 #endif
3138 /* return with the interlock held and preemption disabled */
3139 return;
3140 }
3141
3142 __attribute__((noinline))
3143 boolean_t
3144 lck_mtx_try_lock_spin_slow(
3145 lck_mtx_t *lock)
3146 {
3147 boolean_t indirect = FALSE;
3148 uint32_t state;
3149 int first_miss = 0;
3150
3151 state = ordered_load_mtx_state(lock);
3152
3153 /* is the interlock or mutex held */
3154 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3155 /*
3156 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3157 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3158 * set in state (state == lck_mtx_tag)
3159 */
3160
3161 /* is the mutex already held and not indirect */
3162 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
3163 return FALSE;
3164 }
3165
3166 /* check to see if it is marked destroyed */
3167 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3168 return lck_mtx_try_destroyed(lock);
3169 }
3170
3171 /* Is this an indirect mutex? */
3172 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3173 indirect = get_indirect_mutex(&lock, &state);
3174
3175 first_miss = 0;
3176 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
3177 }
3178
3179 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3180 if (indirect) {
3181 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3182 }
3183 return FALSE;
3184 }
3185 }
3186
3187 /* no - can't be INDIRECT, DESTROYED or locked */
3188 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3189 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3190 if (indirect) {
3191 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3192 }
3193 return FALSE;
3194 }
3195 }
3196
3197 /* lock and interlock acquired */
3198
3199 thread_t thread = current_thread();
3200 /* record owner of mutex */
3201 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3202
3203 #if MACH_LDEBUG
3204 if (thread) {
3205 thread->mutex_count++; /* lock statistic */
3206 }
3207 #endif
3208
3209 #if CONFIG_DTRACE
3210 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3211 #endif
3212 return TRUE;
3213 }
3214
3215 __attribute__((noinline))
3216 void
3217 lck_mtx_convert_spin(
3218 lck_mtx_t *lock)
3219 {
3220 uint32_t state;
3221
3222 state = ordered_load_mtx_state(lock);
3223
3224 /* Is this an indirect mutex? */
3225 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3226 /* If so, take indirection */
3227 get_indirect_mutex(&lock, &state);
3228 }
3229
3230 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3231
3232 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3233 /* already owned as a mutex, just return */
3234 return;
3235 }
3236
3237 assert(get_preemption_level() > 0);
3238 assert(state & LCK_MTX_ILOCKED_MSK);
3239 assert(state & LCK_MTX_SPIN_MSK);
3240
3241 /*
3242 * Check if there are waiters to
3243 * inherit their priority.
3244 */
3245 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3246 return lck_mtx_convert_spin_acquire_tail(lock);
3247 }
3248
3249 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3250
3251 return;
3252 }
3253
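/*
 * Example (illustrative sketch, for exposition only): the usual caller
 * pattern for lck_mtx_convert_spin() above - take the lock in spin mode for a
 * quick check, convert to a full mutex only if blocking work is needed.  The
 * example_* names are hypothetical placeholders.
 */
extern lck_mtx_t *example_obj_lock;                     /* hypothetical mutex */
extern boolean_t example_obj_needs_work(void);          /* hypothetical quick check */
extern void example_obj_do_blocking_work(void);         /* hypothetical slow work */

static __unused void
example_check_then_convert(void)
{
        lck_mtx_lock_spin(example_obj_lock);            /* interlock held, preemption disabled */
        if (example_obj_needs_work()) {
                lck_mtx_convert_spin(example_obj_lock); /* now a full mutex; blocking is allowed */
                example_obj_do_blocking_work();
        }
        lck_mtx_unlock(example_obj_lock);               /* handles both spin and mutex mode */
}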
3254 static inline boolean_t
3255 lck_mtx_lock_grab_mutex(
3256 lck_mtx_t *lock)
3257 {
3258 uint32_t state;
3259
3260 state = ordered_load_mtx_state(lock);
3261
3262 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3263 return FALSE;
3264 }
3265
3266 /* lock and interlock acquired */
3267
3268 thread_t thread = current_thread();
3269 /* record owner of mutex */
3270 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3271
3272 #if MACH_LDEBUG
3273 if (thread) {
3274 thread->mutex_count++; /* lock statistic */
3275 }
3276 #endif
3277 return TRUE;
3278 }
3279
3280 __attribute__((noinline))
3281 void
3282 lck_mtx_assert(
3283 lck_mtx_t *lock,
3284 unsigned int type)
3285 {
3286 thread_t thread, owner;
3287 uint32_t state;
3288
3289 thread = current_thread();
3290 state = ordered_load_mtx_state(lock);
3291
3292 if (state == LCK_MTX_TAG_INDIRECT) {
3293 get_indirect_mutex(&lock, &state);
3294 }
3295
3296 owner = (thread_t)lock->lck_mtx_owner;
3297
3298 if (type == LCK_MTX_ASSERT_OWNED) {
3299 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
3300 panic("mutex (%p) not owned\n", lock);
3301 }
3302 } else {
3303 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3304 if (owner == thread) {
3305 panic("mutex (%p) owned\n", lock);
3306 }
3307 }
3308 }
3309
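/*
 * Example (illustrative sketch, for exposition only): lck_mtx_assert()
 * documents a routine's locking precondition.  The example_* names are
 * hypothetical placeholders.
 */
extern lck_mtx_t *example_state_lock;           /* hypothetical mutex */
extern int example_state;                       /* hypothetical protected state */

static __unused void
example_state_bump_locked(void)
{
        /* caller must hold example_state_lock */
        lck_mtx_assert(example_state_lock, LCK_MTX_ASSERT_OWNED);
        example_state++;
}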
3310 /*
3311 * Routine: lck_mtx_lock_spinwait_x86
3312 *
3313 * Invoked trying to acquire a mutex when there is contention but
3314 * the holder is running on another processor. We spin for up to a maximum
3315 * time waiting for the lock to be released.
3316 *
3317 * Called with the interlock unlocked.
3318 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3319 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3320 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3321 */
3322 __attribute__((noinline))
3323 lck_mtx_spinwait_ret_type_t
3324 lck_mtx_lock_spinwait_x86(
3325 lck_mtx_t *mutex)
3326 {
3327 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3328 thread_t holder;
3329 uint64_t overall_deadline;
3330 uint64_t check_owner_deadline;
3331 uint64_t cur_time;
3332 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3333 int loopcount = 0;
3334
3335 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3336 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
3337
3338 cur_time = mach_absolute_time();
3339 overall_deadline = cur_time + MutexSpin;
3340 check_owner_deadline = cur_time;
3341
3342 /*
3343 * Spin while:
3344 * - mutex is locked, and
3345 * - it's locked as a spin lock, and
3346 * - owner is running on another processor, and
3347 * - owner (processor) is not idling, and
3348 * - we haven't spun for long enough.
3349 */
3350 do {
3351 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3352 retval = LCK_MTX_SPINWAIT_ACQUIRED;
3353 break;
3354 }
3355 cur_time = mach_absolute_time();
3356
3357 if (cur_time >= overall_deadline) {
3358 break;
3359 }
3360
3361 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3362 boolean_t istate;
3363
3364 /*
3365 * We will repeatedly peek at the state of the lock while spinning,
3366 * and we will acquire the interlock to do so.
3367 * The thread that will unlock the mutex will also need to acquire
3368 * the interlock, and we want to avoid slowing it down.
3369 * To avoid taking an interrupt while holding the interlock,
3370 * which would increase the time we hold it, we
3371 * try to acquire the interlock with interrupts disabled.
3372 * This is safe because it is a "try_lock": if we can't acquire
3373 * the interlock we re-enable interrupts and fail, so it is
3374 * ok to call it even if the interlock was already held.
3375 */
3376 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3377 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3378 if (!(holder->machine.specFlags & OnProc) ||
3379 (holder->state & TH_IDLE)) {
3380 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3381
3382 if (loopcount == 0) {
3383 retval = LCK_MTX_SPINWAIT_NO_SPIN;
3384 }
3385 break;
3386 }
3387 }
3388 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3389
3390 check_owner_deadline = cur_time + (MutexSpin / 4);
3391 }
3392 }
3393 cpu_pause();
3394
3395 loopcount++;
3396 } while (TRUE);
3397
3398 #if CONFIG_DTRACE
3399 /*
3400 * overall_deadline implicitly records when we started spinning
3401 * (it is the start time plus MutexSpin), so if dtrace is active
3402 * we compute backwards from it how long we spun.
3403 *
3404 * Note that we record a different probe id depending on whether
3405 * this is a direct or indirect mutex. This allows us to
3406 * penalize only lock groups that have debug/stats enabled
3407 * with dtrace processing if desired.
3408 */
3409 if (__probable(mutex->lck_mtx_is_ext == 0)) {
3410 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3411 mach_absolute_time() - (overall_deadline - MutexSpin));
3412 } else {
3413 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3414 mach_absolute_time() - (overall_deadline - MutexSpin));
3415 }
3416 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3417 #endif
3418
3419 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3420 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
3421
3422 return retval;
3423 }
3424
3425
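/*
 * Sketch (for exposition only): the spin policy above reduced to its decision
 * structure.  example_try_grab() and example_owner_is_running() are
 * hypothetical stand-ins for lck_mtx_lock_grab_mutex() and the OnProc/TH_IDLE
 * owner checks; max_spin_abs plays the role of MutexSpin.
 */
extern boolean_t example_try_grab(lck_mtx_t *m);                /* hypothetical fast acquire */
extern boolean_t example_owner_is_running(lck_mtx_t *m);        /* hypothetical OnProc check */

static __unused boolean_t
example_adaptive_spin(lck_mtx_t *m, uint64_t max_spin_abs)
{
        uint64_t deadline = mach_absolute_time() + max_spin_abs;

        while (mach_absolute_time() < deadline) {
                if (example_try_grab(m)) {
                        return TRUE;            /* acquired while spinning */
                }
                if (!example_owner_is_running(m)) {
                        break;                  /* owner off core: spinning cannot help */
                }
                cpu_pause();
        }
        return FALSE;                           /* caller should block instead */
}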
3426
3427 /*
3428 * Routine: lck_mtx_lock_wait_x86
3429 *
3430 * Invoked in order to wait on contention.
3431 *
3432 * Called with the interlock locked and
3433 * preemption disabled...
3434 * returns it unlocked and with preemption enabled
3435 *
3436 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3437 * A runnable waiter can exist between wait and acquire
3438 * without a waiters count being set.
3439 * This allows us to never make a spurious wakeup call.
3440 *
3441 * Priority:
3442 * This avoids taking the thread lock if the owning thread is the same priority.
3443 * This optimizes the case of same-priority threads contending on a lock.
3444 * However, that allows the owning thread to drop in priority while holding the lock,
3445 * because there is no state that the priority change can notice that
3446 * says that the targeted thread holds a contended mutex.
3447 *
3448 * One possible solution: priority changes could look for some atomic tag
3449 * on the thread saying 'holding contended lock', and then set up a promotion.
3450 * Needs a story for dropping that promotion - the last contended unlock
3451 * has to notice that this has happened.
3452 */
3453 __attribute__((noinline))
3454 void
3455 lck_mtx_lock_wait_x86(
3456 lck_mtx_t *mutex)
3457 {
3458 #if CONFIG_DTRACE
3459 uint64_t sleep_start = 0;
3460
3461 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3462 sleep_start = mach_absolute_time();
3463 }
3464 #endif
3465 thread_t self = current_thread();
3466 assert(self->waiting_for_mutex == NULL);
3467
3468 self->waiting_for_mutex = mutex;
3469
3470 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3471
3472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3473 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3474 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3475
3476 integer_t waiter_pri = self->sched_pri;
3477 waiter_pri = MAX(waiter_pri, self->base_pri);
3478 waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
3479 waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
3480
3481 assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
3482
3483 /* Set lck_mtx_pri on first contention, or when this waiter's priority is at least as high */
3484 if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) {
3485 mutex->lck_mtx_pri = waiter_pri;
3486 }
3487
3488 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3489
3490 assert(holder != NULL);
3491
3492 /*
3493 * Intel only causes a promotion when priority needs to change,
3494 * reducing thread lock holds but leaving us vulnerable to the holder
3495 * dropping priority.
3496 */
3497 if (holder->sched_pri < mutex->lck_mtx_pri) {
3498 int promote_pri = mutex->lck_mtx_pri;
3499
3500 spl_t s = splsched();
3501 thread_lock(holder);
3502
3503 /* Check again in case sched_pri changed */
3504 if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
3505 if (mutex->lck_mtx_promoted == 0) {
3506 /* This is the first promotion for this mutex */
3507 mutex->lck_mtx_promoted = 1;
3508
3509 if (holder->promotions++ == 0) {
3510 /* This is the first promotion for holder */
3511 sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3512 } else {
3513 /*
3514 * Holder was previously promoted due to a different mutex,
3515 * check if it needs to raise to match this one
3516 */
3517 sched_thread_update_promotion_to_pri(holder, promote_pri,
3518 trace_lck);
3519 }
3520 } else {
3521 /*
3522 * Holder was previously promoted due to this mutex,
3523 * check if the pri needs to go up
3524 */
3525 sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
3526 }
3527 }
3528
3529 thread_unlock(holder);
3530 splx(s);
3531 }
3532
3533 mutex->lck_mtx_waiters++;
3534
3535 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3536 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
3537
3538 lck_mtx_ilk_unlock(mutex);
3539
3540 thread_block(THREAD_CONTINUE_NULL);
3541
3542 self->waiting_for_mutex = NULL;
3543
3544 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3545 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3546 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
3547
3548 #if CONFIG_DTRACE
3549 /*
3550 * Record the Dtrace lockstat probe for blocking, block time
3551 * measured from when we were entered.
3552 */
3553 if (sleep_start) {
3554 if (mutex->lck_mtx_is_ext == 0) {
3555 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3556 mach_absolute_time() - sleep_start);
3557 } else {
3558 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3559 mach_absolute_time() - sleep_start);
3560 }
3561 }
3562 #endif
3563 }
3564
3565 /*
3566 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3567 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3568 * Returns: TRUE if lock is acquired.
3569 */
3570 boolean_t
3571 kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3572 {
3573 if (not_in_kdp) {
3574 panic("kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3575 }
3576
3577 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3578 return TRUE;
3579 }
3580
3581 return FALSE;
3582 }
3583
3584 void
3585 kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3586 {
3587 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3588 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3589 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3590 waitinfo->owner = thread_tid(holder);
3591 }
3592
3593 void
3594 kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3595 {
3596 lck_rw_t *rwlck = NULL;
3597 switch (waitinfo->wait_type) {
3598 case kThreadWaitKernelRWLockRead:
3599 rwlck = READ_EVENT_TO_RWLOCK(event);
3600 break;
3601 case kThreadWaitKernelRWLockWrite:
3602 case kThreadWaitKernelRWLockUpgrade:
3603 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3604 break;
3605 default:
3606 panic("%s was called with an invalid blocking type", __FUNCTION__);
3607 break;
3608 }
3609 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3610 waitinfo->owner = 0;
3611 }