osfmk/i386/locks_i386.c (apple/xnu, xnu-4903.270.47)
91447636 1/*
39236c6e 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
91447636 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
91447636
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
0a7de745 31/*
91447636
A
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
0a7de745 35 *
91447636
A
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
0a7de745 41 *
91447636
A
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745 45 *
91447636 46 * Carnegie Mellon requests users of this software to return to
0a7de745 47 *
91447636
A
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
0a7de745 52 *
91447636
A
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
d9a64523
A
64#define ATOMIC_PRIVATE 1
65#define LOCK_PRIVATE 1
66
91447636
A
67#include <mach_ldebug.h>
68
0a7de745 69#include <kern/lock_stat.h>
91447636
A
70#include <kern/locks.h>
71#include <kern/kalloc.h>
72#include <kern/misc_protos.h>
73#include <kern/thread.h>
74#include <kern/processor.h>
75#include <kern/cpu_data.h>
76#include <kern/cpu_number.h>
77#include <kern/sched_prim.h>
78#include <kern/xpr.h>
79#include <kern/debug.h>
80#include <string.h>
81
060df5ea 82#include <i386/machine_routines.h> /* machine_timeout_suspended() */
5ba3f43e 83#include <machine/atomic.h>
b0d623f7 84#include <machine/machine_cpu.h>
060df5ea 85#include <i386/mp.h>
d9a64523 86#include <machine/atomic.h>
91447636 87#include <sys/kdebug.h>
d9a64523 88#include <i386/locks_i386_inlines.h>
91447636 89
0a7de745
A
90#if CONFIG_DTRACE
91#define DTRACE_RW_SHARED 0x0 //reader
92#define DTRACE_RW_EXCL 0x1 //writer
93#define DTRACE_NO_FLAG 0x0 //not applicable
94#endif /* CONFIG_DTRACE */
2d21ac55 95
0a7de745
A
96#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98#define LCK_RW_LCK_SHARED_CODE 0x102
99#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
91447636 102
0a7de745
A
103#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
b0d623f7 111
91447636 112
0a7de745 113#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
91447636 114
0a7de745 115unsigned int LcksOpts = 0;
91447636 116
5ba3f43e
A
117#if DEVELOPMENT || DEBUG
118unsigned int LckDisablePreemptCheck = 0;
119#endif
120
91447636
A
121/* Forwards */
122
0a7de745 123#if USLOCK_DEBUG
91447636
A
124/*
125 * Perform simple lock checks.
126 */
0a7de745
A
127int uslock_check = 1;
128int max_lock_loops = 100000000;
129decl_simple_lock_data(extern, printf_lock)
130decl_simple_lock_data(extern, panic_lock)
131#endif /* USLOCK_DEBUG */
91447636 132
fe8ab488 133extern unsigned int not_in_kdp;
91447636
A
134
135/*
136 * We often want to know the addresses of the callers
137 * of the various lock routines. However, this information
138 * is only used for debugging and statistics.
139 */
0a7de745
A
140typedef void *pc_t;
141#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
142#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
143#if ANY_LOCK_DEBUG
144#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
145#define DECL_PC(pc) pc_t pc;
146#else /* ANY_LOCK_DEBUG */
91447636 147#define DECL_PC(pc)
0a7de745 148#ifdef lint
91447636
A
149/*
150 * Eliminate lint complaints about unused local pc variables.
151 */
0a7de745
A
152#define OBTAIN_PC(pc) ++pc
153#else /* lint */
154#define OBTAIN_PC(pc)
155#endif /* lint */
156#endif /* USLOCK_DEBUG */
91447636 157
5ba3f43e
A
158/*
159 * atomic exchange API is a low level abstraction of the operations
160 * to atomically read, modify, and write a pointer. This abstraction works
161 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
162 * well as the ARM exclusive instructions.
163 *
164 * atomic_exchange_begin() - begin exchange and retrieve current value
165 * atomic_exchange_complete() - conclude an exchange
166 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
167 */
168static uint32_t
169atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
170{
0a7de745 171 uint32_t val;
5ba3f43e 172
0a7de745 173 (void)ord; // Memory order not used
5ba3f43e
A
174 val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
175 *previous = val;
176 return val;
177}
178
179static boolean_t
180atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
181{
182 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
183}
184
185static void
0a7de745
A
186atomic_exchange_abort(void)
187{
188}
5ba3f43e
A
189
190static boolean_t
191atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
192{
0a7de745 193 uint32_t value, prev;
5ba3f43e 194
0a7de745 195 for (;;) {
5ba3f43e
A
196 value = atomic_exchange_begin32(target, &prev, ord);
197 if (value & test_mask) {
0a7de745 198 if (wait) {
5ba3f43e 199 cpu_pause();
0a7de745 200 } else {
5ba3f43e 201 atomic_exchange_abort();
0a7de745 202 }
5ba3f43e
A
203 return FALSE;
204 }
205 value |= set_mask;
0a7de745 206 if (atomic_exchange_complete32(target, prev, value, ord)) {
5ba3f43e 207 return TRUE;
0a7de745 208 }
5ba3f43e
A
209 }
210}
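/*
 * Illustrative sketch (not from the original xnu source): one way a caller
 * might use the atomic_exchange_begin32()/atomic_exchange_complete32()/
 * atomic_exchange_abort() helpers above to set a flag bit only if it is
 * currently clear. The example_* names are hypothetical.
 */
static boolean_t
example_set_flag_if_clear(uint32_t *word, uint32_t flag)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(word, &prev, memory_order_relaxed);
		if (data & flag) {
			atomic_exchange_abort();	/* flag already set, nothing to do */
			return FALSE;
		}
		data |= flag;
		if (atomic_exchange_complete32(word, prev, data, memory_order_relaxed)) {
			return TRUE;			/* compare-and-exchange succeeded */
		}
		cpu_pause();				/* lost a race, re-read and retry */
	}
}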
91447636
A
211
212/*
213 * Portable lock package implementation of usimple_locks.
214 */
215
0a7de745
A
216#if USLOCK_DEBUG
217#define USLDBG(stmt) stmt
218void usld_lock_init(usimple_lock_t, unsigned short);
219void usld_lock_pre(usimple_lock_t, pc_t);
220void usld_lock_post(usimple_lock_t, pc_t);
221void usld_unlock(usimple_lock_t, pc_t);
222void usld_lock_try_pre(usimple_lock_t, pc_t);
223void usld_lock_try_post(usimple_lock_t, pc_t);
224int usld_lock_common_checks(usimple_lock_t, char *);
225#else /* USLOCK_DEBUG */
226#define USLDBG(stmt)
227#endif /* USLOCK_DEBUG */
91447636 228
2d21ac55
A
229/*
230 * Forward definitions
231 */
232
5ba3f43e
A
233static void lck_rw_lock_shared_gen(lck_rw_t *lck);
234static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
235static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
236static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
237static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
238static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
39236c6e 239void lck_rw_clear_promotions_x86(thread_t thread);
5ba3f43e
A
240static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
241static boolean_t lck_rw_grab_want(lck_rw_t *lock);
242static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
d9a64523
A
243static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
244static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
245static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
246static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
247static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
248static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
249static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
250
39236c6e 251
91447636
A
252/*
253 * Routine: lck_spin_alloc_init
254 */
255lck_spin_t *
256lck_spin_alloc_init(
0a7de745
A
257 lck_grp_t *grp,
258 lck_attr_t *attr)
91447636 259{
0a7de745 260 lck_spin_t *lck;
91447636 261
0a7de745 262 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0) {
91447636 263 lck_spin_init(lck, grp, attr);
0a7de745 264 }
91447636 265
0a7de745 266 return lck;
91447636
A
267}
268
269/*
270 * Routine: lck_spin_free
271 */
272void
273lck_spin_free(
0a7de745
A
274 lck_spin_t *lck,
275 lck_grp_t *grp)
91447636
A
276{
277 lck_spin_destroy(lck, grp);
278 kfree(lck, sizeof(lck_spin_t));
279}
280
281/*
282 * Routine: lck_spin_init
283 */
284void
285lck_spin_init(
0a7de745
A
286 lck_spin_t *lck,
287 lck_grp_t *grp,
288 __unused lck_attr_t *attr)
91447636
A
289{
290 usimple_lock_init((usimple_lock_t) lck, 0);
291 lck_grp_reference(grp);
292 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
293}
294
295/*
296 * Routine: lck_spin_destroy
297 */
298void
299lck_spin_destroy(
0a7de745
A
300 lck_spin_t *lck,
301 lck_grp_t *grp)
91447636 302{
0a7de745 303 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
91447636 304 return;
0a7de745 305 }
b0d623f7 306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
91447636
A
307 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
308 lck_grp_deallocate(grp);
309 return;
310}
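/*
 * Illustrative sketch (not from the original xnu source): the usual lifecycle
 * of a spin lock allocated against a lock group, using the routines above plus
 * the lck_grp_* helpers assumed to come from kern/locks.c. Names are hypothetical.
 */
static void
example_spin_lock_lifecycle(void)
{
	lck_grp_t  *grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	lck_spin_t *lck = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lck);		/* returns with preemption disabled */
	/* ... touch the data this lock protects ... */
	lck_spin_unlock(lck);		/* preemption is re-enabled */

	lck_spin_free(lck, grp);
	lck_grp_free(grp);
}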
311
312/*
313 * Routine: lck_spin_lock
314 */
0a7de745
A
315void
316lck_spin_lock_grp(
317 lck_spin_t *lck,
318 lck_grp_t *grp)
319{
320#pragma unused(grp)
321 usimple_lock((usimple_lock_t) lck, grp);
322}
323
91447636
A
324void
325lck_spin_lock(
0a7de745 326 lck_spin_t *lck)
91447636 327{
0a7de745 328 usimple_lock((usimple_lock_t) lck, NULL);
91447636
A
329}
330
331/*
332 * Routine: lck_spin_unlock
333 */
334void
335lck_spin_unlock(
0a7de745 336 lck_spin_t *lck)
91447636
A
337{
338 usimple_unlock((usimple_lock_t) lck);
339}
340
0a7de745
A
341boolean_t
342lck_spin_try_lock_grp(
343 lck_spin_t *lck,
344 lck_grp_t *grp)
345{
346#pragma unused(grp)
347 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
348#if DEVELOPMENT || DEBUG
349 if (lrval) {
350 pltrace(FALSE);
351 }
352#endif
353 return lrval;
354}
355
91447636
A
356
357/*
358 * Routine: lck_spin_try_lock
359 */
360boolean_t
361lck_spin_try_lock(
0a7de745 362 lck_spin_t *lck)
91447636 363{
0a7de745
A
364 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
365#if DEVELOPMENT || DEBUG
39037602
A
366 if (lrval) {
367 pltrace(FALSE);
368 }
369#endif
0a7de745 370 return lrval;
39037602
A
371}
372
373/*
374 * Routine: lck_spin_assert
375 */
376void
377lck_spin_assert(lck_spin_t *lock, unsigned int type)
378{
379 thread_t thread, holder;
380 uintptr_t state;
381
382 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
383 panic("lck_spin_assert(): invalid arg (%u)", type);
384 }
385
386 state = lock->interlock;
387 holder = (thread_t)state;
388 thread = current_thread();
389 if (type == LCK_ASSERT_OWNED) {
390 if (__improbable(holder == THREAD_NULL)) {
391 panic("Lock not owned %p = %lx", lock, state);
392 }
393 if (__improbable(holder != thread)) {
394 panic("Lock not owned by current thread %p = %lx", lock, state);
395 }
396 } else if (type == LCK_ASSERT_NOTOWNED) {
397 if (__improbable(holder != THREAD_NULL)) {
398 if (holder == thread) {
399 panic("Lock owned by current thread %p = %lx", lock, state);
400 } else {
401 panic("Lock %p owned by thread %p", lock, holder);
402 }
403 }
404 }
91447636
A
405}
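/*
 * A hedged usage sketch (not in the original file): lck_spin_assert() is
 * typically used to document locking invariants at the top of a function.
 * The example_* names are hypothetical.
 */
static void
example_modify_locked_state(lck_spin_t *lck)
{
	lck_spin_assert(lck, LCK_ASSERT_OWNED);		/* panics unless we hold it */
	/* ... mutate state that requires the lock ... */
}

static void
example_prepare_to_block(lck_spin_t *lck)
{
	lck_spin_assert(lck, LCK_ASSERT_NOTOWNED);	/* panics if we still hold it */
	/* ... safe to sleep here ... */
}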
406
fe8ab488 407/*
3e170ce0 408 * Routine: kdp_lck_spin_is_acquired
fe8ab488
A
409 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
410 * Returns: TRUE if lock is acquired.
411 */
412boolean_t
0a7de745
A
413kdp_lck_spin_is_acquired(lck_spin_t *lck)
414{
fe8ab488
A
415 if (not_in_kdp) {
416 panic("panic: spinlock acquired check done outside of kernel debugger");
417 }
418 return (lck->interlock != 0)? TRUE : FALSE;
419}
420
91447636
A
421/*
422 * Initialize a usimple_lock.
423 *
424 * No change in preemption state.
425 */
426void
427usimple_lock_init(
0a7de745
A
428 usimple_lock_t l,
429 __unused unsigned short tag)
91447636 430{
0a7de745 431#ifndef MACHINE_SIMPLE_LOCK
91447636
A
432 USLDBG(usld_lock_init(l, tag));
433 hw_lock_init(&l->interlock);
434#else
0a7de745 435 simple_lock_init((simple_lock_t)l, tag);
91447636
A
436#endif
437}
438
060df5ea
A
439volatile uint32_t spinlock_owner_cpu = ~0;
440volatile usimple_lock_t spinlock_timed_out;
441
0a7de745
A
442uint32_t
443spinlock_timeout_NMI(uintptr_t thread_addr)
444{
060df5ea
A
445 uint32_t i;
446
447 for (i = 0; i < real_ncpus; i++) {
a39ff7e2 448 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
060df5ea 449 spinlock_owner_cpu = i;
5ba3f43e
A
450 if ((uint32_t) cpu_number() != i) {
451 /* Cause NMI and panic on the owner's cpu */
452 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
453 }
060df5ea
A
454 break;
455 }
456 }
457
458 return spinlock_owner_cpu;
459}
91447636
A
460
461/*
462 * Acquire a usimple_lock.
463 *
464 * Returns with preemption disabled. Note
465 * that the hw_lock routines are responsible for
466 * maintaining preemption state.
467 */
468void
0a7de745
A
469(usimple_lock)(
470 usimple_lock_t l
471 LCK_GRP_ARG(lck_grp_t *grp))
91447636 472{
0a7de745 473#ifndef MACHINE_SIMPLE_LOCK
2d21ac55 474 DECL_PC(pc);
91447636 475
b0d623f7 476 OBTAIN_PC(pc);
91447636 477 USLDBG(usld_lock_pre(l, pc));
6d2010ae 478
0a7de745 479 if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
b0d623f7 480 boolean_t uslock_acquired = FALSE;
060df5ea
A
481 while (machine_timeout_suspended()) {
482 enable_preemption();
0a7de745 483 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
060df5ea 484 break;
0a7de745 485 }
6d2010ae
A
486 }
487
060df5ea
A
488 if (uslock_acquired == FALSE) {
489 uint32_t lock_cpu;
7ddcb079 490 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
060df5ea 491 spinlock_timed_out = l;
7ddcb079 492 lock_cpu = spinlock_timeout_NMI(lowner);
5ba3f43e 493 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
0a7de745 494 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
060df5ea 495 }
b0d623f7 496 }
39037602 497#if DEVELOPMENT || DEBUG
0a7de745 498 pltrace(FALSE);
39037602
A
499#endif
500
91447636
A
501 USLDBG(usld_lock_post(l, pc));
502#else
0a7de745 503 simple_lock((simple_lock_t)l, grp);
91447636 504#endif
5ba3f43e 505#if CONFIG_DTRACE
0a7de745 506 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
5ba3f43e 507#endif
91447636
A
508}
509
510
511/*
512 * Release a usimple_lock.
513 *
514 * Returns with preemption enabled. Note
515 * that the hw_lock routines are responsible for
516 * maintaining preemption state.
517 */
518void
519usimple_unlock(
0a7de745 520 usimple_lock_t l)
91447636 521{
0a7de745 522#ifndef MACHINE_SIMPLE_LOCK
91447636
A
523 DECL_PC(pc);
524
b0d623f7 525 OBTAIN_PC(pc);
91447636 526 USLDBG(usld_unlock(l, pc));
39037602 527#if DEVELOPMENT || DEBUG
0a7de745 528 pltrace(TRUE);
39037602 529#endif
91447636
A
530 hw_lock_unlock(&l->interlock);
531#else
532 simple_unlock_rwmb((simple_lock_t)l);
533#endif
534}
535
536
537/*
538 * Conditionally acquire a usimple_lock.
539 *
540 * On success, returns with preemption disabled.
541 * On failure, returns with preemption in the same state
542 * as when first invoked. Note that the hw_lock routines
543 * are responsible for maintaining preemption state.
544 *
545 * XXX No stats are gathered on a miss; I preserved this
546 * behavior from the original assembly-language code, but
547 * doesn't it make sense to log misses? XXX
548 */
549unsigned int
550usimple_lock_try(
0a7de745
A
551 usimple_lock_t l,
552 lck_grp_t *grp)
91447636 553{
0a7de745
A
554#ifndef MACHINE_SIMPLE_LOCK
555 unsigned int success;
2d21ac55 556 DECL_PC(pc);
91447636 557
b0d623f7 558 OBTAIN_PC(pc);
91447636 559 USLDBG(usld_lock_try_pre(l, pc));
0a7de745 560 if ((success = hw_lock_try(&l->interlock, grp))) {
39037602
A
561#if DEVELOPMENT || DEBUG
562 pltrace(FALSE);
563#endif
0a7de745 564 USLDBG(usld_lock_try_post(l, pc));
91447636
A
565 }
566 return success;
567#else
0a7de745 568 return simple_lock_try((simple_lock_t)l, grp);
91447636
A
569#endif
570}
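/*
 * Illustrative sketch (not part of the original source): an opportunistic
 * caller of usimple_lock_try() that backs off instead of spinning. The
 * function and parameter names are hypothetical.
 */
static boolean_t
example_try_update(usimple_lock_t l, lck_grp_t *grp)
{
	if (!usimple_lock_try(l, grp)) {
		return FALSE;		/* contended: caller can retry later */
	}
	/* ... preemption is disabled here; update the protected state ... */
	usimple_unlock(l);		/* re-enables preemption */
	return TRUE;
}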
571
39037602
A
572/*
573 * Acquire a usimple_lock while polling for pending TLB flushes
574 * and spinning on a lock.
575 *
576 */
577void
0a7de745 578usimple_lock_try_lock_loop(usimple_lock_t l, lck_grp_t *grp)
39037602
A
579{
580 boolean_t istate = ml_get_interrupts_enabled();
0a7de745
A
581 while (!simple_lock_try(l, grp)) {
582 if (!istate) {
39037602 583 handle_pending_TLB_flushes();
0a7de745 584 }
39037602
A
585 cpu_pause();
586 }
587}
588
0a7de745 589#if USLOCK_DEBUG
91447636
A
590/*
591 * States of a usimple_lock. The default when initializing
592 * a usimple_lock is setting it up for debug checking.
593 */
0a7de745
A
594#define USLOCK_CHECKED 0x0001 /* lock is being checked */
595#define USLOCK_TAKEN 0x0002 /* lock has been taken */
596#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
597#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
598#define USLOCK_CHECKING(l) (uslock_check && \
599 ((l)->debug.state & USLOCK_CHECKED))
91447636
A
600
601/*
602 * Trace activities of a particularly interesting lock.
603 */
0a7de745 604void usl_trace(usimple_lock_t, int, pc_t, const char *);
91447636
A
605
606
607/*
608 * Initialize the debugging information contained
609 * in a usimple_lock.
610 */
611void
612usld_lock_init(
0a7de745
A
613 usimple_lock_t l,
614 __unused unsigned short tag)
91447636 615{
0a7de745 616 if (l == USIMPLE_LOCK_NULL) {
91447636 617 panic("lock initialization: null lock pointer");
0a7de745 618 }
91447636
A
619 l->lock_type = USLOCK_TAG;
620 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
621 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
622 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
623 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
624 l->debug.duration[0] = l->debug.duration[1] = 0;
625 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
626 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
627 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
628}
629
630
631/*
632 * These checks apply to all usimple_locks, not just
633 * those with USLOCK_CHECKED turned on.
634 */
635int
636usld_lock_common_checks(
0a7de745
A
637 usimple_lock_t l,
638 char *caller)
91447636 639{
0a7de745 640 if (l == USIMPLE_LOCK_NULL) {
91447636 641 panic("%s: null lock pointer", caller);
0a7de745
A
642 }
643 if (l->lock_type != USLOCK_TAG) {
ebb1b9f4 644 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
0a7de745
A
645 }
646 if (!(l->debug.state & USLOCK_INIT)) {
ebb1b9f4 647 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
0a7de745 648 }
91447636
A
649 return USLOCK_CHECKING(l);
650}
651
652
653/*
654 * Debug checks on a usimple_lock just before attempting
655 * to acquire it.
656 */
657/* ARGSUSED */
658void
659usld_lock_pre(
0a7de745
A
660 usimple_lock_t l,
661 pc_t pc)
91447636 662{
0a7de745 663 char caller[] = "usimple_lock";
91447636
A
664
665
0a7de745 666 if (!usld_lock_common_checks(l, caller)) {
91447636 667 return;
0a7de745 668 }
91447636
A
669
670/*
671 * Note that we have a weird case where we are getting a lock when we are
672 * in the process of putting the system to sleep. We are running with no
673 * current threads, therefore we can't tell if we are trying to retake a lock
674 * we have or someone on the other processor has it. Therefore we just
675 * ignore this test if the locking thread is 0.
676 */
677
678 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
679 l->debug.lock_thread == (void *) current_thread()) {
2d21ac55 680 printf("%s: lock %p already locked (at %p) by",
0a7de745 681 caller, l, l->debug.lock_pc);
2d21ac55 682 printf(" current thread %p (new attempt at pc %p)\n",
0a7de745 683 l->debug.lock_thread, pc);
2d21ac55 684 panic("%s", caller);
91447636
A
685 }
686 mp_disable_preemption();
687 usl_trace(l, cpu_number(), pc, caller);
688 mp_enable_preemption();
689}
690
691
692/*
693 * Debug checks on a usimple_lock just after acquiring it.
694 *
695 * Pre-emption has been disabled at this point,
696 * so we are safe in using cpu_number.
697 */
698void
699usld_lock_post(
0a7de745
A
700 usimple_lock_t l,
701 pc_t pc)
91447636 702{
0a7de745
A
703 int mycpu;
704 char caller[] = "successful usimple_lock";
91447636
A
705
706
0a7de745 707 if (!usld_lock_common_checks(l, caller)) {
91447636 708 return;
0a7de745 709 }
91447636 710
0a7de745 711 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
b0d623f7 712 panic("%s: lock %p became uninitialized",
0a7de745
A
713 caller, l);
714 }
715 if ((l->debug.state & USLOCK_TAKEN)) {
b0d623f7 716 panic("%s: lock 0x%p became TAKEN by someone else",
0a7de745
A
717 caller, l);
718 }
91447636
A
719
720 mycpu = cpu_number();
721 l->debug.lock_thread = (void *)current_thread();
722 l->debug.state |= USLOCK_TAKEN;
723 l->debug.lock_pc = pc;
724 l->debug.lock_cpu = mycpu;
725
726 usl_trace(l, mycpu, pc, caller);
727}
728
729
730/*
731 * Debug checks on a usimple_lock just before
732 * releasing it. Note that the caller has not
733 * yet released the hardware lock.
734 *
735 * Preemption is still disabled, so there's
736 * no problem using cpu_number.
737 */
738void
739usld_unlock(
0a7de745
A
740 usimple_lock_t l,
741 pc_t pc)
91447636 742{
0a7de745
A
743 int mycpu;
744 char caller[] = "usimple_unlock";
91447636
A
745
746
0a7de745 747 if (!usld_lock_common_checks(l, caller)) {
91447636 748 return;
0a7de745 749 }
91447636
A
750
751 mycpu = cpu_number();
752
0a7de745 753 if (!(l->debug.state & USLOCK_TAKEN)) {
b0d623f7 754 panic("%s: lock 0x%p hasn't been taken",
0a7de745
A
755 caller, l);
756 }
757 if (l->debug.lock_thread != (void *) current_thread()) {
b0d623f7 758 panic("%s: unlocking lock 0x%p, owned by thread %p",
0a7de745
A
759 caller, l, l->debug.lock_thread);
760 }
91447636 761 if (l->debug.lock_cpu != mycpu) {
b0d623f7 762 printf("%s: unlocking lock 0x%p on cpu 0x%x",
0a7de745 763 caller, l, mycpu);
91447636 764 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
2d21ac55 765 panic("%s", caller);
91447636
A
766 }
767 usl_trace(l, mycpu, pc, caller);
768
769 l->debug.unlock_thread = l->debug.lock_thread;
770 l->debug.lock_thread = INVALID_PC;
771 l->debug.state &= ~USLOCK_TAKEN;
772 l->debug.unlock_pc = pc;
773 l->debug.unlock_cpu = mycpu;
774}
775
776
777/*
778 * Debug checks on a usimple_lock just before
779 * attempting to acquire it.
780 *
781 * Preemption isn't guaranteed to be disabled.
782 */
783void
784usld_lock_try_pre(
0a7de745
A
785 usimple_lock_t l,
786 pc_t pc)
91447636 787{
0a7de745 788 char caller[] = "usimple_lock_try";
91447636 789
0a7de745 790 if (!usld_lock_common_checks(l, caller)) {
91447636 791 return;
0a7de745 792 }
91447636
A
793 mp_disable_preemption();
794 usl_trace(l, cpu_number(), pc, caller);
795 mp_enable_preemption();
796}
797
798
799/*
800 * Debug checks on a usimple_lock just after
801 * successfully attempting to acquire it.
802 *
803 * Preemption has been disabled by the
804 * lock acquisition attempt, so it's safe
805 * to use cpu_number.
806 */
807void
808usld_lock_try_post(
0a7de745
A
809 usimple_lock_t l,
810 pc_t pc)
91447636 811{
0a7de745
A
812 int mycpu;
813 char caller[] = "successful usimple_lock_try";
91447636 814
0a7de745 815 if (!usld_lock_common_checks(l, caller)) {
91447636 816 return;
0a7de745 817 }
91447636 818
0a7de745 819 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
b0d623f7 820 panic("%s: lock 0x%p became uninitialized",
0a7de745
A
821 caller, l);
822 }
823 if ((l->debug.state & USLOCK_TAKEN)) {
b0d623f7 824 panic("%s: lock 0x%p became TAKEN by someone else",
0a7de745
A
825 caller, l);
826 }
91447636
A
827
828 mycpu = cpu_number();
829 l->debug.lock_thread = (void *) current_thread();
830 l->debug.state |= USLOCK_TAKEN;
831 l->debug.lock_pc = pc;
832 l->debug.lock_cpu = mycpu;
833
834 usl_trace(l, mycpu, pc, caller);
835}
836
837
838/*
839 * For very special cases, set traced_lock to point to a
840 * specific lock of interest. The result is a series of
841 * XPRs showing lock operations on that lock. The lock_seq
842 * value is used to show the order of those operations.
843 */
0a7de745
A
844usimple_lock_t traced_lock;
845unsigned int lock_seq;
91447636
A
846
847void
848usl_trace(
0a7de745
A
849 usimple_lock_t l,
850 int mycpu,
851 pc_t pc,
852 const char * op_name)
91447636
A
853{
854 if (traced_lock == l) {
855 XPR(XPR_SLOCK,
856 "seq %d, cpu %d, %s @ %x\n",
b0d623f7
A
857 (uintptr_t) lock_seq, (uintptr_t) mycpu,
858 (uintptr_t) op_name, (uintptr_t) pc, 0);
91447636
A
859 lock_seq++;
860 }
861}
862
863
0a7de745 864#endif /* USLOCK_DEBUG */
91447636 865
91447636
A
866/*
867 * Routine: lck_rw_alloc_init
868 */
869lck_rw_t *
870lck_rw_alloc_init(
0a7de745
A
871 lck_grp_t *grp,
872 lck_attr_t *attr)
873{
874 lck_rw_t *lck;
91447636 875
b0d623f7
A
876 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
877 bzero(lck, sizeof(lck_rw_t));
91447636 878 lck_rw_init(lck, grp, attr);
b0d623f7
A
879 }
880
0a7de745 881 return lck;
91447636
A
882}
883
884/*
885 * Routine: lck_rw_free
886 */
887void
888lck_rw_free(
0a7de745
A
889 lck_rw_t *lck,
890 lck_grp_t *grp)
891{
91447636
A
892 lck_rw_destroy(lck, grp);
893 kfree(lck, sizeof(lck_rw_t));
894}
895
896/*
897 * Routine: lck_rw_init
898 */
899void
900lck_rw_init(
0a7de745
A
901 lck_rw_t *lck,
902 lck_grp_t *grp,
903 lck_attr_t *attr)
0c530ab8 904{
0a7de745
A
905 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
906 attr : &LockDefaultLckAttr;
91447636 907
2d21ac55
A
908 hw_lock_byte_init(&lck->lck_rw_interlock);
909 lck->lck_rw_want_write = FALSE;
910 lck->lck_rw_want_upgrade = FALSE;
911 lck->lck_rw_shared_count = 0;
912 lck->lck_rw_can_sleep = TRUE;
b0d623f7 913 lck->lck_r_waiting = lck->lck_w_waiting = 0;
91447636 914 lck->lck_rw_tag = 0;
2d21ac55 915 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
0a7de745 916 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
91447636
A
917
918 lck_grp_reference(grp);
919 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
920}
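/*
 * Illustrative sketch (not from the original file): initializing an lck_rw_t
 * whose readers are not held back behind queued writers, by passing an
 * attribute carrying LCK_ATTR_RW_SHARED_PRIORITY (which clears lck_rw_priv_excl
 * above). It assumes the lck_attr_* helpers exported by kern/locks.c; the
 * example_* names are hypothetical. Passing LCK_ATTR_NULL instead yields the
 * default writer-priority behavior.
 */
static lck_rw_t *
example_rw_alloc_shared_priority(lck_grp_t *grp)
{
	lck_attr_t *attr = lck_attr_alloc_init();
	lck_rw_t   *lock;

	lck_attr_rw_shared_priority(attr);	/* sets LCK_ATTR_RW_SHARED_PRIORITY */
	lock = lck_rw_alloc_init(grp, attr);
	lck_attr_free(attr);
	return lock;
}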
921
922/*
923 * Routine: lck_rw_destroy
924 */
925void
926lck_rw_destroy(
0a7de745
A
927 lck_rw_t *lck,
928 lck_grp_t *grp)
b0d623f7 929{
0a7de745 930 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
91447636 931 return;
0a7de745 932 }
39236c6e
A
933#if MACH_LDEBUG
934 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
935#endif
91447636
A
936 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
937 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
938 lck_grp_deallocate(grp);
939 return;
940}
941
942/*
943 * Sleep locks. These use the same data structure and algorithm
944 * as the spin locks, but the process sleeps while it is waiting
945 * for the lock. These work on uniprocessor systems.
946 */
947
948#define DECREMENTER_TIMEOUT 1000000
949
91447636 950/*
6d2010ae
A
951 * We disable interrupts while holding the RW interlock to prevent an
952 * interrupt from exacerbating hold time.
91447636
A
953 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
954 */
5ba3f43e 955static inline boolean_t
91447636
A
956lck_interlock_lock(lck_rw_t *lck)
957{
0a7de745 958 boolean_t istate;
91447636 959
0a7de745 960 istate = ml_set_interrupts_enabled(FALSE);
2d21ac55 961 hw_lock_byte_lock(&lck->lck_rw_interlock);
91447636
A
962 return istate;
963}
964
5ba3f43e 965static inline void
91447636 966lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
0a7de745 967{
2d21ac55 968 hw_lock_byte_unlock(&lck->lck_rw_interlock);
91447636
A
969 ml_set_interrupts_enabled(istate);
970}
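/*
 * A sketch of the pattern the slow paths below follow (not original code):
 * interrupts stay disabled only for the short window in which the per-lock
 * interlock byte is held.
 */
static void
example_update_under_interlock(lck_rw_t *lck)
{
	boolean_t istate;

	istate = lck_interlock_lock(lck);	/* disables interrupts, takes byte lock */
	/* ... inspect/modify the lck_rw_t state consistently ... */
	lck_interlock_unlock(lck, istate);	/* drops byte lock, restores interrupts */
}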
971
0c530ab8
A
972/*
973 * This inline is used when busy-waiting for an rw lock.
974 * If interrupts were disabled when the lock primitive was called,
975 * we poll the IPI handler for pending tlb flushes.
976 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
977 */
978static inline void
979lck_rw_lock_pause(boolean_t interrupts_enabled)
980{
0a7de745 981 if (!interrupts_enabled) {
0c530ab8 982 handle_pending_TLB_flushes();
0a7de745 983 }
0c530ab8
A
984 cpu_pause();
985}
986
5ba3f43e
A
987static inline boolean_t
988lck_rw_held_read_or_upgrade(lck_rw_t *lock)
989{
0a7de745 990 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
5ba3f43e 991 return TRUE;
0a7de745 992 }
5ba3f43e
A
993 return FALSE;
994}
b0d623f7
A
995
996/*
997 * compute the deadline to spin against when
998 * waiting for a change of state on a lck_rw_t
999 */
1000static inline uint64_t
1001lck_rw_deadline_for_spin(lck_rw_t *lck)
1002{
1003 if (lck->lck_rw_can_sleep) {
1004 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1005 /*
1006 * there are already threads waiting on this lock... this
0a7de745 1007 * implies that they have spun beyond their deadlines waiting for
b0d623f7
A
1008 * the desired state to show up so we will not bother spinning at this time...
1009 * or
1010 * the current number of threads sharing this lock exceeds our capacity to run them
1011 * concurrently and since all states we're going to spin for require the rw_shared_count
1012 * to be at 0, we'll not bother spinning since the latency for this to happen is
1013 * unpredictable...
1014 */
0a7de745 1015 return mach_absolute_time();
b0d623f7 1016 }
0a7de745
A
1017 return mach_absolute_time() + MutexSpin;
1018 } else {
1019 return mach_absolute_time() + (1LL * 1000000000LL);
1020 }
b0d623f7
A
1021}
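/*
 * Rough sketch (not in the original source) of how the routines below consume
 * this deadline: spin until either the state changes or the deadline passes,
 * then fall back to blocking.
 */
static void
example_spin_until_deadline(lck_rw_t *lck)
{
	uint64_t deadline = lck_rw_deadline_for_spin(lck);

	while (lck_rw_held_read_or_upgrade(lck) && mach_absolute_time() < deadline) {
		cpu_pause();
	}
	/* deadline expired: take the interlock, set a waiting bit, assert_wait()/thread_block() */
}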
1022
1023
5ba3f43e
A
1024/*
1025 * Spin while interlock is held.
1026 */
1027
1028static inline void
1029lck_rw_interlock_spin(lck_rw_t *lock)
1030{
1031 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1032 cpu_pause();
1033 }
1034}
1035
1036static boolean_t
1037lck_rw_grab_want(lck_rw_t *lock)
1038{
0a7de745 1039 uint32_t data, prev;
5ba3f43e 1040
0a7de745 1041 for (;;) {
5ba3f43e 1042 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
0a7de745 1043 if ((data & LCK_RW_INTERLOCK) == 0) {
5ba3f43e 1044 break;
0a7de745 1045 }
5ba3f43e
A
1046 atomic_exchange_abort();
1047 lck_rw_interlock_spin(lock);
1048 }
1049 if (data & LCK_RW_WANT_WRITE) {
1050 atomic_exchange_abort();
1051 return FALSE;
1052 }
1053 data |= LCK_RW_WANT_WRITE;
1054 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1055}
1056
1057static boolean_t
1058lck_rw_grab_shared(lck_rw_t *lock)
1059{
0a7de745 1060 uint32_t data, prev;
5ba3f43e 1061
0a7de745 1062 for (;;) {
5ba3f43e 1063 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
0a7de745 1064 if ((data & LCK_RW_INTERLOCK) == 0) {
5ba3f43e 1065 break;
0a7de745 1066 }
5ba3f43e
A
1067 atomic_exchange_abort();
1068 lck_rw_interlock_spin(lock);
1069 }
1070 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1071 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1072 atomic_exchange_abort();
1073 return FALSE;
1074 }
1075 }
1076 data += LCK_RW_SHARED_READER;
1077 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1078}
1079
91447636
A
1080/*
1081 * Routine: lck_rw_lock_exclusive
1082 */
5ba3f43e 1083static void
b0d623f7 1084lck_rw_lock_exclusive_gen(
0a7de745 1085 lck_rw_t *lck)
91447636 1086{
0a7de745
A
1087 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1088 uint64_t deadline = 0;
1089 int slept = 0;
1090 int gotlock = 0;
1091 int lockheld = 0;
1092 wait_result_t res = 0;
1093 boolean_t istate = -1;
91447636 1094
0a7de745 1095#if CONFIG_DTRACE
b0d623f7 1096 boolean_t dtrace_ls_initialized = FALSE;
0a7de745 1097 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
b0d623f7
A
1098 uint64_t wait_interval = 0;
1099 int readers_at_sleep = 0;
2d21ac55 1100#endif
91447636 1101
91447636 1102 /*
2d21ac55 1103 * Try to acquire the lck_rw_want_write bit.
91447636 1104 */
0a7de745
A
1105 while (!lck_rw_grab_want(lck)) {
1106#if CONFIG_DTRACE
b0d623f7
A
1107 if (dtrace_ls_initialized == FALSE) {
1108 dtrace_ls_initialized = TRUE;
1109 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1110 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1111 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1112 if (dtrace_ls_enabled) {
1113 /*
1114 * Either sleeping or spinning is happening,
1115 * start a timing of our delay interval now.
1116 */
1117 readers_at_sleep = lck->lck_rw_shared_count;
1118 wait_interval = mach_absolute_time();
1119 }
91447636 1120 }
2d21ac55 1121#endif
0a7de745 1122 if (istate == -1) {
b0d623f7 1123 istate = ml_get_interrupts_enabled();
0a7de745 1124 }
91447636 1125
b0d623f7
A
1126 deadline = lck_rw_deadline_for_spin(lck);
1127
3e170ce0 1128 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
0a7de745
A
1129
1130 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
b0d623f7 1131 lck_rw_lock_pause(istate);
0a7de745 1132 }
b0d623f7 1133
3e170ce0 1134 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
b0d623f7 1135
0a7de745 1136 if (gotlock) {
b0d623f7 1137 break;
0a7de745 1138 }
b0d623f7
A
1139 /*
1140 * if we get here, the deadline has expired w/o us
1141 * being able to grab the lock exclusively
1142 * check to see if we're allowed to do a thread_block
1143 */
1144 if (lck->lck_rw_can_sleep) {
91447636 1145 istate = lck_interlock_lock(lck);
91447636 1146
b0d623f7 1147 if (lck->lck_rw_want_write) {
3e170ce0 1148 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
91447636 1149
b0d623f7 1150 lck->lck_w_waiting = TRUE;
91447636 1151
813fb2f6 1152 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1153 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
0a7de745 1154 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
b0d623f7 1155 lck_interlock_unlock(lck, istate);
91447636 1156
b0d623f7
A
1157 if (res == THREAD_WAITING) {
1158 res = thread_block(THREAD_CONTINUE_NULL);
1159 slept++;
1160 }
3e170ce0 1161 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
b0d623f7
A
1162 } else {
1163 lck->lck_rw_want_write = TRUE;
1164 lck_interlock_unlock(lck, istate);
1165 break;
1166 }
1167 }
1168 }
1169 /*
1170 * Wait for readers (and upgrades) to finish...
1171 * the test for these conditions must be done simultaneously with
1172 * a check of the interlock not being held since
1173 * the rw_shared_count will drop to 0 first and then want_upgrade
1174 * will be set to 1 in the shared_to_exclusive scenario... those
1175 * adjustments are done behind the interlock and represent an
1176 * atomic change in state and must be considered as such
1177 * however, once we see the read count at 0, the want_upgrade not set
1178 * and the interlock not held, we are safe to proceed
1179 */
1180 while (lck_rw_held_read_or_upgrade(lck)) {
0a7de745 1181#if CONFIG_DTRACE
2d21ac55
A
1182 /*
1183 * Either sleeping or spinning is happening, start
1184 * a timing of our delay interval now. If we set it
1185 * to -1 we don't have accurate data so we cannot later
1186 * decide to record a dtrace spin or sleep event.
1187 */
b0d623f7
A
1188 if (dtrace_ls_initialized == FALSE) {
1189 dtrace_ls_initialized = TRUE;
1190 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1191 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1192 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1193 if (dtrace_ls_enabled) {
1194 /*
1195 * Either sleeping or spinning is happening,
1196 * start a timing of our delay interval now.
1197 */
1198 readers_at_sleep = lck->lck_rw_shared_count;
1199 wait_interval = mach_absolute_time();
1200 }
2d21ac55
A
1201 }
1202#endif
0a7de745 1203 if (istate == -1) {
b0d623f7 1204 istate = ml_get_interrupts_enabled();
0a7de745 1205 }
b0d623f7
A
1206
1207 deadline = lck_rw_deadline_for_spin(lck);
1208
3e170ce0 1209 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7 1210
0a7de745 1211 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
b0d623f7 1212 lck_rw_lock_pause(istate);
0a7de745 1213 }
b0d623f7 1214
3e170ce0 1215 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
b0d623f7 1216
0a7de745 1217 if (!lockheld) {
b0d623f7 1218 break;
0a7de745 1219 }
b0d623f7
A
1220 /*
1221 * if we get here, the deadline has expired w/o us
1222 * being able to grab the lock exclusively
1223 * check to see if we're allowed to do a thread_block
1224 */
1225 if (lck->lck_rw_can_sleep) {
91447636 1226 istate = lck_interlock_lock(lck);
91447636 1227
b0d623f7 1228 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
3e170ce0 1229 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7
A
1230
1231 lck->lck_w_waiting = TRUE;
1232
813fb2f6 1233 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1234 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
0a7de745 1235 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1236 lck_interlock_unlock(lck, istate);
b0d623f7
A
1237
1238 if (res == THREAD_WAITING) {
1239 res = thread_block(THREAD_CONTINUE_NULL);
1240 slept++;
1241 }
3e170ce0 1242 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
b0d623f7
A
1243 } else {
1244 lck_interlock_unlock(lck, istate);
1245 /*
1246 * must own the lock now, since we checked for
1247 * readers or upgrade owner behind the interlock
1248 * no need for a call to 'lck_rw_held_read_or_upgrade'
1249 */
1250 break;
91447636
A
1251 }
1252 }
91447636
A
1253 }
1254
0a7de745 1255#if CONFIG_DTRACE
2d21ac55
A
1256 /*
1257 * Decide what latencies we suffered that are Dtrace events.
1258 * If we have set wait_interval, then we either spun or slept.
1259 * At least we get out from under the interlock before we record
1260 * which is the best we can do here to minimize the impact
1261 * of the tracing.
1262 * If we have set wait_interval to -1, then dtrace was not enabled when we
1263 * started sleeping/spinning so we don't record this event.
1264 */
b0d623f7 1265 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1266 if (slept == 0) {
0a7de745 1267 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
2d21ac55
A
1268 mach_absolute_time() - wait_interval, 1);
1269 } else {
1270 /*
1271 * For the blocking case, we also record if when we blocked
1272 * it was held for read or write, and how many readers.
1273 * Notice that above we recorded this before we dropped
1274 * the interlock so the count is accurate.
1275 */
0a7de745 1276 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
2d21ac55
A
1277 mach_absolute_time() - wait_interval, 1,
1278 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1279 }
1280 }
1281 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1282#endif
91447636
A
1283}
1284
5ba3f43e
A
1285/*
1286 * Routine: lck_rw_done
1287 */
1288
0a7de745
A
1289lck_rw_type_t
1290lck_rw_done(lck_rw_t *lock)
5ba3f43e 1291{
0a7de745 1292 uint32_t data, prev;
5ba3f43e 1293
0a7de745 1294 for (;;) {
5ba3f43e 1295 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
0a7de745 1296 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
5ba3f43e
A
1297 atomic_exchange_abort();
1298 lck_rw_interlock_spin(lock);
1299 continue;
1300 }
1301 if (data & LCK_RW_SHARED_MASK) {
1302 data -= LCK_RW_SHARED_READER;
0a7de745 1303 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
5ba3f43e 1304 goto check_waiters;
0a7de745
A
1305 }
1306 } else { /* if reader count == 0, must be exclusive lock */
5ba3f43e
A
1307 if (data & LCK_RW_WANT_UPGRADE) {
1308 data &= ~(LCK_RW_WANT_UPGRADE);
1309 } else {
0a7de745 1310 if (data & LCK_RW_WANT_WRITE) {
5ba3f43e 1311 data &= ~(LCK_RW_WANT_EXCL);
0a7de745 1312 } else { /* lock is not 'owned', panic */
5ba3f43e 1313 panic("Releasing non-exclusive RW lock without a reader refcount!");
0a7de745 1314 }
5ba3f43e
A
1315 }
1316check_waiters:
1317 if (prev & LCK_RW_W_WAITING) {
1318 data &= ~(LCK_RW_W_WAITING);
0a7de745 1319 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
5ba3f43e 1320 data &= ~(LCK_RW_R_WAITING);
0a7de745
A
1321 }
1322 } else {
5ba3f43e 1323 data &= ~(LCK_RW_R_WAITING);
0a7de745 1324 }
5ba3f43e 1325 }
0a7de745 1326 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
5ba3f43e 1327 break;
0a7de745 1328 }
5ba3f43e
A
1329 cpu_pause();
1330 }
1331 return lck_rw_done_gen(lock, prev);
1332}
91447636
A
1333
1334/*
2d21ac55 1335 * Routine: lck_rw_done_gen
b0d623f7 1336 *
5ba3f43e 1337 * called from lck_rw_done()
b0d623f7 1338 * prior_lock_state is the value in the 1st
0a7de745 1339 * word of the lock at the time of a successful
b0d623f7 1340 * atomic compare and exchange with the new value...
0a7de745 1341 * it represents the state of the lock before we
b0d623f7 1342 * decremented the rw_shared_count or cleared either
0a7de745 1343 * rw_want_upgrade or rw_want_write and
b0d623f7 1344 * the lck_x_waiting bits... since the wrapper
0a7de745 1345 * routine has already changed the state atomically,
b0d623f7
A
1346 * we just need to decide if we should
1347 * wake up anyone and what value to return... we do
1348 * this by examining the state of the lock before
1349 * we changed it
91447636 1350 */
5ba3f43e 1351static lck_rw_type_t
2d21ac55 1352lck_rw_done_gen(
0a7de745
A
1353 lck_rw_t *lck,
1354 uint32_t prior_lock_state)
91447636 1355{
0a7de745
A
1356 lck_rw_t *fake_lck;
1357 lck_rw_type_t lock_type;
1358 thread_t thread;
1359 uint32_t rwlock_count;
39236c6e 1360
0a7de745
A
1361 thread = current_thread();
1362 rwlock_count = thread->rwlock_count--;
b0d623f7 1363 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1364
0a7de745
A
1365 if (lck->lck_rw_can_sleep) {
1366 /*
1367 * prior_lock_state is a snapshot of the 1st word of the
1368 * lock in question... we'll fake up a pointer to it
1369 * and carefully not access anything beyond what's defined
1370 * in the first word of a lck_rw_t
1371 */
91447636 1372
0a7de745
A
1373 if (fake_lck->lck_rw_shared_count <= 1) {
1374 if (fake_lck->lck_w_waiting) {
1375 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1376 }
2d21ac55 1377
0a7de745
A
1378 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1379 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1380 }
1381 }
fe8ab488 1382#if MACH_LDEBUG
0a7de745
A
1383 if (rwlock_count == 0) {
1384 panic("rw lock count underflow for thread %p", thread);
1385 }
fe8ab488 1386#endif
0a7de745
A
1387 /* Check if dropping the lock means that we need to unpromote */
1388
1389 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1390 /* sched_flags checked without lock, but will be rechecked while clearing */
1391 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1392 }
1393 }
1394 if (fake_lck->lck_rw_shared_count) {
1395 lock_type = LCK_RW_TYPE_SHARED;
1396 } else {
1397 lock_type = LCK_RW_TYPE_EXCLUSIVE;
fe8ab488
A
1398 }
1399
2d21ac55 1400#if CONFIG_DTRACE
b0d623f7 1401 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2d21ac55
A
1402#endif
1403
0a7de745 1404 return lock_type;
91447636
A
1405}
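/*
 * Illustrative sketch (not part of the file): the fake_lck trick above relies
 * on the first 32-bit word of a lck_rw_t being readable through the structure's
 * bitfields, so a saved snapshot can be decoded the same way. The example_*
 * name is hypothetical.
 */
static boolean_t
example_snapshot_had_writer_waiter(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	return fake_lck->lck_w_waiting ? TRUE : FALSE;
}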
1406
1407
91447636
A
1408/*
1409 * Routine: lck_rw_unlock
1410 */
1411void
1412lck_rw_unlock(
0a7de745
A
1413 lck_rw_t *lck,
1414 lck_rw_type_t lck_rw_type)
91447636 1415{
0a7de745 1416 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
91447636 1417 lck_rw_unlock_shared(lck);
0a7de745 1418 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
91447636 1419 lck_rw_unlock_exclusive(lck);
0a7de745 1420 } else {
91447636 1421 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
0a7de745 1422 }
91447636
A
1423}
1424
1425
1426/*
1427 * Routine: lck_rw_unlock_shared
1428 */
1429void
1430lck_rw_unlock_shared(
0a7de745 1431 lck_rw_t *lck)
91447636 1432{
0a7de745 1433 lck_rw_type_t ret;
91447636 1434
a39ff7e2 1435 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
91447636
A
1436 ret = lck_rw_done(lck);
1437
0a7de745 1438 if (ret != LCK_RW_TYPE_SHARED) {
39037602 1439 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
0a7de745 1440 }
91447636
A
1441}
1442
1443
1444/*
1445 * Routine: lck_rw_unlock_exclusive
1446 */
1447void
1448lck_rw_unlock_exclusive(
0a7de745 1449 lck_rw_t *lck)
91447636 1450{
0a7de745 1451 lck_rw_type_t ret;
91447636
A
1452
1453 ret = lck_rw_done(lck);
1454
0a7de745 1455 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
91447636 1456 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
0a7de745 1457 }
91447636
A
1458}
1459
1460
1461/*
1462 * Routine: lck_rw_lock
1463 */
1464void
1465lck_rw_lock(
0a7de745
A
1466 lck_rw_t *lck,
1467 lck_rw_type_t lck_rw_type)
91447636 1468{
0a7de745 1469 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
91447636 1470 lck_rw_lock_shared(lck);
0a7de745 1471 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
91447636 1472 lck_rw_lock_exclusive(lck);
0a7de745 1473 } else {
91447636 1474 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
0a7de745 1475 }
91447636
A
1476}
1477
5ba3f43e
A
1478/*
1479 * Routine: lck_rw_lock_shared
1480 */
1481void
1482lck_rw_lock_shared(lck_rw_t *lock)
1483{
0a7de745 1484 uint32_t data, prev;
5ba3f43e
A
1485
1486 current_thread()->rwlock_count++;
0a7de745 1487 for (;;) {
5ba3f43e
A
1488 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1489 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1490 atomic_exchange_abort();
0a7de745
A
1491 if (lock->lck_rw_can_sleep) {
1492 lck_rw_lock_shared_gen(lock);
1493 } else {
1494 cpu_pause();
1495 continue;
1496 }
5ba3f43e
A
1497 break;
1498 }
1499 data += LCK_RW_SHARED_READER;
0a7de745 1500 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1501 break;
0a7de745 1502 }
5ba3f43e
A
1503 cpu_pause();
1504 }
0a7de745 1505#if CONFIG_DTRACE
5ba3f43e 1506 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
0a7de745 1507#endif /* CONFIG_DTRACE */
5ba3f43e
A
1508 return;
1509}
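/*
 * Usage sketch (not from the original source): the canonical reader-side
 * pairing for the fast path above.
 */
static void
example_read_side(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);	/* takes one shared (reader) reference */
	/* ... read the data this lock protects ... */
	lck_rw_unlock_shared(lock);	/* drops the reader reference */
}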
91447636
A
1510
1511/*
2d21ac55 1512 * Routine: lck_rw_lock_shared_gen
b0d623f7
A
1513 * Function:
1514 * assembly fast path code has determined that this lock
1515 * is held exclusively... this is where we spin/block
1516 * until we can acquire the lock in the shared mode
91447636 1517 */
5ba3f43e 1518static void
2d21ac55 1519lck_rw_lock_shared_gen(
0a7de745 1520 lck_rw_t *lck)
91447636 1521{
0a7de745
A
1522 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1523 uint64_t deadline = 0;
1524 int gotlock = 0;
1525 int slept = 0;
1526 wait_result_t res = 0;
1527 boolean_t istate = -1;
3e170ce0 1528
0a7de745 1529#if CONFIG_DTRACE
2d21ac55 1530 uint64_t wait_interval = 0;
b0d623f7
A
1531 int readers_at_sleep = 0;
1532 boolean_t dtrace_ls_initialized = FALSE;
1533 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
2d21ac55 1534#endif
91447636 1535
0a7de745
A
1536 while (!lck_rw_grab_shared(lck)) {
1537#if CONFIG_DTRACE
b0d623f7
A
1538 if (dtrace_ls_initialized == FALSE) {
1539 dtrace_ls_initialized = TRUE;
1540 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1541 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1542 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1543 if (dtrace_ls_enabled) {
1544 /*
1545 * Either sleeping or spinning is happening,
1546 * start a timing of our delay interval now.
1547 */
1548 readers_at_sleep = lck->lck_rw_shared_count;
1549 wait_interval = mach_absolute_time();
1550 }
1551 }
2d21ac55 1552#endif
0a7de745 1553 if (istate == -1) {
b0d623f7 1554 istate = ml_get_interrupts_enabled();
0a7de745 1555 }
91447636 1556
b0d623f7 1557 deadline = lck_rw_deadline_for_spin(lck);
0c530ab8 1558
b0d623f7 1559 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
0a7de745 1560 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
91447636 1561
0a7de745 1562 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
b0d623f7 1563 lck_rw_lock_pause(istate);
0a7de745 1564 }
b0d623f7
A
1565
1566 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
0a7de745 1567 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
b0d623f7 1568
0a7de745 1569 if (gotlock) {
b0d623f7 1570 break;
0a7de745 1571 }
b0d623f7
A
1572 /*
1573 * if we get here, the deadline has expired w/o us
1574 * being able to grab the lock for read
1575 * check to see if we're allowed to do a thread_block
1576 */
1577 if (lck->lck_rw_can_sleep) {
91447636 1578 istate = lck_interlock_lock(lck);
91447636 1579
b0d623f7
A
1580 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1581 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
b0d623f7 1582 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
0a7de745 1583 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
b0d623f7
A
1584
1585 lck->lck_r_waiting = TRUE;
1586
813fb2f6 1587 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
d9a64523 1588 res = assert_wait(RW_LOCK_READER_EVENT(lck),
0a7de745 1589 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1590 lck_interlock_unlock(lck, istate);
b0d623f7
A
1591
1592 if (res == THREAD_WAITING) {
1593 res = thread_block(THREAD_CONTINUE_NULL);
1594 slept++;
1595 }
1596 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
0a7de745 1597 trace_lck, res, slept, 0, 0);
b0d623f7
A
1598 } else {
1599 lck->lck_rw_shared_count++;
1600 lck_interlock_unlock(lck, istate);
1601 break;
91447636
A
1602 }
1603 }
91447636
A
1604 }
1605
0a7de745 1606#if CONFIG_DTRACE
b0d623f7 1607 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1608 if (slept == 0) {
0a7de745 1609 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2d21ac55 1610 } else {
0a7de745 1611 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
2d21ac55
A
1612 mach_absolute_time() - wait_interval, 0,
1613 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1614 }
1615 }
1616 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1617#endif
91447636
A
1618}
1619
1620
5ba3f43e
A
1621/*
1622 * Routine: lck_rw_lock_exclusive
1623 */
1624
1625void
1626lck_rw_lock_exclusive(lck_rw_t *lock)
1627{
1628 current_thread()->rwlock_count++;
1629 if (atomic_test_and_set32(&lock->data,
0a7de745
A
1630 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1631 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1632#if CONFIG_DTRACE
5ba3f43e 1633 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
0a7de745
A
1634#endif /* CONFIG_DTRACE */
1635 } else {
5ba3f43e 1636 lck_rw_lock_exclusive_gen(lock);
0a7de745 1637 }
5ba3f43e
A
1638}
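/*
 * Usage sketch (not from the original source): the matching writer-side
 * pairing for the exclusive fast path above.
 */
static void
example_write_side(lck_rw_t *lock)
{
	lck_rw_lock_exclusive(lock);	/* exclusive (writer) acquire */
	/* ... modify the data this lock protects ... */
	lck_rw_unlock_exclusive(lock);
}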
1639
1640
1641/*
1642 * Routine: lck_rw_lock_shared_to_exclusive
1643 */
1644
1645boolean_t
1646lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1647{
0a7de745 1648 uint32_t data, prev;
5ba3f43e 1649
0a7de745 1650 for (;;) {
5ba3f43e
A
1651 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1652 if (data & LCK_RW_INTERLOCK) {
1653 atomic_exchange_abort();
1654 lck_rw_interlock_spin(lock);
1655 continue;
1656 }
1657 if (data & LCK_RW_WANT_UPGRADE) {
1658 data -= LCK_RW_SHARED_READER;
0a7de745
A
1659 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1660 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1661 }
1662 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1663 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
0a7de745 1664 }
5ba3f43e 1665 } else {
0a7de745
A
1666 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1667 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1668 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1669 break;
0a7de745 1670 }
5ba3f43e
A
1671 }
1672 cpu_pause();
1673 }
0a7de745
A
1674 /* we now own the WANT_UPGRADE */
1675 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1676 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1677 }
1678#if CONFIG_DTRACE
5ba3f43e
A
1679 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1680#endif
1681 return TRUE;
1682}
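/*
 * Illustrative sketch (not in the original file) of the upgrade contract:
 * when lck_rw_lock_shared_to_exclusive() returns FALSE the shared hold has
 * already been dropped, so the caller must re-acquire before continuing.
 * The example_* name is hypothetical.
 */
static void
example_upgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);

	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* upgrade failed and our read hold is gone; take it exclusively */
		lck_rw_lock_exclusive(lock);
	}
	/* ... exclusive section ... */
	lck_rw_unlock_exclusive(lock);
}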
1683
1684
91447636 1685/*
b0d623f7 1686 * Routine: lck_rw_lock_shared_to_exclusive_failure
91447636 1687 * Function:
b0d623f7
A
1688 * assembly fast path code has already dropped our read
1689 * count and determined that someone else owns 'lck_rw_want_upgrade'
1690 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1691 * all we need to do here is determine if a wakeup is needed
91447636 1692 */
5ba3f43e 1693static boolean_t
b0d623f7 1694lck_rw_lock_shared_to_exclusive_failure(
0a7de745
A
1695 lck_rw_t *lck,
1696 uint32_t prior_lock_state)
91447636 1697{
0a7de745
A
1698 lck_rw_t *fake_lck;
1699 thread_t thread = current_thread();
1700 uint32_t rwlock_count;
39236c6e
A
1701
1702 /* Check if dropping the lock means that we need to unpromote */
1703 rwlock_count = thread->rwlock_count--;
1704#if MACH_LDEBUG
1705 if (rwlock_count == 0) {
1706 panic("rw lock count underflow for thread %p", thread);
1707 }
1708#endif
b0d623f7 1709 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1710
b0d623f7 1711 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
91447636
A
1712 /*
1713 * Someone else has requested upgrade.
b0d623f7
A
1714 * Since we've released the read lock, wake
1715 * him up if he's blocked waiting
91447636 1716 */
b0d623f7
A
1717 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1718 }
5ba3f43e
A
1719
1720 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1721 /* sched_flags checked without lock, but will be rechecked while clearing */
d9a64523 1722 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
5ba3f43e
A
1723 }
1724
b0d623f7 1725 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
0a7de745 1726 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
91447636 1727
0a7de745 1728 return FALSE;
b0d623f7 1729}
91447636 1730
91447636 1731
b0d623f7
A
1732/*
 1733 * Routine: lck_rw_lock_shared_to_exclusive_success
1734 * Function:
1735 * assembly fast path code has already dropped our read
1736 * count and successfully acquired 'lck_rw_want_upgrade'
1737 * we just need to wait for the rest of the readers to drain
1738 * and then we can return as the exclusive holder of this lock
1739 */
5ba3f43e 1740static boolean_t
b0d623f7 1741lck_rw_lock_shared_to_exclusive_success(
0a7de745 1742 lck_rw_t *lck)
b0d623f7 1743{
0a7de745
A
1744 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1745 uint64_t deadline = 0;
1746 int slept = 0;
1747 int still_shared = 0;
1748 wait_result_t res;
1749 boolean_t istate = -1;
91447636 1750
0a7de745 1751#if CONFIG_DTRACE
b0d623f7
A
1752 uint64_t wait_interval = 0;
1753 int readers_at_sleep = 0;
1754 boolean_t dtrace_ls_initialized = FALSE;
1755 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1756#endif
91447636 1757
2d21ac55 1758 while (lck->lck_rw_shared_count != 0) {
0a7de745 1759#if CONFIG_DTRACE
b0d623f7
A
1760 if (dtrace_ls_initialized == FALSE) {
1761 dtrace_ls_initialized = TRUE;
1762 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1763 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1764 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1765 if (dtrace_ls_enabled) {
1766 /*
1767 * Either sleeping or spinning is happening,
1768 * start a timing of our delay interval now.
1769 */
1770 readers_at_sleep = lck->lck_rw_shared_count;
1771 wait_interval = mach_absolute_time();
1772 }
2d21ac55
A
1773 }
1774#endif
0a7de745 1775 if (istate == -1) {
b0d623f7 1776 istate = ml_get_interrupts_enabled();
0a7de745 1777 }
b0d623f7
A
1778
1779 deadline = lck_rw_deadline_for_spin(lck);
1780
1781 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
0a7de745 1782 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1783
0a7de745 1784 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
b0d623f7 1785 lck_rw_lock_pause(istate);
0a7de745 1786 }
b0d623f7
A
1787
1788 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
0a7de745 1789 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1790
0a7de745 1791 if (!still_shared) {
b0d623f7 1792 break;
0a7de745 1793 }
b0d623f7
A
1794 /*
1795 * if we get here, the deadline has expired w/o
1796 * the rw_shared_count having drained to 0
1797 * check to see if we're allowed to do a thread_block
1798 */
1799 if (lck->lck_rw_can_sleep) {
91447636 1800 istate = lck_interlock_lock(lck);
0a7de745 1801
b0d623f7
A
1802 if (lck->lck_rw_shared_count != 0) {
1803 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
0a7de745 1804 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1805
1806 lck->lck_w_waiting = TRUE;
91447636 1807
813fb2f6 1808 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
d9a64523 1809 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
0a7de745 1810 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1811 lck_interlock_unlock(lck, istate);
b0d623f7
A
1812
1813 if (res == THREAD_WAITING) {
1814 res = thread_block(THREAD_CONTINUE_NULL);
1815 slept++;
1816 }
1817 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
0a7de745 1818 trace_lck, res, slept, 0, 0);
b0d623f7
A
1819 } else {
1820 lck_interlock_unlock(lck, istate);
1821 break;
91447636
A
1822 }
1823 }
91447636 1824 }
0a7de745 1825#if CONFIG_DTRACE
2d21ac55
A
1826 /*
1827 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1828 */
b0d623f7 1829 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1830 if (slept == 0) {
0a7de745 1831 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2d21ac55 1832 } else {
0a7de745 1833 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
2d21ac55
A
1834 mach_absolute_time() - wait_interval, 1,
1835 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1836 }
1837 }
2d21ac55
A
1838 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1839#endif
0a7de745 1840 return TRUE;
91447636
A
1841}
1842
5ba3f43e
A
1843/*
1844 * Routine: lck_rw_lock_exclusive_to_shared
1845 */
1846
0a7de745
A
1847void
1848lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
5ba3f43e 1849{
0a7de745 1850 uint32_t data, prev;
5ba3f43e 1851
0a7de745 1852 for (;;) {
5ba3f43e
A
1853 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1854 if (data & LCK_RW_INTERLOCK) {
1855 atomic_exchange_abort();
0a7de745 1856 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
5ba3f43e
A
1857 continue;
1858 }
1859 data += LCK_RW_SHARED_READER;
0a7de745 1860 if (data & LCK_RW_WANT_UPGRADE) {
5ba3f43e 1861 data &= ~(LCK_RW_WANT_UPGRADE);
0a7de745 1862 } else {
5ba3f43e 1863 data &= ~(LCK_RW_WANT_EXCL);
0a7de745
A
1864 }
1865 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
5ba3f43e 1866 data &= ~(LCK_RW_W_WAITING);
0a7de745
A
1867 }
1868 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
5ba3f43e 1869 break;
0a7de745 1870 }
5ba3f43e
A
1871 cpu_pause();
1872 }
1873 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1874}
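/*
 * Illustrative sketch (hypothetical downgrade_example/rwl names, not part
 * of this file; assumes the standard lck_rw KPIs): publish under the
 * exclusive hold, then downgrade in place so other readers can proceed
 * while this thread keeps a stable read-only view of what it just wrote.
 */
#if 0	/* sketch, not built */
static void
downgrade_example(lck_rw_t *rwl)
{
	lck_rw_lock_exclusive(rwl);
	/* ... mutate the protected state ... */

	/* keep a read hold on the new state while letting other readers in */
	lck_rw_lock_exclusive_to_shared(rwl);
	/* ... read-only use of the state ... */
	lck_rw_unlock_shared(rwl);
}
#endif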
1875
b0d623f7 1876
91447636 1877/*
5ba3f43e 1878 * Routine: lck_rw_lock_exclusive_to_shared_gen
0a7de745 1879 * Function:
b0d623f7
A
1880 * assembly fast path has already dropped
1881 * our exclusive state and bumped lck_rw_shared_count
1882 * all we need to do here is determine if anyone
1883 * needs to be awakened.
91447636 1884 */
5ba3f43e 1885static void
b0d623f7 1886lck_rw_lock_exclusive_to_shared_gen(
0a7de745
A
1887 lck_rw_t *lck,
1888 uint32_t prior_lock_state)
91447636 1889{
0a7de745
A
1890 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1891 lck_rw_t *fake_lck;
91447636 1892
b0d623f7 1893 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1894
b0d623f7 1895 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
0a7de745 1896 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
91447636 1897
b0d623f7
A
1898 /*
1899 * don't wake up anyone waiting to take the lock exclusively
1900 * since we hold a read count... when the read count drops to 0,
1901 * the writers will be woken.
1902 *
1903 * wake up any waiting readers if we don't have any writers waiting,
1904 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1905 */
0a7de745 1906 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
2d21ac55 1907 thread_wakeup(RW_LOCK_READER_EVENT(lck));
0a7de745 1908 }
91447636
A
1909
1910 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
0a7de745 1911 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
91447636 1912
2d21ac55
A
1913#if CONFIG_DTRACE
1914 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1915#endif
91447636
A
1916}
1917
1918
1919/*
1920 * Routine: lck_rw_try_lock
1921 */
1922boolean_t
1923lck_rw_try_lock(
0a7de745
A
1924 lck_rw_t *lck,
1925 lck_rw_type_t lck_rw_type)
1926{
1927 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1928 return lck_rw_try_lock_shared(lck);
1929 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1930 return lck_rw_try_lock_exclusive(lck);
1931 } else {
91447636 1932 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
0a7de745
A
1933 }
1934 return FALSE;
91447636
A
1935}
1936
5ba3f43e
A
1937/*
1938 * Routine: lck_rw_try_lock_shared
1939 */
1940
0a7de745
A
1941boolean_t
1942lck_rw_try_lock_shared(lck_rw_t *lock)
5ba3f43e 1943{
0a7de745 1944 uint32_t data, prev;
5ba3f43e 1945
0a7de745 1946 for (;;) {
5ba3f43e
A
1947 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1948 if (data & LCK_RW_INTERLOCK) {
1949 atomic_exchange_abort();
1950 lck_rw_interlock_spin(lock);
1951 continue;
1952 }
1953 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1954 atomic_exchange_abort();
0a7de745 1955 return FALSE; /* lock is busy */
5ba3f43e 1956 }
0a7de745
A
1957 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1958 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1959 break;
0a7de745 1960 }
5ba3f43e
A
1961 cpu_pause();
1962 }
1963 current_thread()->rwlock_count++;
1964 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
0a7de745 1965#if CONFIG_DTRACE
5ba3f43e 1966 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
0a7de745 1967#endif /* CONFIG_DTRACE */
5ba3f43e
A
1968 return TRUE;
1969}
1970
1971
1972/*
1973 * Routine: lck_rw_try_lock_exclusive
1974 */
1975
0a7de745
A
1976boolean_t
1977lck_rw_try_lock_exclusive(lck_rw_t *lock)
5ba3f43e 1978{
0a7de745 1979 uint32_t data, prev;
5ba3f43e 1980
0a7de745 1981 for (;;) {
5ba3f43e
A
1982 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1983 if (data & LCK_RW_INTERLOCK) {
1984 atomic_exchange_abort();
1985 lck_rw_interlock_spin(lock);
1986 continue;
1987 }
1988 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1989 atomic_exchange_abort();
0a7de745 1990 return FALSE; /* can't get it */
5ba3f43e
A
1991 }
1992 data |= LCK_RW_WANT_EXCL;
0a7de745 1993 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1994 break;
0a7de745 1995 }
5ba3f43e
A
1996 cpu_pause();
1997 }
1998
1999 current_thread()->rwlock_count++;
0a7de745 2000#if CONFIG_DTRACE
5ba3f43e 2001 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
0a7de745 2002#endif /* CONFIG_DTRACE */
5ba3f43e
A
2003 return TRUE;
2004}
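/*
 * Illustrative sketch (hypothetical try_work_example/rwl names, not part
 * of this file): the try variants return immediately instead of blocking
 * when the lock is busy, so they suit paths that must not sleep; the
 * caller is expected to handle the FALSE case.
 */
#if 0	/* sketch, not built */
static boolean_t
try_work_example(lck_rw_t *rwl)
{
	if (!lck_rw_try_lock_exclusive(rwl)) {
		return FALSE;		/* busy: caller retries later */
	}
	/* ... exclusive work that could not wait for the lock ... */
	lck_rw_unlock_exclusive(rwl);
	return TRUE;
}
#endif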
2005
91447636 2006
2d21ac55
A
2007void
2008lck_rw_assert(
0a7de745
A
2009 lck_rw_t *lck,
2010 unsigned int type)
2d21ac55
A
2011{
2012 switch (type) {
2013 case LCK_RW_ASSERT_SHARED:
2014 if (lck->lck_rw_shared_count != 0) {
2015 return;
2016 }
2017 break;
2018 case LCK_RW_ASSERT_EXCLUSIVE:
2019 if ((lck->lck_rw_want_write ||
0a7de745 2020 lck->lck_rw_want_upgrade) &&
2d21ac55
A
2021 lck->lck_rw_shared_count == 0) {
2022 return;
2023 }
2024 break;
2025 case LCK_RW_ASSERT_HELD:
2026 if (lck->lck_rw_want_write ||
2027 lck->lck_rw_want_upgrade ||
2028 lck->lck_rw_shared_count != 0) {
2029 return;
2030 }
2031 break;
39236c6e
A
2032 case LCK_RW_ASSERT_NOTHELD:
2033 if (!(lck->lck_rw_want_write ||
0a7de745
A
2034 lck->lck_rw_want_upgrade ||
2035 lck->lck_rw_shared_count != 0)) {
39236c6e
A
2036 return;
2037 }
2038 break;
2d21ac55
A
2039 default:
2040 break;
2041 }
2042
39236c6e
A
2043 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2044}
2045
2046/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
2047void
2048lck_rw_clear_promotions_x86(thread_t thread)
2049{
2050#if MACH_LDEBUG
2051 /* It's fatal to leave a RW lock locked and return to userspace */
2052 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2053#else
2054 /* Paper over the issue */
2055 thread->rwlock_count = 0;
d9a64523 2056 lck_rw_clear_promotion(thread, 0);
39236c6e 2057#endif
2d21ac55
A
2058}
2059
5ba3f43e
A
2060boolean_t
2061lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2062{
2063 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2064
2065 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2066 lck_rw_unlock_shared(lck);
2067 mutex_pause(2);
2068 lck_rw_lock_shared(lck);
2069 return TRUE;
2070 }
2071
2072 return FALSE;
2073}
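/*
 * Illustrative sketch (hypothetical long_scan_example/rwl/nelem names, not
 * part of this file): a long read-side scan can call
 * lck_rw_lock_yield_shared() periodically so waiting writers are not
 * starved; a TRUE return means the lock was dropped and retaken, so any
 * state captured under the previous hold must be revalidated.
 */
#if 0	/* sketch, not built */
static void
long_scan_example(lck_rw_t *rwl, int nelem)
{
	int i;

	lck_rw_lock_shared(rwl);
	for (i = 0; i < nelem; i++) {
		/* ... examine one element ... */
		if (lck_rw_lock_yield_shared(rwl, FALSE)) {
			/* lock was dropped and reacquired: revalidate iterators */
		}
	}
	lck_rw_unlock_shared(rwl);
}
#endif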
39236c6e 2074
3e170ce0
A
2075/*
2076 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2077 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2078 */
2079boolean_t
0a7de745
A
2080kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2081{
3e170ce0
A
2082 if (not_in_kdp) {
2083 panic("panic: rw lock exclusive check done outside of kernel debugger");
2084 }
2085 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2086}
2087
d9a64523
A
2088/*
2089 * Slow path routines for lck_mtx locking and unlocking functions.
2090 *
2091 * These functions were previously implemented in x86 assembly,
 2092 * and some optimizations are in place in this C code so that the compiled code
 2093 * is as performant and compact as the assembly version.
 2094 *
 2095 * To avoid inlining these functions on the fast path, all functions directly called by
 2096 * the fast paths are marked __attribute__((noinline)). They are also implemented
 2097 * in such a way that the fast path can tail call into them. In this way the return address
 2098 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
 2099 *
 2100 * Slow path code is structured so that there are no calls to functions that return
 2101 * in the context of the caller function, i.e. all functions called are either tail-call functions
 2102 * or inline functions. The tail-call functions take fewer than six arguments,
 2103 * so that the arguments can be passed in registers and do not need to be pushed on the stack.
 2104 * This allows the compiler to avoid creating a stack frame for these functions.
 2105 *
 2106 * __improbable and __probable are used to compile the slow path code in such a way
 2107 * that the fast path case runs on a sequence of instructions with as few jumps as possible,
 2108 * making this case the most optimized even when falling through the slow path.
2109 */
2110
2111/*
2112 * Intel lock invariants:
2113 *
2114 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2115 * lck_mtx_pri: contains the max priority of all waiters during a contention period
2116 * not cleared on last unlock, but stomped over on next first contention
2117 * lck_mtx_promoted: set when the current lock owner has been promoted
2118 * cleared when lock owner unlocks, set on acquire or wait.
2119 *
2120 * The lock owner is promoted to the max priority of all its waiters only if it
 2121 * was at a lower priority when it acquired the lock, or was already the owner when a waiter blocked.
2122 * Max priority is capped at MAXPRI_PROMOTE.
2123 *
2124 * The last waiter will not be promoted as it is woken up, but the last
2125 * lock owner may not have been the last thread to have been woken up depending on the
2126 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2127 * flag set.
2128 *
2129 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2130 * priority from dropping priority in the future without having to take thread lock
2131 * on acquire.
2132 */
3e170ce0 2133
0a7de745 2134#ifdef MUTEX_ZONE
6d2010ae
A
2135extern zone_t lck_mtx_zone;
2136#endif
d9a64523 2137
91447636
A
2138/*
2139 * Routine: lck_mtx_alloc_init
2140 */
2141lck_mtx_t *
2142lck_mtx_alloc_init(
0a7de745
A
2143 lck_grp_t *grp,
2144 lck_attr_t *attr)
91447636 2145{
0a7de745
A
2146 lck_mtx_t *lck;
2147#ifdef MUTEX_ZONE
2148 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0) {
6d2010ae 2149 lck_mtx_init(lck, grp, attr);
0a7de745 2150 }
6d2010ae 2151#else
0a7de745 2152 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0) {
91447636 2153 lck_mtx_init(lck, grp, attr);
0a7de745
A
2154 }
2155#endif
2156 return lck;
91447636
A
2157}
2158
2159/*
2160 * Routine: lck_mtx_free
2161 */
2162void
2163lck_mtx_free(
0a7de745
A
2164 lck_mtx_t *lck,
2165 lck_grp_t *grp)
91447636
A
2166{
2167 lck_mtx_destroy(lck, grp);
0a7de745 2168#ifdef MUTEX_ZONE
6d2010ae
A
2169 zfree(lck_mtx_zone, lck);
2170#else
91447636 2171 kfree(lck, sizeof(lck_mtx_t));
6d2010ae 2172#endif
91447636
A
2173}
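/*
 * Illustrative lifecycle sketch (hypothetical mutex_lifecycle_example and
 * "example" group names, not part of this file; assumes the standard
 * lck_grp/lck_mtx KPIs): allocate a group and a mutex, use the mutex, then
 * free the mutex before releasing the group.
 */
#if 0	/* sketch, not built */
static void
mutex_lifecycle_example(void)
{
	lck_grp_t *grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(mtx);
	/* ... critical section ... */
	lck_mtx_unlock(mtx);

	lck_mtx_free(mtx, grp);		/* destroys the lock and frees its memory */
	lck_grp_free(grp);
}
#endif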
2174
2175/*
2176 * Routine: lck_mtx_ext_init
2177 */
2178static void
2179lck_mtx_ext_init(
0a7de745
A
2180 lck_mtx_ext_t *lck,
2181 lck_grp_t *grp,
2182 lck_attr_t *attr)
91447636 2183{
2d21ac55 2184 bzero((void *)lck, sizeof(lck_mtx_ext_t));
91447636
A
2185
2186 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636
A
2187 lck->lck_mtx_deb.type = MUTEX_TAG;
2188 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2189 }
2190
2191 lck->lck_mtx_grp = grp;
2d21ac55 2192
0a7de745 2193 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
6d2010ae 2194 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
0a7de745 2195 }
b0d623f7 2196
6d2010ae 2197 lck->lck_mtx.lck_mtx_is_ext = 1;
39037602 2198 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2199}
2200
2201/*
2202 * Routine: lck_mtx_init
2203 */
2204void
2205lck_mtx_init(
0a7de745
A
2206 lck_mtx_t *lck,
2207 lck_grp_t *grp,
2208 lck_attr_t *attr)
91447636 2209{
0a7de745
A
2210 lck_mtx_ext_t *lck_ext;
2211 lck_attr_t *lck_attr;
2d21ac55 2212
0a7de745 2213 if (attr != LCK_ATTR_NULL) {
2d21ac55 2214 lck_attr = attr;
0a7de745 2215 } else {
2d21ac55 2216 lck_attr = &LockDefaultLckAttr;
0a7de745 2217 }
91447636 2218
2d21ac55 2219 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636 2220 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
0a7de745 2221 lck_mtx_ext_init(lck_ext, grp, lck_attr);
91447636
A
2222 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2223 lck->lck_mtx_ptr = lck_ext;
2224 }
2225 } else {
b0d623f7 2226 lck->lck_mtx_owner = 0;
6d2010ae 2227 lck->lck_mtx_state = 0;
91447636 2228 }
39037602 2229 lck->lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2230 lck_grp_reference(grp);
2231 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2232}
2233
2d21ac55
A
2234/*
2235 * Routine: lck_mtx_init_ext
2236 */
2237void
2238lck_mtx_init_ext(
0a7de745
A
2239 lck_mtx_t *lck,
2240 lck_mtx_ext_t *lck_ext,
2241 lck_grp_t *grp,
2242 lck_attr_t *attr)
2d21ac55 2243{
0a7de745 2244 lck_attr_t *lck_attr;
2d21ac55 2245
0a7de745 2246 if (attr != LCK_ATTR_NULL) {
2d21ac55 2247 lck_attr = attr;
0a7de745 2248 } else {
2d21ac55 2249 lck_attr = &LockDefaultLckAttr;
0a7de745 2250 }
2d21ac55
A
2251
2252 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2253 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2254 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2255 lck->lck_mtx_ptr = lck_ext;
2256 } else {
b0d623f7 2257 lck->lck_mtx_owner = 0;
6d2010ae 2258 lck->lck_mtx_state = 0;
2d21ac55 2259 }
39037602 2260 lck->lck_mtx_pad32 = 0xFFFFFFFF;
6d2010ae 2261
2d21ac55
A
2262 lck_grp_reference(grp);
2263 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2264}
2265
d9a64523
A
2266static void
2267lck_mtx_lock_mark_destroyed(
2268 lck_mtx_t *mutex,
2269 boolean_t indirect)
2270{
2271 uint32_t state;
2272
2273 if (indirect) {
2274 /* convert to destroyed state */
2275 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2276 return;
2277 }
2278
2279 state = ordered_load_mtx_state(mutex);
2280 lck_mtx_interlock_lock(mutex, &state);
2281
2282 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2283
2284 enable_preemption();
2285}
2286
91447636
A
2287/*
2288 * Routine: lck_mtx_destroy
2289 */
2290void
2291lck_mtx_destroy(
0a7de745
A
2292 lck_mtx_t *lck,
2293 lck_grp_t *grp)
91447636 2294{
d9a64523 2295 boolean_t indirect;
0a7de745
A
2296
2297 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
91447636 2298 return;
0a7de745 2299 }
39236c6e
A
2300#if MACH_LDEBUG
2301 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2302#endif
d9a64523 2303 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
b0d623f7 2304
d9a64523 2305 lck_mtx_lock_mark_destroyed(lck, indirect);
b0d623f7 2306
0a7de745 2307 if (indirect) {
91447636 2308 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
0a7de745 2309 }
91447636
A
2310 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2311 lck_grp_deallocate(grp);
2312 return;
2313}
2314
b0d623f7 2315
d9a64523
A
2316#if DEVELOPMENT | DEBUG
2317__attribute__((noinline))
2318void
2319lck_mtx_owner_check_panic(
2320 lck_mtx_t *lock)
2321{
2322 thread_t owner = (thread_t)lock->lck_mtx_owner;
2323 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2324}
2325#endif
2326
2327__attribute__((always_inline))
2328static boolean_t
2329get_indirect_mutex(
2330 lck_mtx_t **lock,
0a7de745 2331 uint32_t *state)
d9a64523
A
2332{
2333 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2334 *state = ordered_load_mtx_state(*lock);
2335 return TRUE;
2336}
2337
2338/*
0a7de745 2339 * Routine: lck_mtx_unlock_slow
d9a64523
A
2340 *
2341 * Unlocks a mutex held by current thread.
2342 *
2343 * It will wake up waiters if necessary and
2344 * drop promotions.
2345 *
2346 * Interlock can be held.
2347 */
2348__attribute__((noinline))
2349void
2350lck_mtx_unlock_slow(
0a7de745 2351 lck_mtx_t *lock)
d9a64523 2352{
0a7de745
A
2353 thread_t thread;
2354 uint32_t state, prev;
2355 boolean_t indirect = FALSE;
d9a64523
A
2356
2357 state = ordered_load_mtx_state(lock);
2358
2359 /* Is this an indirect mutex? */
2360 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2361 indirect = get_indirect_mutex(&lock, &state);
2362 }
2363
2364 thread = current_thread();
2365
2366#if DEVELOPMENT | DEBUG
2367 thread_t owner = (thread_t)lock->lck_mtx_owner;
0a7de745 2368 if (__improbable(owner != thread)) {
d9a64523 2369 return lck_mtx_owner_check_panic(lock);
0a7de745 2370 }
d9a64523
A
2371#endif
2372
2373 /* check if it is held as a spinlock */
0a7de745 2374 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
d9a64523 2375 goto unlock;
0a7de745 2376 }
d9a64523
A
2377
2378 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2379
2380unlock:
2381 /* preemption disabled, interlock held and mutex not held */
2382
2383 /* clear owner */
2384 ordered_store_mtx_owner(lock, 0);
2385 /* keep original state in prev for later evaluation */
2386 prev = state;
2387 /* release interlock, promotion and clear spin flag */
2388 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
0a7de745
A
2389 if ((state & LCK_MTX_WAITERS_MSK)) {
2390 state -= LCK_MTX_WAITER; /* decrement waiter count */
2391 }
2392 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
d9a64523 2393
0a7de745 2394#if MACH_LDEBUG
d9a64523 2395 /* perform lock statistics after drop to prevent delay */
0a7de745
A
2396 if (thread) {
2397 thread->mutex_count--; /* lock statistic */
2398 }
2399#endif /* MACH_LDEBUG */
d9a64523
A
2400
2401 /* check if there are waiters to wake up or priority to drop */
0a7de745 2402 if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK))) {
d9a64523 2403 return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
0a7de745 2404 }
d9a64523
A
2405
2406 /* re-enable preemption */
2407 lck_mtx_unlock_finish_inline(lock, FALSE);
2408
2409 return;
2410}
2411
0a7de745
A
2412#define LCK_MTX_LCK_WAIT_CODE 0x20
2413#define LCK_MTX_LCK_WAKEUP_CODE 0x21
2414#define LCK_MTX_LCK_SPIN_CODE 0x22
2415#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2416#define LCK_MTX_LCK_DEMOTE_CODE 0x24
b0d623f7 2417
b0d623f7 2418/*
d9a64523 2419 * Routine: lck_mtx_unlock_wakeup_tail
b0d623f7 2420 *
d9a64523
A
2421 * Invoked on unlock when there is
2422 * contention, i.e. the assembly routine sees
2423 * that mutex->lck_mtx_waiters != 0 or
2424 * that mutex->lck_mtx_promoted != 0
b0d623f7 2425 *
6d2010ae 2426 * neither the mutex nor the interlock is held
d9a64523
A
2427 *
2428 * Note that this routine might not be called if there are pending
2429 * waiters which have previously been woken up, and they didn't
2430 * end up boosting the old owner.
2431 *
2432 * assembly routine previously did the following to mutex:
2433 * (after saving the state in prior_lock_state)
2434 * cleared lck_mtx_promoted
2435 * decremented lck_mtx_waiters if nonzero
2436 *
2437 * This function needs to be called as a tail call
2438 * to optimize the compiled code.
b0d623f7 2439 */
d9a64523
A
2440__attribute__((noinline))
2441static void
0a7de745
A
2442lck_mtx_unlock_wakeup_tail(
2443 lck_mtx_t *mutex,
2444 int prior_lock_state,
2445 boolean_t indirect)
b0d623f7 2446{
0a7de745
A
2447 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2448 lck_mtx_t fake_lck;
6d2010ae
A
2449
2450 /*
 2451 * prior_lock_state is a snapshot of the 2nd word of the
 2452 * lock in question... we'll fake up a lock with the bits
 2453 * copied into place and carefully not access anything
 2454 * beyond what's defined in the second word of a lck_mtx_t
2455 */
2456 fake_lck.lck_mtx_state = prior_lock_state;
2457
2458 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
0a7de745 2459 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
b0d623f7 2460
6d2010ae 2461 if (__probable(fake_lck.lck_mtx_waiters)) {
d9a64523
A
2462 kern_return_t did_wake;
2463
0a7de745 2464 if (fake_lck.lck_mtx_waiters > 1) {
d9a64523 2465 did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
0a7de745 2466 } else {
d9a64523 2467 did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
0a7de745 2468 }
d9a64523
A
2469 /*
2470 * The waiters count always precisely matches the number of threads on the waitqueue.
 2471 * i.e. we should never see did_wake == KERN_NOT_WAITING.
2472 */
2473 assert(did_wake == KERN_SUCCESS);
6d2010ae 2474 }
b0d623f7 2475
d9a64523 2476 /* When lck_mtx_promoted was set, then I as the owner definitely have a promotion */
6d2010ae 2477 if (__improbable(fake_lck.lck_mtx_promoted)) {
d9a64523 2478 thread_t thread = current_thread();
b0d623f7 2479
d9a64523
A
2480 spl_t s = splsched();
2481 thread_lock(thread);
b0d623f7 2482
6d2010ae 2483 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
0a7de745 2484 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
d9a64523
A
2485 assert(thread->was_promoted_on_wakeup == 0);
2486 assert(thread->promotions > 0);
b0d623f7 2487
d9a64523 2488 assert_promotions_invariant(thread);
b0d623f7 2489
0a7de745 2490 if (--thread->promotions == 0) {
d9a64523 2491 sched_thread_unpromote(thread, trace_lck);
0a7de745 2492 }
b0d623f7 2493
d9a64523 2494 assert_promotions_invariant(thread);
b0d623f7 2495
d9a64523
A
2496 thread_unlock(thread);
2497 splx(s);
b0d623f7 2498 }
d9a64523 2499
6d2010ae 2500 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
0a7de745 2501 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2502
d9a64523
A
2503 lck_mtx_unlock_finish_inline(mutex, indirect);
2504}
b0d623f7
A
2505
2506/*
0a7de745 2507 * Routine: lck_mtx_lock_acquire_x86
b0d623f7
A
2508 *
2509 * Invoked on acquiring the mutex when there is
6d2010ae 2510 * contention (i.e. the assembly routine sees that
0a7de745 2511 * mutex->lck_mtx_waiters != 0 or
6d2010ae
A
2512 * thread->was_promoted_on_wakeup != 0)...
2513 *
2514 * mutex is owned... interlock is held... preemption is disabled
b0d623f7 2515 */
d9a64523
A
2516__attribute__((always_inline))
2517static void
2518lck_mtx_lock_acquire_inline(
0a7de745 2519 lck_mtx_t *mutex)
b0d623f7 2520{
0a7de745
A
2521 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2522 integer_t priority;
b0d623f7 2523
6d2010ae 2524 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
0a7de745 2525 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7 2526
0a7de745 2527 if (mutex->lck_mtx_waiters) {
6d2010ae 2528 priority = mutex->lck_mtx_pri;
0a7de745 2529 } else {
d9a64523 2530 priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
0a7de745 2531 }
d9a64523
A
2532 /* the priority must have been set correctly by wait */
2533 assert(priority <= MAXPRI_PROMOTE);
2534 assert(priority == 0 || priority >= BASEPRI_DEFAULT);
b0d623f7 2535
d9a64523
A
2536 /* if the mutex wasn't owned, then the owner wasn't promoted */
2537 assert(mutex->lck_mtx_promoted == 0);
b0d623f7 2538
d9a64523 2539 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
b0d623f7 2540
d9a64523
A
2541 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
2542 spl_t s = splsched();
6d2010ae 2543 thread_lock(thread);
b0d623f7 2544
0a7de745 2545 if (thread->was_promoted_on_wakeup) {
d9a64523 2546 assert(thread->promotions > 0);
0a7de745 2547 }
d9a64523
A
2548
2549 /* Intel only promotes if priority goes up */
2550 if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2551 /* Remember that I need to drop this promotion on unlock */
6d2010ae 2552 mutex->lck_mtx_promoted = 1;
d9a64523
A
2553
2554 if (thread->promotions++ == 0) {
2555 /* This is the first promotion for the owner */
2556 sched_thread_promote_to_pri(thread, priority, trace_lck);
2557 } else {
2558 /*
2559 * Holder was previously promoted due to a different mutex,
2560 * raise to match this one.
2561 * Or, this thread was promoted on wakeup but someone else
2562 * later contended on mutex at higher priority before we got here
2563 */
2564 sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2565 }
b0d623f7 2566 }
d9a64523
A
2567
2568 if (thread->was_promoted_on_wakeup) {
2569 thread->was_promoted_on_wakeup = 0;
0a7de745 2570 if (--thread->promotions == 0) {
d9a64523 2571 sched_thread_unpromote(thread, trace_lck);
0a7de745 2572 }
d9a64523
A
2573 }
2574
6d2010ae
A
2575 thread_unlock(thread);
2576 splx(s);
b0d623f7 2577 }
6d2010ae 2578 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
0a7de745 2579 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7
A
2580}
2581
d9a64523
A
2582void
2583lck_mtx_lock_acquire_x86(
0a7de745 2584 lck_mtx_t *mutex)
d9a64523
A
2585{
2586 return lck_mtx_lock_acquire_inline(mutex);
2587}
2588
2589/*
2590 * Tail call helpers for lock functions that perform
2591 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2592 * the caller's compiled code.
2593 */
b0d623f7 2594
d9a64523
A
2595__attribute__((noinline))
2596static void
2597lck_mtx_lock_acquire_tail(
0a7de745
A
2598 lck_mtx_t *mutex,
2599 boolean_t indirect)
d9a64523
A
2600{
2601 lck_mtx_lock_acquire_inline(mutex);
2602 lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2603}
2604
2605__attribute__((noinline))
2606static boolean_t
2607lck_mtx_try_lock_acquire_tail(
0a7de745 2608 lck_mtx_t *mutex)
d9a64523
A
2609{
2610 lck_mtx_lock_acquire_inline(mutex);
2611 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2612
2613 return TRUE;
2614}
2615
2616__attribute__((noinline))
2617static void
2618lck_mtx_convert_spin_acquire_tail(
0a7de745 2619 lck_mtx_t *mutex)
d9a64523
A
2620{
2621 lck_mtx_lock_acquire_inline(mutex);
2622 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2623}
2624
2625boolean_t
2626lck_mtx_ilk_unlock(
2627 lck_mtx_t *mutex)
2628{
2629 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2630 return TRUE;
2631}
2632
2633static inline void
2634lck_mtx_interlock_lock_set_and_clear_flags(
2635 lck_mtx_t *mutex,
2636 uint32_t xor_flags,
2637 uint32_t and_flags,
2638 uint32_t *new_state)
3e170ce0 2639{
d9a64523
A
2640 uint32_t state, prev;
2641 state = *new_state;
2642
0a7de745 2643 for (;;) {
d9a64523
A
2644 /* have to wait for interlock to clear */
2645 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2646 cpu_pause();
2647 state = ordered_load_mtx_state(mutex);
2648 }
2649 prev = state; /* prev contains snapshot for exchange */
2650 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
0a7de745 2651 state &= ~and_flags; /* clear flags */
d9a64523
A
2652
2653 disable_preemption();
0a7de745 2654 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
d9a64523 2655 break;
0a7de745 2656 }
d9a64523
A
2657 enable_preemption();
2658 cpu_pause();
2659 state = ordered_load_mtx_state(mutex);
2660 }
2661 *new_state = state;
2662 return;
2663}
2664
2665static inline void
2666lck_mtx_interlock_lock_clear_flags(
2667 lck_mtx_t *mutex,
2668 uint32_t and_flags,
2669 uint32_t *new_state)
2670{
2671 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2672}
2673
2674static inline void
2675lck_mtx_interlock_lock(
2676 lck_mtx_t *mutex,
2677 uint32_t *new_state)
2678{
2679 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2680}
2681
2682static inline int
2683lck_mtx_interlock_try_lock_set_flags(
2684 lck_mtx_t *mutex,
2685 uint32_t or_flags,
2686 uint32_t *new_state)
2687{
2688 uint32_t state, prev;
2689 state = *new_state;
2690
2691 /* have to wait for interlock to clear */
2692 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2693 return 0;
2694 }
0a7de745
A
2695 prev = state; /* prev contains snapshot for exchange */
2696 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
d9a64523
A
2697 disable_preemption();
2698 if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
0a7de745
A
2699 *new_state = state;
2700 return 1;
d9a64523
A
2701 }
2702
2703 enable_preemption();
2704 return 0;
2705}
2706
2707static inline int
2708lck_mtx_interlock_try_lock(
2709 lck_mtx_t *mutex,
2710 uint32_t *new_state)
2711{
2712 return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
2713}
2714
2715static inline int
2716lck_mtx_interlock_try_lock_disable_interrupts(
2717 lck_mtx_t *mutex,
2718 boolean_t *istate)
2719{
0a7de745 2720 uint32_t state;
3e170ce0
A
2721
2722 *istate = ml_set_interrupts_enabled(FALSE);
d9a64523 2723 state = ordered_load_mtx_state(mutex);
3e170ce0 2724
d9a64523
A
2725 if (lck_mtx_interlock_try_lock(mutex, &state)) {
2726 return 1;
2727 } else {
3e170ce0 2728 ml_set_interrupts_enabled(*istate);
d9a64523
A
2729 return 0;
2730 }
3e170ce0
A
2731}
2732
d9a64523
A
2733static inline void
2734lck_mtx_interlock_unlock_enable_interrupts(
2735 lck_mtx_t *mutex,
2736 boolean_t istate)
2737{
3e170ce0
A
2738 lck_mtx_ilk_unlock(mutex);
2739 ml_set_interrupts_enabled(istate);
2740}
2741
d9a64523
A
2742__attribute__((noinline))
2743static void
2744lck_mtx_lock_contended(
2745 lck_mtx_t *lock,
2746 boolean_t indirect,
2747 boolean_t *first_miss)
2748{
2749 lck_mtx_spinwait_ret_type_t ret;
2750 uint32_t state;
2751 thread_t thread;
2752
2753try_again:
2754
2755 if (indirect) {
0a7de745 2756 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523
A
2757 }
2758
2759 ret = lck_mtx_lock_spinwait_x86(lock);
2760 state = ordered_load_mtx_state(lock);
2761 switch (ret) {
2762 case LCK_MTX_SPINWAIT_NO_SPIN:
2763 /*
2764 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2765 * try to spin.
2766 */
2767 if (indirect) {
0a7de745 2768 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
d9a64523
A
2769 }
2770
0a7de745 2771 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
d9a64523
A
2772 case LCK_MTX_SPINWAIT_SPUN:
2773 /*
2774 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2775 * interlock not held
2776 */
2777 lck_mtx_interlock_lock(lock, &state);
2778 assert(state & LCK_MTX_ILOCKED_MSK);
2779
2780 if (state & LCK_MTX_MLOCKED_MSK) {
2781 if (indirect) {
0a7de745 2782 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523
A
2783 }
2784 lck_mtx_lock_wait_x86(lock);
2785 /*
2786 * interlock is not held here.
2787 */
2788 goto try_again;
2789 } else {
d9a64523
A
2790 /* grab the mutex */
2791 state |= LCK_MTX_MLOCKED_MSK;
2792 ordered_store_mtx_state_release(lock, state);
2793 thread = current_thread();
2794 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2795#if MACH_LDEBUG
2796 if (thread) {
2797 thread->mutex_count++;
2798 }
2799#endif /* MACH_LDEBUG */
2800 }
2801
2802 break;
2803 case LCK_MTX_SPINWAIT_ACQUIRED:
2804 /*
2805 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2806 * interlock is held and preemption disabled
2807 * owner is set and mutex marked as locked
2808 * statistics updated too
2809 */
2810 break;
2811 default:
2812 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2813 }
2814
2815 /*
2816 * interlock is already acquired here
2817 */
2818
2819 /* mutex has been acquired */
2820 thread = (thread_t)lock->lck_mtx_owner;
2821 if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
2822 return lck_mtx_lock_acquire_tail(lock, indirect);
2823 }
2824
2825 /* release the interlock */
2826 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2827}
2828
2829/*
2830 * Helper noinline functions for calling
2831 * panic to optimize compiled code.
2832 */
2833
2834__attribute__((noinline))
2835static void
2836lck_mtx_destroyed(
2837 lck_mtx_t *lock)
2838{
2839 panic("trying to interlock destroyed mutex (%p)", lock);
2840}
2841
2842__attribute__((noinline))
2843static boolean_t
2844lck_mtx_try_destroyed(
2845 lck_mtx_t *lock)
2846{
2847 panic("trying to interlock destroyed mutex (%p)", lock);
2848 return FALSE;
2849}
2850
2851__attribute__((always_inline))
2852static boolean_t
2853lck_mtx_lock_wait_interlock_to_clear(
2854 lck_mtx_t *lock,
2855 uint32_t* new_state)
2856{
2857 uint32_t state;
2858
0a7de745 2859 for (;;) {
d9a64523
A
2860 cpu_pause();
2861 state = ordered_load_mtx_state(lock);
2862 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2863 *new_state = state;
2864 return TRUE;
2865 }
2866 if (state & LCK_MTX_MLOCKED_MSK) {
2867 /* if it is held as mutex, just fail */
2868 return FALSE;
2869 }
2870 }
2871}
2872
2873__attribute__((always_inline))
2874static boolean_t
2875lck_mtx_try_lock_wait_interlock_to_clear(
2876 lck_mtx_t *lock,
2877 uint32_t* new_state)
2878{
2879 uint32_t state;
2880
0a7de745 2881 for (;;) {
d9a64523
A
2882 cpu_pause();
2883 state = ordered_load_mtx_state(lock);
2884 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2885 /* if it is held as mutex or spin, just fail */
2886 return FALSE;
2887 }
2888 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2889 *new_state = state;
2890 return TRUE;
2891 }
2892 }
2893}
2894
2895/*
2896 * Routine: lck_mtx_lock_slow
2897 *
2898 * Locks a mutex for current thread.
2899 * If the lock is contended this function might
2900 * sleep.
2901 *
2902 * Called with interlock not held.
2903 */
2904__attribute__((noinline))
2905void
2906lck_mtx_lock_slow(
2907 lck_mtx_t *lock)
2908{
0a7de745
A
2909 boolean_t indirect = FALSE;
2910 uint32_t state;
2911 int first_miss = 0;
d9a64523
A
2912
2913 state = ordered_load_mtx_state(lock);
2914
2915 /* is the interlock or mutex held */
2916 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2917 /*
2918 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2919 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2920 * set in state (state == lck_mtx_tag)
2921 */
2922
2923
2924 /* is the mutex already held and not indirect */
0a7de745 2925 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
2926 /* no, must have been the mutex */
2927 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2928 }
2929
2930 /* check to see if it is marked destroyed */
2931 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2932 return lck_mtx_destroyed(lock);
2933 }
2934
2935 /* Is this an indirect mutex? */
2936 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2937 indirect = get_indirect_mutex(&lock, &state);
2938
2939 first_miss = 0;
0a7de745 2940 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
2941
2942 if (state & LCK_MTX_SPIN_MSK) {
0a7de745 2943 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
d9a64523 2944 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 2945 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
2946 }
2947 }
2948
2949 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2950 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2951 }
2952 }
2953
2954 /* no - can't be INDIRECT, DESTROYED or locked */
2955 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2956 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2957 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2958 }
2959 }
2960
2961 /* lock and interlock acquired */
2962
2963 thread_t thread = current_thread();
2964 /* record owner of mutex */
2965 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2966
2967#if MACH_LDEBUG
2968 if (thread) {
0a7de745 2969 thread->mutex_count++; /* lock statistic */
d9a64523
A
2970 }
2971#endif
2972 /*
2973 * Check if there are waiters to
2974 * inherit their priority.
2975 */
2976 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2977 return lck_mtx_lock_acquire_tail(lock, indirect);
2978 }
2979
2980 /* release the interlock */
2981 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2982
2983 return;
2984}
2985
2986__attribute__((noinline))
2987boolean_t
2988lck_mtx_try_lock_slow(
2989 lck_mtx_t *lock)
2990{
2991 boolean_t indirect = FALSE;
2992 uint32_t state;
2993 int first_miss = 0;
2994
2995 state = ordered_load_mtx_state(lock);
2996
2997 /* is the interlock or mutex held */
2998 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2999 /*
3000 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3001 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3002 * set in state (state == lck_mtx_tag)
3003 */
3004
3005 /* is the mutex already held and not indirect */
0a7de745 3006 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3007 return FALSE;
3008 }
3009
3010 /* check to see if it is marked destroyed */
3011 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3012 return lck_mtx_try_destroyed(lock);
3013 }
3014
3015 /* Is this an indirect mutex? */
3016 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3017 indirect = get_indirect_mutex(&lock, &state);
3018
3019 first_miss = 0;
0a7de745 3020 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3021 }
3022
3023 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
0a7de745
A
3024 if (indirect) {
3025 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3026 }
d9a64523
A
3027 return FALSE;
3028 }
3029 }
3030
3031 /* no - can't be INDIRECT, DESTROYED or locked */
3032 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3033 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
0a7de745
A
3034 if (indirect) {
3035 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3036 }
d9a64523
A
3037 return FALSE;
3038 }
3039 }
3040
3041 /* lock and interlock acquired */
3042
3043 thread_t thread = current_thread();
3044 /* record owner of mutex */
3045 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3046
3047#if MACH_LDEBUG
3048 if (thread) {
0a7de745 3049 thread->mutex_count++; /* lock statistic */
d9a64523
A
3050 }
3051#endif
3052 /*
3053 * Check if there are waiters to
3054 * inherit their priority.
3055 */
3056 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3057 return lck_mtx_try_lock_acquire_tail(lock);
3058 }
3059
3060 /* release the interlock */
3061 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3062
3063 return TRUE;
d9a64523
A
3064}
3065
3066__attribute__((noinline))
3067void
3068lck_mtx_lock_spin_slow(
0a7de745 3069 lck_mtx_t *lock)
d9a64523
A
3070{
3071 boolean_t indirect = FALSE;
3072 uint32_t state;
3073 int first_miss = 0;
3074
3075 state = ordered_load_mtx_state(lock);
3076
3077 /* is the interlock or mutex held */
3078 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3079 /*
3080 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3081 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3082 * set in state (state == lck_mtx_tag)
3083 */
3084
3085
3086 /* is the mutex already held and not indirect */
0a7de745 3087 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3088 /* no, must have been the mutex */
3089 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3090 }
3091
3092 /* check to see if it is marked destroyed */
3093 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3094 return lck_mtx_destroyed(lock);
3095 }
3096
3097 /* Is this an indirect mutex? */
3098 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3099 indirect = get_indirect_mutex(&lock, &state);
3100
3101 first_miss = 0;
0a7de745 3102 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3103
3104 if (state & LCK_MTX_SPIN_MSK) {
0a7de745 3105 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
d9a64523 3106 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 3107 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
3108 }
3109 }
3110
3111 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3112 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
0a7de745 3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
d9a64523
A
3118 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3119 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3120 }
3121 }
3122
3123 /* lock as spinlock and interlock acquired */
3124
3125 thread_t thread = current_thread();
3126 /* record owner of mutex */
3127 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3128
3129#if MACH_LDEBUG
3130 if (thread) {
3131 thread->mutex_count++; /* lock statistic */
3132 }
3133#endif
3134
0a7de745 3135#if CONFIG_DTRACE
d9a64523
A
3136 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3137#endif
3138 /* return with the interlock held and preemption disabled */
3139 return;
3140}
3141
3142__attribute__((noinline))
3143boolean_t
3144lck_mtx_try_lock_spin_slow(
3145 lck_mtx_t *lock)
3146{
3147 boolean_t indirect = FALSE;
3148 uint32_t state;
3149 int first_miss = 0;
3150
3151 state = ordered_load_mtx_state(lock);
3152
3153 /* is the interlock or mutex held */
3154 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3155 /*
3156 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3157 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3158 * set in state (state == lck_mtx_tag)
3159 */
3160
3161 /* is the mutex already held and not indirect */
0a7de745 3162 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3163 return FALSE;
3164 }
3165
3166 /* check to see if it is marked destroyed */
3167 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3168 return lck_mtx_try_destroyed(lock);
3169 }
3170
3171 /* Is this an indirect mutex? */
3172 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3173 indirect = get_indirect_mutex(&lock, &state);
3174
3175 first_miss = 0;
0a7de745 3176 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3177 }
3178
3179 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
0a7de745
A
3180 if (indirect) {
3181 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3182 }
d9a64523
A
3183 return FALSE;
3184 }
3185 }
3186
3187 /* no - can't be INDIRECT, DESTROYED or locked */
3188 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3189 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
0a7de745
A
3190 if (indirect) {
3191 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3192 }
d9a64523
A
3193 return FALSE;
3194 }
3195 }
3196
3197 /* lock and interlock acquired */
3198
3199 thread_t thread = current_thread();
3200 /* record owner of mutex */
3201 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3202
3203#if MACH_LDEBUG
3204 if (thread) {
0a7de745 3205 thread->mutex_count++; /* lock statistic */
d9a64523
A
3206 }
3207#endif
3208
3209#if CONFIG_DTRACE
3210 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3211#endif
3212 return TRUE;
d9a64523
A
3213}
3214
3215__attribute__((noinline))
3216void
3217lck_mtx_convert_spin(
0a7de745 3218 lck_mtx_t *lock)
d9a64523
A
3219{
3220 uint32_t state;
3221
3222 state = ordered_load_mtx_state(lock);
3223
3224 /* Is this an indirect mutex? */
3225 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3226 /* If so, take indirection */
3227 get_indirect_mutex(&lock, &state);
3228 }
3229
3230 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3231
3232 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3233 /* already owned as a mutex, just return */
3234 return;
3235 }
3236
3237 assert(get_preemption_level() > 0);
3238 assert(state & LCK_MTX_ILOCKED_MSK);
3239 assert(state & LCK_MTX_SPIN_MSK);
3240
3241 /*
3242 * Check if there are waiters to
3243 * inherit their priority.
3244 */
3245 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3246 return lck_mtx_convert_spin_acquire_tail(lock);
3247 }
3248
3249 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3250
3251 return;
3252}
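/*
 * Illustrative sketch (hypothetical spin_then_sleep_example/mtx/must_block
 * names, not part of this file): take the mutex in spin mode for a short
 * look at the protected state, and convert it in place to a full mutex
 * only when the work turns out to need to block.
 */
#if 0	/* sketch, not built */
static void
spin_then_sleep_example(lck_mtx_t *mtx, boolean_t must_block)
{
	lck_mtx_lock_spin(mtx);		/* interlock held, preemption disabled */

	if (must_block) {
		lck_mtx_convert_spin(mtx);	/* now safe to sleep while holding it */
		/* ... work that may block ... */
	}
	lck_mtx_unlock(mtx);		/* handles both the spin and mutex cases */
}
#endif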
3253
3254static inline boolean_t
3255lck_mtx_lock_grab_mutex(
0a7de745 3256 lck_mtx_t *lock)
d9a64523
A
3257{
3258 uint32_t state;
3259
3260 state = ordered_load_mtx_state(lock);
3261
3262 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3263 return FALSE;
3264 }
3265
3266 /* lock and interlock acquired */
3267
3268 thread_t thread = current_thread();
3269 /* record owner of mutex */
3270 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3271
3272#if MACH_LDEBUG
3273 if (thread) {
0a7de745 3274 thread->mutex_count++; /* lock statistic */
d9a64523
A
3275 }
3276#endif
3277 return TRUE;
3278}
3279
3280__attribute__((noinline))
3281void
3282lck_mtx_assert(
0a7de745
A
3283 lck_mtx_t *lock,
3284 unsigned int type)
d9a64523
A
3285{
3286 thread_t thread, owner;
3287 uint32_t state;
3288
3289 thread = current_thread();
3290 state = ordered_load_mtx_state(lock);
3291
3292 if (state == LCK_MTX_TAG_INDIRECT) {
3293 get_indirect_mutex(&lock, &state);
3294 }
3295
3296 owner = (thread_t)lock->lck_mtx_owner;
3297
3298 if (type == LCK_MTX_ASSERT_OWNED) {
0a7de745 3299 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
d9a64523 3300 panic("mutex (%p) not owned\n", lock);
0a7de745 3301 }
d9a64523 3302 } else {
0a7de745
A
3303 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3304 if (owner == thread) {
d9a64523 3305 panic("mutex (%p) owned\n", lock);
0a7de745 3306 }
d9a64523
A
3307 }
3308}
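/*
 * Illustrative sketch (hypothetical assert_example/mtx names, not part of
 * this file): callers and callees can document their locking contract
 * cheaply with lck_mtx_assert(); the OWNED form panics unless the current
 * thread holds the mutex, the NOTOWNED form panics if it does.
 */
#if 0	/* sketch, not built */
static void
assert_example(lck_mtx_t *mtx)
{
	lck_mtx_lock(mtx);
	lck_mtx_assert(mtx, LCK_MTX_ASSERT_OWNED);
	/* ... code that relies on the mutex being held ... */
	lck_mtx_unlock(mtx);
	lck_mtx_assert(mtx, LCK_MTX_ASSERT_NOTOWNED);
}
#endif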
b0d623f7 3309
91447636 3310/*
0a7de745 3311 * Routine: lck_mtx_lock_spinwait_x86
0c530ab8
A
3312 *
3313 * Invoked trying to acquire a mutex when there is contention but
3314 * the holder is running on another processor. We spin for up to a maximum
3315 * time waiting for the lock to be released.
3316 *
3317 * Called with the interlock unlocked.
d9a64523
A
3318 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3319 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3320 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
0c530ab8 3321 */
d9a64523
A
3322__attribute__((noinline))
3323lck_mtx_spinwait_ret_type_t
b0d623f7 3324lck_mtx_lock_spinwait_x86(
0a7de745 3325 lck_mtx_t *mutex)
0c530ab8 3326{
0a7de745
A
3327 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3328 thread_t holder;
3329 uint64_t overall_deadline;
3330 uint64_t check_owner_deadline;
3331 uint64_t cur_time;
3332 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3333 int loopcount = 0;
0c530ab8 3334
6d2010ae 3335 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
0a7de745 3336 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
0c530ab8 3337
3e170ce0
A
3338 cur_time = mach_absolute_time();
3339 overall_deadline = cur_time + MutexSpin;
3340 check_owner_deadline = cur_time;
b0d623f7 3341
0c530ab8
A
3342 /*
3343 * Spin while:
3344 * - mutex is locked, and
b0d623f7 3345 * - it's locked as a spin lock, and
0c530ab8 3346 * - owner is running on another processor, and
2d21ac55 3347 * - owner (processor) is not idling, and
0c530ab8
A
3348 * - we haven't spun for long enough.
3349 */
b0d623f7 3350 do {
6d2010ae 3351 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
d9a64523 3352 retval = LCK_MTX_SPINWAIT_ACQUIRED;
b0d623f7 3353 break;
2d21ac55 3354 }
3e170ce0 3355 cur_time = mach_absolute_time();
b0d623f7 3356
0a7de745 3357 if (cur_time >= overall_deadline) {
3e170ce0 3358 break;
0a7de745 3359 }
3e170ce0
A
3360
3361 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
0a7de745 3362 boolean_t istate;
3e170ce0 3363
d9a64523
A
3364 /*
3365 * We will repeatedly peek at the state of the lock while spinning,
3366 * and we will acquire the interlock to do so.
3367 * The thread that will unlock the mutex will also need to acquire
 3368 * the interlock, and we want to avoid slowing it down.
 3369 * To avoid taking an interrupt while holding the interlock,
 3370 * which would increase the time we hold it, we
 3371 * will try to acquire the interlock with interrupts disabled.
 3372 * This is safe because it is a "try_lock": if we can't acquire
 3373 * the interlock we re-enable interrupts and fail, so it is
 3374 * ok to call it even if the interlock was already held.
0a7de745 3375 */
d9a64523 3376 if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3e170ce0 3377 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
0a7de745
A
3378 if (!(holder->machine.specFlags & OnProc) ||
3379 (holder->state & TH_IDLE)) {
d9a64523 3380 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3e170ce0 3381
0a7de745 3382 if (loopcount == 0) {
d9a64523 3383 retval = LCK_MTX_SPINWAIT_NO_SPIN;
0a7de745 3384 }
3e170ce0
A
3385 break;
3386 }
3387 }
d9a64523 3388 lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3e170ce0
A
3389
3390 check_owner_deadline = cur_time + (MutexSpin / 4);
b0d623f7
A
3391 }
3392 }
3393 cpu_pause();
3394
3395 loopcount++;
3e170ce0 3396 } while (TRUE);
b0d623f7 3397
0a7de745 3398#if CONFIG_DTRACE
2d21ac55 3399 /*
3e170ce0 3400 * We've already kept a count via overall_deadline of how long we spun.
2d21ac55
A
3401 * If dtrace is active, then we compute backwards to decide how
3402 * long we spun.
3403 *
3404 * Note that we record a different probe id depending on whether
0a7de745 3405 * this is a direct or indirect mutex. This allows us to
2d21ac55
A
3406 * penalize only lock groups that have debug/stats enabled
3407 * with dtrace processing if desired.
3408 */
6d2010ae 3409 if (__probable(mutex->lck_mtx_is_ext == 0)) {
b0d623f7 3410 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
0a7de745 3411 mach_absolute_time() - (overall_deadline - MutexSpin));
2d21ac55 3412 } else {
b0d623f7 3413 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
0a7de745 3414 mach_absolute_time() - (overall_deadline - MutexSpin));
2d21ac55
A
3415 }
3416 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3417#endif
b0d623f7 3418
6d2010ae 3419 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
0a7de745 3420 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
b0d623f7
A
3421
3422 return retval;
0c530ab8
A
3423}
3424
b0d623f7
A
3425
3426
0c530ab8 3427/*
0a7de745 3428 * Routine: lck_mtx_lock_wait_x86
b0d623f7
A
3429 *
3430 * Invoked in order to wait on contention.
3431 *
3432 * Called with the interlock locked and
d9a64523 3433 * preemption disabled...
6d2010ae 3434 * returns it unlocked and with preemption enabled
d9a64523
A
3435 *
3436 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3437 * A runnable waiter can exist between wait and acquire
3438 * without a waiters count being set.
3439 * This allows us to never make a spurious wakeup call.
3440 *
3441 * Priority:
3442 * This avoids taking the thread lock if the owning thread is the same priority.
3443 * This optimizes the case of same-priority threads contending on a lock.
3444 * However, that allows the owning thread to drop in priority while holding the lock,
3445 * because there is no state that the priority change can notice that
3446 * says that the targeted thread holds a contended mutex.
3447 *
3448 * One possible solution: priority changes could look for some atomic tag
3449 * on the thread saying 'holding contended lock', and then set up a promotion.
3450 * Needs a story for dropping that promotion - the last contended unlock
3451 * has to notice that this has happened.
0c530ab8 3452 */
d9a64523 3453__attribute__((noinline))
0c530ab8 3454void
0a7de745
A
3455lck_mtx_lock_wait_x86(
3456 lck_mtx_t *mutex)
0c530ab8 3457{
0a7de745 3458#if CONFIG_DTRACE
d9a64523 3459 uint64_t sleep_start = 0;
b0d623f7
A
3460
3461 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3462 sleep_start = mach_absolute_time();
3463 }
3464#endif
d9a64523
A
3465 thread_t self = current_thread();
3466 assert(self->waiting_for_mutex == NULL);
3467
3468 self->waiting_for_mutex = mutex;
3469
3470 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3471
6d2010ae 3472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
0a7de745
A
3473 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3474 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7 3475
d9a64523
A
3476 integer_t waiter_pri = self->sched_pri;
3477 waiter_pri = MAX(waiter_pri, self->base_pri);
3478 waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
3479 waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
b0d623f7 3480
d9a64523 3481 assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
b0d623f7 3482
d9a64523 3483 /* Set lck_mtx_pri on first contention, or raise it to this waiter's ceiling (see the sketch after this function) */
0a7de745 3484 if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri) {
d9a64523 3485 mutex->lck_mtx_pri = waiter_pri;
0a7de745 3486 }
39236c6e 3487
d9a64523 3488 thread_t holder = (thread_t)mutex->lck_mtx_owner;
b0d623f7 3489
d9a64523
A
3490 assert(holder != NULL);
3491
3492 /*
3493 * On Intel we only apply a promotion when the holder's priority actually
3494 * needs to change, which reduces thread-lock hold time but leaves us
3495 * vulnerable to the holder dropping priority.
3496 */
3497 if (holder->sched_pri < mutex->lck_mtx_pri) {
3498 int promote_pri = mutex->lck_mtx_pri;
3499
3500 spl_t s = splsched();
b0d623f7
A
3501 thread_lock(holder);
3502
d9a64523
A
3503 /* Check again in case sched_pri changed */
3504 if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
b0d623f7 3505 if (mutex->lck_mtx_promoted == 0) {
d9a64523 3506 /* This is the first promotion for this mutex */
b0d623f7 3507 mutex->lck_mtx_promoted = 1;
d9a64523
A
3508
3509 if (holder->promotions++ == 0) {
3510 /* This is the first promotion for holder */
3511 sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3512 } else {
3513 /*
3514 * Holder was previously promoted due to a different mutex;
3515 * check whether its promotion needs to be raised to match this one.
3516 */
3517 sched_thread_update_promotion_to_pri(holder, promote_pri,
0a7de745 3518 trace_lck);
d9a64523
A
3519 }
3520 } else {
3521 /*
3522 * Holder was previously promoted due to this mutex;
3523 * check whether the promotion priority needs to go up.
3524 */
3525 sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
b0d623f7
A
3526 }
3527 }
d9a64523 3528
b0d623f7
A
3529 thread_unlock(holder);
3530 splx(s);
3531 }
d9a64523
A
3532
3533 mutex->lck_mtx_waiters++;
3534
813fb2f6 3535 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
d9a64523 3536 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
b0d623f7
A
3537
3538 lck_mtx_ilk_unlock(mutex);
3539
3540 thread_block(THREAD_CONTINUE_NULL);
3541
d9a64523
A
3542 self->waiting_for_mutex = NULL;
3543
6d2010ae 3544 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
0a7de745
A
3545 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3546 mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7 3547
0a7de745 3548#if CONFIG_DTRACE
b0d623f7
A
3549 /*
3550 * Record the DTrace lockstat probe for blocking; the block time is
3551 * measured from when this function was entered.
3552 */
3553 if (sleep_start) {
6d2010ae 3554 if (mutex->lck_mtx_is_ext == 0) {
b0d623f7
A
3555 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3556 mach_absolute_time() - sleep_start);
3557 } else {
3558 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3559 mach_absolute_time() - sleep_start);
3560 }
3561 }
3562#endif
0c530ab8 3563}
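
/*
 * Illustrative sketch (not part of the original source): the waiter-priority
 * ceiling computed in lck_mtx_lock_wait_x86() above, and the test that decides
 * whether the holder needs a promotion.  The helper names are hypothetical;
 * BASEPRI_DEFAULT and MAXPRI_PROMOTE are the kernel's own constants, used
 * exactly as in the function above.
 */
static inline integer_t
example_waiter_pri_ceiling(thread_t waiter)
{
	integer_t pri = MAX(waiter->sched_pri, waiter->base_pri);	/* strongest of the two */
	pri = MAX(pri, BASEPRI_DEFAULT);	/* don't track contention below the default band */
	return MIN(pri, MAXPRI_PROMOTE);	/* never promote past the promotion ceiling */
}

static inline boolean_t
example_holder_needs_promotion(thread_t holder, integer_t waiter_pri_ceiling)
{
	/* a promotion is only set up when the holder actually runs below the waiters */
	return holder->sched_pri < waiter_pri_ceiling;
}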
3e170ce0
A
3564
3565/*
3566 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3567 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3568 * Returns: TRUE if lock is acquired.
3569 */
3570boolean_t
0a7de745 3571kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3e170ce0
A
3572{
3573 if (not_in_kdp) {
3574 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3575 }
3576
39037602 3577 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3e170ce0
A
3578 return TRUE;
3579 }
3580
3581 return FALSE;
3582}
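
/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * debugger-side caller of the predicate above.  Like the predicate itself it
 * must only run while the kernel debugger is active (not_in_kdp == 0), because
 * the interlock/mutex bits are read without any synchronization.
 */
static boolean_t
example_kdp_mutex_is_held(lck_mtx_t *example_lock)
{
	assert(!not_in_kdp);	/* debugger context only */
	return kdp_lck_mtx_lock_spin_is_acquired(example_lock);
}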
3583
813fb2f6
A
3584void
3585kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3586{
3587 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3588 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3589 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3590 waitinfo->owner = thread_tid(holder);
3591}
3592
3593void
3594kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3595{
3596 lck_rw_t *rwlck = NULL;
0a7de745
A
3597 switch (waitinfo->wait_type) {
3598 case kThreadWaitKernelRWLockRead:
3599 rwlck = READ_EVENT_TO_RWLOCK(event);
3600 break;
3601 case kThreadWaitKernelRWLockWrite:
3602 case kThreadWaitKernelRWLockUpgrade:
3603 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3604 break;
3605 default:
3606 panic("%s was called with an invalid blocking type", __FUNCTION__);
3607 break;
813fb2f6
A
3608 }
3609 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3610 waitinfo->owner = 0;
3611}