[apple/xnu.git] / osfmk / i386 / locks_i386.c (xnu-7195.101.1)
91447636 1/*
f427ee49 2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
91447636 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
0a7de745 5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
0a7de745 14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
0a7de745 17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
0a7de745 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
0a7de745 31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
0a7de745 35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
0a7de745 41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
0a7de745 45 *
91447636 46 * Carnegie Mellon requests users of this software to return to
0a7de745 47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
0a7de745 52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64#define LOCK_PRIVATE 1
65
66#include <mach_ldebug.h>
67
0a7de745 68#include <kern/lock_stat.h>
91447636 69#include <kern/locks.h>
f427ee49 70#include <kern/zalloc.h>
71#include <kern/misc_protos.h>
72#include <kern/thread.h>
73#include <kern/processor.h>
74#include <kern/cpu_data.h>
75#include <kern/cpu_number.h>
76#include <kern/sched_prim.h>
77#include <kern/debug.h>
78#include <string.h>
79
060df5ea 80#include <i386/machine_routines.h> /* machine_timeout_suspended() */
5ba3f43e 81#include <machine/atomic.h>
b0d623f7 82#include <machine/machine_cpu.h>
060df5ea 83#include <i386/mp.h>
d9a64523 84#include <machine/atomic.h>
91447636 85#include <sys/kdebug.h>
d9a64523 86#include <i386/locks_i386_inlines.h>
87#include <kern/cpu_number.h>
88#include <os/hash.h>
91447636 89
90#if CONFIG_DTRACE
91#define DTRACE_RW_SHARED 0x0 //reader
92#define DTRACE_RW_EXCL 0x1 //writer
93#define DTRACE_NO_FLAG 0x0 //not applicable
0a7de745 94#endif /* CONFIG_DTRACE */
2d21ac55 95
96#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
97#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
98#define LCK_RW_LCK_SHARED_CODE 0x102
99#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
100#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
101#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
91447636 102
103#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
104#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
105#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
106#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
107#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
108#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
109#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
110#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
b0d623f7 111
91447636 112
ea3f0419 113#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
91447636 114
115/* Forwards */
116
ea3f0419 117#if USLOCK_DEBUG
118/*
119 * Perform simple lock checks.
120 */
121int uslock_check = 1;
122int max_lock_loops = 100000000;
123decl_simple_lock_data(extern, printf_lock);
124decl_simple_lock_data(extern, panic_lock);
125#endif /* USLOCK_DEBUG */
91447636 126
fe8ab488 127extern unsigned int not_in_kdp;
91447636 128
129#if !LOCK_STATS
130#define usimple_lock_nopreempt(lck, grp) \
131 usimple_lock_nopreempt(lck)
132#define usimple_lock_try_nopreempt(lck, grp) \
133 usimple_lock_try_nopreempt(lck)
134#endif
135static void usimple_lock_nopreempt(usimple_lock_t, lck_grp_t *);
136static unsigned int usimple_lock_try_nopreempt(usimple_lock_t, lck_grp_t *);
137
138/*
139 * We often want to know the addresses of the callers
140 * of the various lock routines. However, this information
141 * is only used for debugging and statistics.
142 */
143typedef void *pc_t;
144#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
145#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
146#if ANY_LOCK_DEBUG
147#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
148#define DECL_PC(pc) pc_t pc;
149#else /* ANY_LOCK_DEBUG */
91447636 150#define DECL_PC(pc)
ea3f0419 151#ifdef lint
152/*
153 * Eliminate lint complaints about unused local pc variables.
154 */
155#define OBTAIN_PC(pc) ++pc
156#else /* lint */
157#define OBTAIN_PC(pc)
158#endif /* lint */
159#endif /* USLOCK_DEBUG */
91447636 160
161ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
162 KHEAP_ID_DEFAULT, sizeof(lck_spin_t));
163
164ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
165 KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));
166
167ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
168 KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));
169
170ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
171 KHEAP_ID_DEFAULT, sizeof(lck_rw_t));
172
173/*
174 * atomic exchange API is a low level abstraction of the operations
175 * to atomically read, modify, and write a pointer. This abstraction works
176 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
177 * well as the ARM exclusive instructions.
178 *
179 * atomic_exchange_begin() - begin exchange and retrieve current value
180 * atomic_exchange_complete() - conclude an exchange
181 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
182 */
183static uint32_t
184atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
185{
ea3f0419 186 uint32_t val;
5ba3f43e 187
ea3f0419 188 (void)ord; // Memory order not used
cb323159 189 val = os_atomic_load(target, relaxed);
190 *previous = val;
191 return val;
192}
193
194static boolean_t
195atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
196{
197 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
198}
199
200static void
201atomic_exchange_abort(void)
202{
203}
204
205static boolean_t
206atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
207{
ea3f0419 208 uint32_t value, prev;
5ba3f43e 209
ea3f0419 210 for (;;) {
211 value = atomic_exchange_begin32(target, &prev, ord);
212 if (value & test_mask) {
ea3f0419 213 if (wait) {
5ba3f43e 214 cpu_pause();
ea3f0419 215 } else {
5ba3f43e 216 atomic_exchange_abort();
ea3f0419 217 }
218 return FALSE;
219 }
220 value |= set_mask;
ea3f0419 221 if (atomic_exchange_complete32(target, prev, value, ord)) {
5ba3f43e 222 return TRUE;
ea3f0419 223 }
224 }
225}
91447636 226
227inline boolean_t
228hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
229{
230 return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
231}
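
/*
 * Example (illustrative sketch, kept under #if 0 and not built): how code in
 * this file uses the atomic_exchange_begin32/complete32/abort API described
 * above.  The flag mask and function name are hypothetical.
 */
#if 0
#define LCK_EXAMPLE_FLAG 0x1

static boolean_t
example_set_flag_if_clear(uint32_t *lockword)
{
	uint32_t value, prev;

	for (;;) {
		/* snapshot the current value; prev is what the exchange below must match */
		value = atomic_exchange_begin32(lockword, &prev, memory_order_relaxed);
		if (value & LCK_EXAMPLE_FLAG) {
			atomic_exchange_abort();        /* give up the pending exchange */
			return FALSE;
		}
		value |= LCK_EXAMPLE_FLAG;
		/* retry if another CPU modified the word after atomic_exchange_begin32() */
		if (atomic_exchange_complete32(lockword, prev, value, memory_order_relaxed)) {
			return TRUE;
		}
	}
}
#endif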
232
233/*
234 * Portable lock package implementation of usimple_locks.
235 */
236
237#if USLOCK_DEBUG
238#define USLDBG(stmt) stmt
239void usld_lock_init(usimple_lock_t, unsigned short);
240void usld_lock_pre(usimple_lock_t, pc_t);
241void usld_lock_post(usimple_lock_t, pc_t);
242void usld_unlock(usimple_lock_t, pc_t);
243void usld_lock_try_pre(usimple_lock_t, pc_t);
244void usld_lock_try_post(usimple_lock_t, pc_t);
245int usld_lock_common_checks(usimple_lock_t, char *);
246#else /* USLOCK_DEBUG */
247#define USLDBG(stmt)
248#endif /* USLOCK_DEBUG */
91447636 249
250/*
251 * Forward definitions
252 */
253
254static void lck_rw_lock_shared_gen(lck_rw_t *lck);
255static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
256static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
257static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
258static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
259static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
39236c6e 260void lck_rw_clear_promotions_x86(thread_t thread);
261static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
262static boolean_t lck_rw_grab_want(lck_rw_t *lock);
263static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
cb323159 264static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
265static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
266static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
267static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
268static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
269static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
270
39236c6e 271
272/*
273 * Routine: lck_spin_alloc_init
274 */
275lck_spin_t *
276lck_spin_alloc_init(
277 lck_grp_t *grp,
278 lck_attr_t *attr)
91447636 279{
f427ee49 280 lck_spin_t *lck;
91447636 281
282 lck = zalloc(ZV_LCK_SPIN);
283 lck_spin_init(lck, grp, attr);
ea3f0419 284 return lck;
285}
286
287/*
288 * Routine: lck_spin_free
289 */
290void
291lck_spin_free(
292 lck_spin_t *lck,
293 lck_grp_t *grp)
294{
295 lck_spin_destroy(lck, grp);
f427ee49 296 zfree(ZV_LCK_SPIN, lck);
297}
298
299/*
300 * Routine: lck_spin_init
301 */
302void
303lck_spin_init(
304 lck_spin_t *lck,
305 lck_grp_t *grp,
306 __unused lck_attr_t *attr)
307{
308 usimple_lock_init((usimple_lock_t) lck, 0);
309 if (grp) {
310 lck_grp_reference(grp);
311 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
312 }
313}
314
315/*
316 * Routine: lck_spin_destroy
317 */
318void
319lck_spin_destroy(
320 lck_spin_t *lck,
321 lck_grp_t *grp)
91447636 322{
ea3f0419 323 if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
91447636 324 return;
ea3f0419 325 }
b0d623f7 326 lck->interlock = LCK_SPIN_TAG_DESTROYED;
327 if (grp) {
328 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
329 lck_grp_deallocate(grp);
330 }
331 return;
332}
333
334/*
335 * Routine: lck_spin_lock
336 */
337void
338lck_spin_lock_grp(
339 lck_spin_t *lck,
340 lck_grp_t *grp)
341{
342#pragma unused(grp)
343 usimple_lock((usimple_lock_t) lck, grp);
344}
345
346void
347lck_spin_lock(
ea3f0419 348 lck_spin_t *lck)
91447636 349{
0a7de745 350 usimple_lock((usimple_lock_t) lck, NULL);
351}
352
353void
354lck_spin_lock_nopreempt(
355 lck_spin_t *lck)
356{
357 usimple_lock_nopreempt((usimple_lock_t) lck, NULL);
358}
359
360void
361lck_spin_lock_nopreempt_grp(
362 lck_spin_t *lck,
363 lck_grp_t *grp)
364{
365#pragma unused(grp)
366 usimple_lock_nopreempt((usimple_lock_t) lck, grp);
367}
368
369/*
370 * Routine: lck_spin_unlock
371 */
372void
373lck_spin_unlock(
ea3f0419 374 lck_spin_t *lck)
375{
376 usimple_unlock((usimple_lock_t) lck);
377}
378
379void
380lck_spin_unlock_nopreempt(
381 lck_spin_t *lck)
382{
383 usimple_unlock_nopreempt((usimple_lock_t) lck);
384}
385
386boolean_t
387lck_spin_try_lock_grp(
388 lck_spin_t *lck,
389 lck_grp_t *grp)
390{
391#pragma unused(grp)
392 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
ea3f0419 393#if DEVELOPMENT || DEBUG
394 if (lrval) {
395 pltrace(FALSE);
396 }
397#endif
ea3f0419 398 return lrval;
399}
400
401
402/*
403 * Routine: lck_spin_try_lock
404 */
405boolean_t
406lck_spin_try_lock(
ea3f0419 407 lck_spin_t *lck)
91447636 408{
0a7de745 409 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
ea3f0419 410#if DEVELOPMENT || DEBUG
411 if (lrval) {
412 pltrace(FALSE);
413 }
414#endif
ea3f0419 415 return lrval;
416}
417
418int
419lck_spin_try_lock_nopreempt(
420 lck_spin_t *lck)
421{
422 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, LCK_GRP_NULL);
423#if DEVELOPMENT || DEBUG
424 if (lrval) {
425 pltrace(FALSE);
426 }
427#endif
428 return lrval;
429}
430
431int
432lck_spin_try_lock_nopreempt_grp(
433 lck_spin_t *lck,
434 lck_grp_t *grp)
435{
436#pragma unused(grp)
437 boolean_t lrval = (boolean_t)usimple_lock_try_nopreempt((usimple_lock_t) lck, grp);
438#if DEVELOPMENT || DEBUG
439 if (lrval) {
440 pltrace(FALSE);
441 }
442#endif
443 return lrval;
444}
445
446/*
447 * Routine: lck_spin_assert
448 */
449void
450lck_spin_assert(lck_spin_t *lock, unsigned int type)
451{
452 thread_t thread, holder;
453 uintptr_t state;
454
455 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
456 panic("lck_spin_assert(): invalid arg (%u)", type);
457 }
458
459 state = lock->interlock;
460 holder = (thread_t)state;
461 thread = current_thread();
462 if (type == LCK_ASSERT_OWNED) {
463 if (__improbable(holder == THREAD_NULL)) {
464 panic("Lock not owned %p = %lx", lock, state);
465 }
466 if (__improbable(holder != thread)) {
467 panic("Lock not owned by current thread %p = %lx", lock, state);
468 }
469 } else if (type == LCK_ASSERT_NOTOWNED) {
470 if (__improbable(holder != THREAD_NULL)) {
471 if (holder == thread) {
472 panic("Lock owned by current thread %p = %lx", lock, state);
473 }
474 }
475 }
476}
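
/*
 * Example (illustrative sketch, not built): typical lck_spin life cycle using
 * the routines above.  The group and counter names are hypothetical; the group
 * is assumed to have been created elsewhere (e.g. with lck_grp_alloc_init()).
 */
#if 0
static lck_grp_t *example_grp;
static int example_counter;

static void
example_spin_usage(void)
{
	lck_spin_t *lock = lck_spin_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_spin_lock_grp(lock, example_grp);           /* returns with preemption disabled */
	example_counter++;
	lck_spin_assert(lock, LCK_ASSERT_OWNED);        /* sanity check while held */
	lck_spin_unlock(lock);

	lck_spin_free(lock, example_grp);               /* destroys and frees the lock */
}
#endif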
477
fe8ab488 478/*
3e170ce0 479 * Routine: kdp_lck_spin_is_acquired
480 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
481 * Returns: TRUE if lock is acquired.
482 */
483boolean_t
484kdp_lck_spin_is_acquired(lck_spin_t *lck)
485{
486 if (not_in_kdp) {
487 panic("panic: spinlock acquired check done outside of kernel debugger");
488 }
489 return (lck->interlock != 0)? TRUE : FALSE;
490}
491
492/*
493 * Initialize a usimple_lock.
494 *
495 * No change in preemption state.
496 */
497void
498usimple_lock_init(
499 usimple_lock_t l,
500 __unused unsigned short tag)
91447636 501{
502 USLDBG(usld_lock_init(l, tag));
503 hw_lock_init(&l->interlock);
504}
505
506volatile uint32_t spinlock_owner_cpu = ~0;
507volatile usimple_lock_t spinlock_timed_out;
508
509uint32_t
510spinlock_timeout_NMI(uintptr_t thread_addr)
511{
512 uint32_t i;
513
514 for (i = 0; i < real_ncpus; i++) {
a39ff7e2 515 if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
060df5ea 516 spinlock_owner_cpu = i;
517 if ((uint32_t) cpu_number() != i) {
518 /* Cause NMI and panic on the owner's cpu */
519 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
520 }
521 break;
522 }
523 }
524
525 return spinlock_owner_cpu;
526}
91447636 527
528__abortlike
529static void
530usimple_lock_acquire_timeout_panic(usimple_lock_t l)
531{
532 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
533 uint32_t lock_cpu;
534
535 spinlock_timed_out = l; /* spinlock_timeout_NMI consumes this */
536 lock_cpu = spinlock_timeout_NMI(lowner);
537 panic("Spinlock acquisition timed out: lock=%p, "
538 "lock owner thread=0x%lx, current_thread: %p, "
539 "lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
540 l, lowner, current_thread(), lock_cpu,
541 (uintptr_t)l->interlock.lock_data, mach_absolute_time());
542}
543
544/*
545 * Acquire a usimple_lock.
546 *
547 * Returns with preemption disabled. Note
548 * that the hw_lock routines are responsible for
549 * maintaining preemption state.
550 */
551void
0a7de745 552(usimple_lock)(
ea3f0419 553 usimple_lock_t l
0a7de745 554 LCK_GRP_ARG(lck_grp_t *grp))
91447636 555{
2d21ac55 556 DECL_PC(pc);
91447636 557
b0d623f7 558 OBTAIN_PC(pc);
91447636 559 USLDBG(usld_lock_pre(l, pc));
6d2010ae 560
561 while (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
562 if (!machine_timeout_suspended()) {
563 usimple_lock_acquire_timeout_panic(l);
6d2010ae 564 }
565 enable_preemption();
566 }
567
568#if DEVELOPMENT || DEBUG
569 pltrace(FALSE);
570#endif
571
572 USLDBG(usld_lock_post(l, pc));
573#if CONFIG_DTRACE
574 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
575#endif
576}
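
/*
 * Example (illustrative sketch, not built): the basic usimple_lock life cycle
 * described above.  The lock name is hypothetical; whether the lck_grp_t
 * argument is present depends on the LCK_GRP_ARG configuration.
 */
#if 0
static usimple_lock_data_t example_usl;

static void
example_usimple_usage(void)
{
	usimple_lock_init(&example_usl, 0);
	usimple_lock(&example_usl, NULL);       /* spins (with timeout panic) until owned; preemption disabled */
	/* ... short critical section ... */
	usimple_unlock(&example_usl);           /* re-enables preemption */
}
#endif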
577
578/*
579 * Acquire a usimple_lock_nopreempt
580 *
581 * Called and returns with preemption disabled. Note
582 * that the hw_lock routines are responsible for
583 * maintaining preemption state.
584 */
585static void
586usimple_lock_nopreempt(
587 usimple_lock_t l,
588 lck_grp_t *grp)
589{
590 DECL_PC(pc);
591
592 OBTAIN_PC(pc);
593 USLDBG(usld_lock_pre(l, pc));
6d2010ae 594
595 while (__improbable(hw_lock_to_nopreempt(&l->interlock, LockTimeOutTSC, grp) == 0)) {
596 if (!machine_timeout_suspended()) {
597 usimple_lock_acquire_timeout_panic(l);
060df5ea 598 }
c3c9b80d 599 enable_preemption();
b0d623f7 600 }
c3c9b80d 601
39037602 602#if DEVELOPMENT || DEBUG
ea3f0419 603 pltrace(FALSE);
604#endif
605
91447636 606 USLDBG(usld_lock_post(l, pc));
5ba3f43e 607#if CONFIG_DTRACE
0a7de745 608 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
5ba3f43e 609#endif
610}
611
612
613/*
614 * Release a usimple_lock.
615 *
616 * Returns with preemption enabled. Note
617 * that the hw_lock routines are responsible for
618 * maintaining preemption state.
619 */
620void
621usimple_unlock(
ea3f0419 622 usimple_lock_t l)
91447636 623{
624 DECL_PC(pc);
625
b0d623f7 626 OBTAIN_PC(pc);
91447636 627 USLDBG(usld_unlock(l, pc));
39037602 628#if DEVELOPMENT || DEBUG
ea3f0419 629 pltrace(TRUE);
39037602 630#endif
91447636 631 hw_lock_unlock(&l->interlock);
632}
633
634/*
635 * Release a usimple_unlock_nopreempt.
636 *
637 * Called and returns with preemption enabled. Note
638 * that the hw_lock routines are responsible for
639 * maintaining preemption state.
640 */
641void
642usimple_unlock_nopreempt(
643 usimple_lock_t l)
644{
645 DECL_PC(pc);
646
647 OBTAIN_PC(pc);
648 USLDBG(usld_unlock(l, pc));
649#if DEVELOPMENT || DEBUG
650 pltrace(TRUE);
651#endif
652 hw_lock_unlock_nopreempt(&l->interlock);
653}
654
655/*
656 * Conditionally acquire a usimple_lock.
657 *
658 * On success, returns with preemption disabled.
659 * On failure, returns with preemption in the same state
660 * as when first invoked. Note that the hw_lock routines
661 * are responsible for maintaining preemption state.
662 *
663 * XXX No stats are gathered on a miss; I preserved this
664 * behavior from the original assembly-language code, but
665 * doesn't it make sense to log misses? XXX
666 */
667unsigned int
668usimple_lock_try(
ea3f0419 669 usimple_lock_t l,
0a7de745 670 lck_grp_t *grp)
91447636 671{
ea3f0419 672 unsigned int success;
2d21ac55 673 DECL_PC(pc);
91447636 674
b0d623f7 675 OBTAIN_PC(pc);
91447636 676 USLDBG(usld_lock_try_pre(l, pc));
0a7de745 677 if ((success = hw_lock_try(&l->interlock, grp))) {
678#if DEVELOPMENT || DEBUG
679 pltrace(FALSE);
680#endif
ea3f0419 681 USLDBG(usld_lock_try_post(l, pc));
682 }
683 return success;
684}
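
/*
 * Example (illustrative sketch, not built): the conditional-acquire pattern for
 * usimple_lock_try().  On failure the lock is not held and must not be unlocked.
 */
#if 0
static void
example_try_usage(usimple_lock_t l)
{
	if (usimple_lock_try(l, LCK_GRP_NULL)) {
		/* ... short critical section, preemption disabled ... */
		usimple_unlock(l);
	} else {
		/* lock was busy; preemption state is unchanged */
	}
}
#endif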
685
686/*
687 * Conditionally acquire a usimple_lock.
688 *
689 * Called and returns with preemption disabled. Note
690 * that the hw_lock routines are responsible for
691 * maintaining preemption state.
692 *
693 * XXX No stats are gathered on a miss; I preserved this
694 * behavior from the original assembly-language code, but
695 * doesn't it make sense to log misses? XXX
696 */
697static unsigned int
698usimple_lock_try_nopreempt(
699 usimple_lock_t l,
700 lck_grp_t *grp)
701{
702 unsigned int success;
703 DECL_PC(pc);
704
705 OBTAIN_PC(pc);
706 USLDBG(usld_lock_try_pre(l, pc));
707 if ((success = hw_lock_try_nopreempt(&l->interlock, grp))) {
708#if DEVELOPMENT || DEBUG
709 pltrace(FALSE);
91447636 710#endif
711 USLDBG(usld_lock_try_post(l, pc));
712 }
713 return success;
714}
715
39037602 716/*
cb323159 717 * Acquire a usimple_lock while polling for pending cpu signals
718 * and spinning on a lock.
719 *
720 */
721unsigned
722int
cb323159 723(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
724 uint64_t deadline
725 LCK_GRP_ARG(lck_grp_t *grp))
726{
727 boolean_t istate = ml_get_interrupts_enabled();
728
729 if (deadline < mach_absolute_time()) {
730 return 0;
731 }
732
0a7de745 733 while (!simple_lock_try(l, grp)) {
ea3f0419 734 if (!istate) {
cb323159 735 cpu_signal_handler(NULL);
736 }
737
738 if (deadline < mach_absolute_time()) {
739 return 0;
0a7de745 740 }
cb323159 741
742 cpu_pause();
743 }
744
745 return 1;
746}
747
748void
749(usimple_lock_try_lock_loop)(usimple_lock_t l
ea3f0419 750 LCK_GRP_ARG(lck_grp_t *grp))
cb323159 751{
752 /* When the lock is not contended, grab the lock and go. */
753 if (!simple_lock_try(l, grp)) {
754 usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
755 }
756}
757
758unsigned
759int
cb323159 760(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
761 uint64_t duration
762 LCK_GRP_ARG(lck_grp_t *grp))
763{
764 uint64_t deadline;
f427ee49 765 uint64_t base_at;
766 uint64_t duration_at;
767
768 /* Fast track for uncontended locks */
769 if (simple_lock_try(l, grp)) {
770 return 1;
771 }
772
773 base_at = mach_absolute_time();
774
775 nanoseconds_to_absolutetime(duration, &duration_at);
776 deadline = base_at + duration_at;
777 if (deadline < base_at) {
778 /* deadline has overflowed, make it saturate */
779 deadline = ULLONG_MAX;
780 }
781
782 return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
783}
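
/*
 * Example (illustrative sketch, not built): bounded acquisition using the two
 * loop variants above; the duration form takes nanoseconds, the deadline form
 * an absolute mach time.  Function name and numbers are hypothetical.
 */
#if 0
static void
example_bounded_spin(usimple_lock_t l)
{
	/* spin for at most ~10ms */
	if (usimple_lock_try_lock_mp_signal_safe_loop_duration(l, 10 * NSEC_PER_MSEC, LCK_GRP_NULL)) {
		usimple_unlock(l);
	}

	/* or spin until an absolute deadline */
	if (usimple_lock_try_lock_mp_signal_safe_loop_deadline(l,
	    mach_absolute_time() + 1000, LCK_GRP_NULL)) {
		usimple_unlock(l);
	}
}
#endif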
784
ea3f0419 785#if USLOCK_DEBUG
786/*
787 * States of a usimple_lock. The default when initializing
788 * a usimple_lock is setting it up for debug checking.
789 */
790#define USLOCK_CHECKED 0x0001 /* lock is being checked */
791#define USLOCK_TAKEN 0x0002 /* lock has been taken */
792#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
793#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
794#define USLOCK_CHECKING(l) (uslock_check && \
795 ((l)->debug.state & USLOCK_CHECKED))
796
797/*
798 * Initialize the debugging information contained
799 * in a usimple_lock.
800 */
801void
802usld_lock_init(
803 usimple_lock_t l,
804 __unused unsigned short tag)
91447636 805{
ea3f0419 806 if (l == USIMPLE_LOCK_NULL) {
91447636 807 panic("lock initialization: null lock pointer");
ea3f0419 808 }
809 l->lock_type = USLOCK_TAG;
810 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
811 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
812 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
813 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
814 l->debug.duration[0] = l->debug.duration[1] = 0;
815 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
816 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
817 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
818}
819
820
821/*
822 * These checks apply to all usimple_locks, not just
823 * those with USLOCK_CHECKED turned on.
824 */
825int
826usld_lock_common_checks(
827 usimple_lock_t l,
828 char *caller)
91447636 829{
ea3f0419 830 if (l == USIMPLE_LOCK_NULL) {
91447636 831 panic("%s: null lock pointer", caller);
832 }
833 if (l->lock_type != USLOCK_TAG) {
ebb1b9f4 834 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
835 }
836 if (!(l->debug.state & USLOCK_INIT)) {
ebb1b9f4 837 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
ea3f0419 838 }
839 return USLOCK_CHECKING(l);
840}
841
842
843/*
844 * Debug checks on a usimple_lock just before attempting
845 * to acquire it.
846 */
847/* ARGSUSED */
848void
849usld_lock_pre(
850 usimple_lock_t l,
851 pc_t pc)
91447636 852{
ea3f0419 853 char caller[] = "usimple_lock";
854
855
ea3f0419 856 if (!usld_lock_common_checks(l, caller)) {
91447636 857 return;
ea3f0419 858 }
859
860/*
861 * Note that we have a weird case where we are getting a lock when we are
862 * in the process of putting the system to sleep. We are running with no
863 * current threads, therefore we can't tell if we are trying to retake a lock
864 * we have or someone on the other processor has it. Therefore we just
865 * ignore this test if the locking thread is 0.
866 */
867
868 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
869 l->debug.lock_thread == (void *) current_thread()) {
2d21ac55 870 printf("%s: lock %p already locked (at %p) by",
ea3f0419 871 caller, l, l->debug.lock_pc);
2d21ac55 872 printf(" current thread %p (new attempt at pc %p)\n",
ea3f0419 873 l->debug.lock_thread, pc);
2d21ac55 874 panic("%s", caller);
875 }
876 mp_disable_preemption();
877 mp_enable_preemption();
878}
879
880
881/*
882 * Debug checks on a usimple_lock just after acquiring it.
883 *
884 * Pre-emption has been disabled at this point,
885 * so we are safe in using cpu_number.
886 */
887void
888usld_lock_post(
889 usimple_lock_t l,
890 pc_t pc)
91447636 891{
f427ee49 892 unsigned int mycpu;
ea3f0419 893 char caller[] = "successful usimple_lock";
894
895
ea3f0419 896 if (!usld_lock_common_checks(l, caller)) {
91447636 897 return;
ea3f0419 898 }
91447636 899
ea3f0419 900 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
b0d623f7 901 panic("%s: lock %p became uninitialized",
902 caller, l);
903 }
904 if ((l->debug.state & USLOCK_TAKEN)) {
b0d623f7 905 panic("%s: lock 0x%p became TAKEN by someone else",
906 caller, l);
907 }
91447636 908
909 mycpu = (unsigned int)cpu_number();
910 assert(mycpu <= UCHAR_MAX);
911
912 l->debug.lock_thread = (void *)current_thread();
913 l->debug.state |= USLOCK_TAKEN;
914 l->debug.lock_pc = pc;
f427ee49 915 l->debug.lock_cpu = (unsigned char)mycpu;
916}
917
918
919/*
920 * Debug checks on a usimple_lock just before
921 * releasing it. Note that the caller has not
922 * yet released the hardware lock.
923 *
924 * Preemption is still disabled, so there's
925 * no problem using cpu_number.
926 */
927void
928usld_unlock(
929 usimple_lock_t l,
930 pc_t pc)
91447636 931{
f427ee49 932 unsigned int mycpu;
ea3f0419 933 char caller[] = "usimple_unlock";
934
935
ea3f0419 936 if (!usld_lock_common_checks(l, caller)) {
91447636 937 return;
ea3f0419 938 }
939
940 mycpu = cpu_number();
f427ee49 941 assert(mycpu <= UCHAR_MAX);
91447636 942
ea3f0419 943 if (!(l->debug.state & USLOCK_TAKEN)) {
b0d623f7 944 panic("%s: lock 0x%p hasn't been taken",
945 caller, l);
946 }
947 if (l->debug.lock_thread != (void *) current_thread()) {
b0d623f7 948 panic("%s: unlocking lock 0x%p, owned by thread %p",
949 caller, l, l->debug.lock_thread);
950 }
91447636 951 if (l->debug.lock_cpu != mycpu) {
b0d623f7 952 printf("%s: unlocking lock 0x%p on cpu 0x%x",
ea3f0419 953 caller, l, mycpu);
91447636 954 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
2d21ac55 955 panic("%s", caller);
91447636 956 }
957
958 l->debug.unlock_thread = l->debug.lock_thread;
959 l->debug.lock_thread = INVALID_THREAD;
960 l->debug.state &= ~USLOCK_TAKEN;
961 l->debug.unlock_pc = pc;
f427ee49 962 l->debug.unlock_cpu = (unsigned char)mycpu;
963}
964
965
966/*
967 * Debug checks on a usimple_lock just before
968 * attempting to acquire it.
969 *
970 * Preemption isn't guaranteed to be disabled.
971 */
972void
973usld_lock_try_pre(
974 usimple_lock_t l,
975 __unused pc_t pc)
91447636 976{
ea3f0419 977 char caller[] = "usimple_lock_try";
91447636 978
ea3f0419 979 if (!usld_lock_common_checks(l, caller)) {
91447636 980 return;
ea3f0419 981 }
982}
983
984
985/*
986 * Debug checks on a usimple_lock just after
987 * successfully attempting to acquire it.
988 *
989 * Preemption has been disabled by the
990 * lock acquisition attempt, so it's safe
991 * to use cpu_number.
992 */
993void
994usld_lock_try_post(
995 usimple_lock_t l,
996 pc_t pc)
91447636 997{
f427ee49 998 unsigned int mycpu;
ea3f0419 999 char caller[] = "successful usimple_lock_try";
91447636 1000
ea3f0419 1001 if (!usld_lock_common_checks(l, caller)) {
91447636 1002 return;
ea3f0419 1003 }
91447636 1004
ea3f0419 1005 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
b0d623f7 1006 panic("%s: lock 0x%p became uninitialized",
1007 caller, l);
1008 }
1009 if ((l->debug.state & USLOCK_TAKEN)) {
b0d623f7 1010 panic("%s: lock 0x%p became TAKEN by someone else",
1011 caller, l);
1012 }
1013
1014 mycpu = cpu_number();
1015 assert(mycpu <= UCHAR_MAX);
1016
1017 l->debug.lock_thread = (void *) current_thread();
1018 l->debug.state |= USLOCK_TAKEN;
1019 l->debug.lock_pc = pc;
f427ee49 1020 l->debug.lock_cpu = (unsigned char)mycpu;
91447636 1021}
ea3f0419 1022#endif /* USLOCK_DEBUG */
91447636 1023
1024/*
1025 * Routine: lck_rw_alloc_init
1026 */
1027lck_rw_t *
1028lck_rw_alloc_init(
1029 lck_grp_t *grp,
1030 lck_attr_t *attr)
1031{
f427ee49 1032 lck_rw_t *lck;
b0d623f7 1033
1034 lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
1035 lck_rw_init(lck, grp, attr);
ea3f0419 1036 return lck;
1037}
1038
1039/*
1040 * Routine: lck_rw_free
1041 */
1042void
1043lck_rw_free(
1044 lck_rw_t *lck,
1045 lck_grp_t *grp)
1046{
91447636 1047 lck_rw_destroy(lck, grp);
f427ee49 1048 zfree(ZV_LCK_RW, lck);
1049}
1050
1051/*
1052 * Routine: lck_rw_init
1053 */
1054void
1055lck_rw_init(
1056 lck_rw_t *lck,
1057 lck_grp_t *grp,
1058 lck_attr_t *attr)
0c530ab8 1059{
1060 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
1061 attr : &LockDefaultLckAttr;
91447636 1062
1063 hw_lock_byte_init(&lck->lck_rw_interlock);
1064 lck->lck_rw_want_write = FALSE;
1065 lck->lck_rw_want_upgrade = FALSE;
1066 lck->lck_rw_shared_count = 0;
1067 lck->lck_rw_can_sleep = TRUE;
b0d623f7 1068 lck->lck_r_waiting = lck->lck_w_waiting = 0;
91447636 1069 lck->lck_rw_tag = 0;
2d21ac55 1070 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
ea3f0419 1071 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
1072
1073 lck_grp_reference(grp);
1074 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
1075}
1076
1077/*
1078 * Routine: lck_rw_destroy
1079 */
1080void
1081lck_rw_destroy(
1082 lck_rw_t *lck,
1083 lck_grp_t *grp)
b0d623f7 1084{
ea3f0419 1085 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
91447636 1086 return;
ea3f0419 1087 }
1088#if MACH_LDEBUG
1089 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
1090#endif
1091 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
1092 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
1093 lck_grp_deallocate(grp);
1094 return;
1095}
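
/*
 * Example (illustrative sketch, not built): typical lck_rw life cycle using the
 * allocation, locking and teardown routines in this file.  The caller-supplied
 * group is assumed to exist already.
 */
#if 0
static void
example_rw_usage(lck_grp_t *grp)
{
	lck_rw_t *lck = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(lck);                /* many readers may hold this concurrently */
	/* ... read-only work ... */
	lck_rw_unlock_shared(lck);

	lck_rw_lock_exclusive(lck);             /* single writer */
	/* ... mutating work ... */
	lck_rw_unlock_exclusive(lck);

	lck_rw_free(lck, grp);                  /* lck_rw_free() also destroys the lock */
}
#endif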
1096
1097/*
1098 * Sleep locks. These use the same data structure and algorithm
1099 * as the spin locks, but the process sleeps while it is waiting
1100 * for the lock. These work on uniprocessor systems.
1101 */
1102
1103#define DECREMENTER_TIMEOUT 1000000
1104
91447636 1105/*
1106 * We disable interrupts while holding the RW interlock to prevent an
1107 * interrupt from exacerbating hold time.
1108 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
1109 */
5ba3f43e 1110static inline boolean_t
1111lck_interlock_lock(lck_rw_t *lck)
1112{
ea3f0419 1113 boolean_t istate;
91447636 1114
0a7de745 1115 istate = ml_set_interrupts_enabled(FALSE);
2d21ac55 1116 hw_lock_byte_lock(&lck->lck_rw_interlock);
1117 return istate;
1118}
1119
5ba3f43e 1120static inline void
91447636 1121lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
0a7de745 1122{
2d21ac55 1123 hw_lock_byte_unlock(&lck->lck_rw_interlock);
1124 ml_set_interrupts_enabled(istate);
1125}
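
/*
 * Example (illustrative sketch, not built): the interlock pattern the wait paths
 * below rely on - interrupts stay off for the (short) time the interlock is held.
 */
#if 0
static void
example_interlock_usage(lck_rw_t *lck)
{
	boolean_t istate;

	istate = lck_interlock_lock(lck);       /* interrupts disabled + interlock byte held */
	/* ... inspect/adjust lck_rw_t fields atomically with respect to other CPUs ... */
	lck_interlock_unlock(lck, istate);      /* restores the previous interrupt state */
}
#endif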
1126
1127/*
1128 * This inline is used when busy-waiting for an rw lock.
1129 * If interrupts were disabled when the lock primitive was called,
1130 * we poll the IPI handler for pending tlb flushes.
1131 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
1132 */
1133static inline void
1134lck_rw_lock_pause(boolean_t interrupts_enabled)
1135{
ea3f0419 1136 if (!interrupts_enabled) {
0c530ab8 1137 handle_pending_TLB_flushes();
ea3f0419 1138 }
1139 cpu_pause();
1140}
1141
1142static inline boolean_t
1143lck_rw_held_read_or_upgrade(lck_rw_t *lock)
1144{
ea3f0419 1145 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
5ba3f43e 1146 return TRUE;
ea3f0419 1147 }
1148 return FALSE;
1149}
1150
1151/*
1152 * compute the deadline to spin against when
1153 * waiting for a change of state on a lck_rw_t
1154 */
1155static inline uint64_t
1156lck_rw_deadline_for_spin(lck_rw_t *lck)
1157{
1158 if (lck->lck_rw_can_sleep) {
1159 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
1160 /*
1161 * there are already threads waiting on this lock... this
ea3f0419 1162 * implies that they have spun beyond their deadlines waiting for
1163 * the desired state to show up so we will not bother spinning at this time...
1164 * or
1165 * the current number of threads sharing this lock exceeds our capacity to run them
1166 * concurrently and since all states we're going to spin for require the rw_shared_count
1167 * to be at 0, we'll not bother spinning since the latency for this to happen is
1168 * unpredictable...
1169 */
ea3f0419 1170 return mach_absolute_time();
b0d623f7 1171 }
1172 return mach_absolute_time() + MutexSpin;
1173 } else {
1174 return mach_absolute_time() + (100000LL * 1000000000LL);
1175 }
1176}
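
/*
 * Example (illustrative sketch, not built): paraphrase of how the acquisition
 * paths below consume this deadline - spin until it expires, then fall back to
 * blocking.  cpu_pause() stands in for lck_rw_lock_pause() for brevity.
 */
#if 0
static boolean_t
example_exclusive_spin_phase(lck_rw_t *lck)
{
	uint64_t deadline = lck_rw_deadline_for_spin(lck);
	boolean_t gotlock;

	while (((gotlock = lck_rw_grab_want(lck)) == 0) &&
	    mach_absolute_time() < deadline) {
		cpu_pause();
	}
	/* FALSE means the deadline passed; the real code then blocks on the interlock */
	return gotlock;
}
#endif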
1177
1178
1179/*
1180 * Spin while interlock is held.
1181 */
1182
1183static inline void
1184lck_rw_interlock_spin(lck_rw_t *lock)
1185{
1186 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
1187 cpu_pause();
1188 }
1189}
1190
1191static boolean_t
1192lck_rw_grab_want(lck_rw_t *lock)
1193{
ea3f0419 1194 uint32_t data, prev;
5ba3f43e 1195
ea3f0419 1196 for (;;) {
5ba3f43e 1197 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
ea3f0419 1198 if ((data & LCK_RW_INTERLOCK) == 0) {
5ba3f43e 1199 break;
ea3f0419 1200 }
1201 atomic_exchange_abort();
1202 lck_rw_interlock_spin(lock);
1203 }
1204 if (data & LCK_RW_WANT_WRITE) {
1205 atomic_exchange_abort();
1206 return FALSE;
1207 }
1208 data |= LCK_RW_WANT_WRITE;
1209 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1210}
1211
1212static boolean_t
1213lck_rw_grab_shared(lck_rw_t *lock)
1214{
ea3f0419 1215 uint32_t data, prev;
5ba3f43e 1216
ea3f0419 1217 for (;;) {
5ba3f43e 1218 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
ea3f0419 1219 if ((data & LCK_RW_INTERLOCK) == 0) {
5ba3f43e 1220 break;
ea3f0419 1221 }
1222 atomic_exchange_abort();
1223 lck_rw_interlock_spin(lock);
1224 }
1225 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1226 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1227 atomic_exchange_abort();
1228 return FALSE;
1229 }
1230 }
1231 data += LCK_RW_SHARED_READER;
1232 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1233}
1234
1235/*
1236 * Routine: lck_rw_lock_exclusive
1237 */
5ba3f43e 1238static void
b0d623f7 1239lck_rw_lock_exclusive_gen(
ea3f0419 1240 lck_rw_t *lck)
91447636 1241{
1242 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1243 uint64_t deadline = 0;
1244 int slept = 0;
1245 int gotlock = 0;
1246 int lockheld = 0;
1247 wait_result_t res = 0;
1248 boolean_t istate = -1;
91447636 1249
ea3f0419 1250#if CONFIG_DTRACE
b0d623f7 1251 boolean_t dtrace_ls_initialized = FALSE;
ea3f0419 1252 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
1253 uint64_t wait_interval = 0;
1254 int readers_at_sleep = 0;
2d21ac55 1255#endif
91447636 1256
91447636 1257 /*
2d21ac55 1258 * Try to acquire the lck_rw_want_write bit.
91447636 1259 */
1260 while (!lck_rw_grab_want(lck)) {
1261#if CONFIG_DTRACE
1262 if (dtrace_ls_initialized == FALSE) {
1263 dtrace_ls_initialized = TRUE;
1264 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1265 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1266 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1267 if (dtrace_ls_enabled) {
1268 /*
1269 * Either sleeping or spinning is happening,
1270 * start a timing of our delay interval now.
1271 */
1272 readers_at_sleep = lck->lck_rw_shared_count;
1273 wait_interval = mach_absolute_time();
1274 }
91447636 1275 }
2d21ac55 1276#endif
ea3f0419 1277 if (istate == -1) {
b0d623f7 1278 istate = ml_get_interrupts_enabled();
ea3f0419 1279 }
91447636 1280
1281 deadline = lck_rw_deadline_for_spin(lck);
1282
3e170ce0 1283 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
0a7de745 1284
ea3f0419 1285 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
b0d623f7 1286 lck_rw_lock_pause(istate);
ea3f0419 1287 }
b0d623f7 1288
3e170ce0 1289 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
b0d623f7 1290
ea3f0419 1291 if (gotlock) {
b0d623f7 1292 break;
ea3f0419 1293 }
1294 /*
1295 * if we get here, the deadline has expired w/o us
1296 * being able to grab the lock exclusively
1297 * check to see if we're allowed to do a thread_block
1298 */
1299 if (lck->lck_rw_can_sleep) {
91447636 1300 istate = lck_interlock_lock(lck);
91447636 1301
b0d623f7 1302 if (lck->lck_rw_want_write) {
3e170ce0 1303 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
91447636 1304
b0d623f7 1305 lck->lck_w_waiting = TRUE;
91447636 1306
813fb2f6 1307 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1308 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
ea3f0419 1309 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
b0d623f7 1310 lck_interlock_unlock(lck, istate);
91447636 1311
1312 if (res == THREAD_WAITING) {
1313 res = thread_block(THREAD_CONTINUE_NULL);
1314 slept++;
1315 }
3e170ce0 1316 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1317 } else {
1318 lck->lck_rw_want_write = TRUE;
1319 lck_interlock_unlock(lck, istate);
1320 break;
1321 }
1322 }
1323 }
1324 /*
1325 * Wait for readers (and upgrades) to finish...
1326 * the test for these conditions must be done simultaneously with
1327 * a check of the interlock not being held since
1328 * the rw_shared_count will drop to 0 first and then want_upgrade
1329 * will be set to 1 in the shared_to_exclusive scenario... those
1330 * adjustments are done behind the interlock and represent an
1331 * atomic change in state and must be considered as such
1332 * however, once we see the read count at 0, the want_upgrade not set
1333 * and the interlock not held, we are safe to proceed
1334 */
1335 while (lck_rw_held_read_or_upgrade(lck)) {
ea3f0419 1336#if CONFIG_DTRACE
1337 /*
1338 * Either sleeping or spinning is happening, start
1339 * a timing of our delay interval now. If we set it
1340 * to -1 we don't have accurate data so we cannot later
1341 * decide to record a dtrace spin or sleep event.
1342 */
1343 if (dtrace_ls_initialized == FALSE) {
1344 dtrace_ls_initialized = TRUE;
1345 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1346 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1347 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1348 if (dtrace_ls_enabled) {
1349 /*
1350 * Either sleeping or spinning is happening,
1351 * start a timing of our delay interval now.
1352 */
1353 readers_at_sleep = lck->lck_rw_shared_count;
1354 wait_interval = mach_absolute_time();
1355 }
1356 }
1357#endif
ea3f0419 1358 if (istate == -1) {
b0d623f7 1359 istate = ml_get_interrupts_enabled();
ea3f0419 1360 }
1361
1362 deadline = lck_rw_deadline_for_spin(lck);
1363
3e170ce0 1364 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7 1365
ea3f0419 1366 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
b0d623f7 1367 lck_rw_lock_pause(istate);
ea3f0419 1368 }
b0d623f7 1369
3e170ce0 1370 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
b0d623f7 1371
ea3f0419 1372 if (!lockheld) {
b0d623f7 1373 break;
ea3f0419 1374 }
1375 /*
1376 * if we get here, the deadline has expired w/o us
1377 * being able to grab the lock exclusively
1378 * check to see if we're allowed to do a thread_block
1379 */
1380 if (lck->lck_rw_can_sleep) {
91447636 1381 istate = lck_interlock_lock(lck);
91447636 1382
b0d623f7 1383 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
3e170ce0 1384 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1385
1386 lck->lck_w_waiting = TRUE;
1387
813fb2f6 1388 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
d9a64523 1389 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
ea3f0419 1390 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1391 lck_interlock_unlock(lck, istate);
1392
1393 if (res == THREAD_WAITING) {
1394 res = thread_block(THREAD_CONTINUE_NULL);
1395 slept++;
1396 }
3e170ce0 1397 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1398 } else {
1399 lck_interlock_unlock(lck, istate);
1400 /*
1401 * must own the lock now, since we checked for
1402 * readers or upgrade owner behind the interlock
1403 * no need for a call to 'lck_rw_held_read_or_upgrade'
1404 */
1405 break;
1406 }
1407 }
1408 }
1409
ea3f0419 1410#if CONFIG_DTRACE
1411 /*
1412 * Decide what latencies we suffered that are Dtrace events.
1413 * If we have set wait_interval, then we either spun or slept.
1414 * At least we get out from under the interlock before we record
1415 * which is the best we can do here to minimize the impact
1416 * of the tracing.
1417 * If we have set wait_interval to -1, then dtrace was not enabled when we
1418 * started sleeping/spinning so we don't record this event.
1419 */
b0d623f7 1420 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1421 if (slept == 0) {
0a7de745 1422 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1423 mach_absolute_time() - wait_interval, 1);
1424 } else {
1425 /*
1426 * For the blocking case, we also record if when we blocked
1427 * it was held for read or write, and how many readers.
1428 * Notice that above we recorded this before we dropped
1429 * the interlock so the count is accurate.
1430 */
0a7de745 1431 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1432 mach_absolute_time() - wait_interval, 1,
1433 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1434 }
1435 }
1436 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1437#endif
1438}
1439
1440/*
1441 * Routine: lck_rw_done
1442 */
1443
1444lck_rw_type_t
1445lck_rw_done(lck_rw_t *lock)
5ba3f43e 1446{
ea3f0419 1447 uint32_t data, prev;
5ba3f43e 1448
ea3f0419 1449 for (;;) {
5ba3f43e 1450 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
ea3f0419 1451 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1452 atomic_exchange_abort();
1453 lck_rw_interlock_spin(lock);
1454 continue;
1455 }
1456 if (data & LCK_RW_SHARED_MASK) {
1457 data -= LCK_RW_SHARED_READER;
ea3f0419 1458 if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
5ba3f43e 1459 goto check_waiters;
1460 }
1461 } else { /* if reader count == 0, must be exclusive lock */
1462 if (data & LCK_RW_WANT_UPGRADE) {
1463 data &= ~(LCK_RW_WANT_UPGRADE);
1464 } else {
ea3f0419 1465 if (data & LCK_RW_WANT_WRITE) {
5ba3f43e 1466 data &= ~(LCK_RW_WANT_EXCL);
ea3f0419 1467 } else { /* lock is not 'owned', panic */
5ba3f43e 1468 panic("Releasing non-exclusive RW lock without a reader refcount!");
ea3f0419 1469 }
1470 }
1471check_waiters:
1472 if (prev & LCK_RW_W_WAITING) {
1473 data &= ~(LCK_RW_W_WAITING);
ea3f0419 1474 if ((prev & LCK_RW_PRIV_EXCL) == 0) {
5ba3f43e 1475 data &= ~(LCK_RW_R_WAITING);
1476 }
1477 } else {
5ba3f43e 1478 data &= ~(LCK_RW_R_WAITING);
ea3f0419 1479 }
5ba3f43e 1480 }
ea3f0419 1481 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
5ba3f43e 1482 break;
ea3f0419 1483 }
1484 cpu_pause();
1485 }
1486 return lck_rw_done_gen(lock, prev);
1487}
1488
1489/*
2d21ac55 1490 * Routine: lck_rw_done_gen
b0d623f7 1491 *
5ba3f43e 1492 * called from lck_rw_done()
b0d623f7 1493 * prior_lock_state is the value in the 1st
ea3f0419 1494 * word of the lock at the time of a successful
b0d623f7 1495 * atomic compare and exchange with the new value...
ea3f0419 1496 * it represents the state of the lock before we
b0d623f7 1497 * decremented the rw_shared_count or cleared either
ea3f0419 1498 * rw_want_upgrade or rw_want_write and
b0d623f7 1499 * the lck_x_waiting bits... since the wrapper
ea3f0419 1500 * routine has already changed the state atomically,
1501 * we just need to decide if we should
1502 * wake up anyone and what value to return... we do
1503 * this by examining the state of the lock before
1504 * we changed it
91447636 1505 */
5ba3f43e 1506static lck_rw_type_t
2d21ac55 1507lck_rw_done_gen(
1508 lck_rw_t *lck,
1509 uint32_t prior_lock_state)
91447636 1510{
1511 lck_rw_t *fake_lck;
1512 lck_rw_type_t lock_type;
1513 thread_t thread;
1514 uint32_t rwlock_count;
39236c6e 1515
1516 thread = current_thread();
1517 rwlock_count = thread->rwlock_count--;
b0d623f7 1518 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1519
1520 if (lck->lck_rw_can_sleep) {
1521 /*
1522 * prior_lock state is a snapshot of the 1st word of the
1523 * lock in question... we'll fake up a pointer to it
1524 * and carefully not access anything beyond what's defined
1525 * in the first word of a lck_rw_t
1526 */
91447636 1527
1528 if (fake_lck->lck_rw_shared_count <= 1) {
1529 if (fake_lck->lck_w_waiting) {
1530 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1531 }
2d21ac55 1532
1533 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
1534 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1535 }
1536 }
fe8ab488 1537#if MACH_LDEBUG
1538 if (rwlock_count == 0) {
1539 panic("rw lock count underflow for thread %p", thread);
1540 }
fe8ab488 1541#endif
1542 /* Check if dropping the lock means that we need to unpromote */
1543
1544 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1545 /* sched_flags checked without lock, but will be rechecked while clearing */
1546 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1547 }
1548 }
1549 if (fake_lck->lck_rw_shared_count) {
1550 lock_type = LCK_RW_TYPE_SHARED;
1551 } else {
1552 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1553 }
1554
2d21ac55 1555#if CONFIG_DTRACE
b0d623f7 1556 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1557#endif
1558
0a7de745 1559 return lock_type;
1560}
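
/*
 * Example (illustrative sketch, not built): the prior_lock_state trick used by
 * lck_rw_done_gen() - a saved 32-bit lock word is viewed through the lck_rw_t
 * bitfield layout so the old state can be decoded by name.
 */
#if 0
static boolean_t
example_snapshot_had_readers(uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck = (lck_rw_t *)&prior_lock_state;

	/* only the first word is valid; nothing beyond it may be dereferenced */
	return fake_lck->lck_rw_shared_count != 0;
}
#endif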
1561
1562
1563/*
1564 * Routine: lck_rw_unlock
1565 */
1566void
1567lck_rw_unlock(
1568 lck_rw_t *lck,
1569 lck_rw_type_t lck_rw_type)
91447636 1570{
ea3f0419 1571 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
91447636 1572 lck_rw_unlock_shared(lck);
ea3f0419 1573 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
91447636 1574 lck_rw_unlock_exclusive(lck);
ea3f0419 1575 } else {
91447636 1576 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
ea3f0419 1577 }
1578}
1579
1580
1581/*
1582 * Routine: lck_rw_unlock_shared
1583 */
1584void
1585lck_rw_unlock_shared(
ea3f0419 1586 lck_rw_t *lck)
91447636 1587{
ea3f0419 1588 lck_rw_type_t ret;
91447636 1589
a39ff7e2 1590 assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1591 ret = lck_rw_done(lck);
1592
ea3f0419 1593 if (ret != LCK_RW_TYPE_SHARED) {
39037602 1594 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
ea3f0419 1595 }
1596}
1597
1598
1599/*
1600 * Routine: lck_rw_unlock_exclusive
1601 */
1602void
1603lck_rw_unlock_exclusive(
ea3f0419 1604 lck_rw_t *lck)
91447636 1605{
ea3f0419 1606 lck_rw_type_t ret;
1607
1608 ret = lck_rw_done(lck);
1609
ea3f0419 1610 if (ret != LCK_RW_TYPE_EXCLUSIVE) {
91447636 1611 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
ea3f0419 1612 }
1613}
1614
1615
1616/*
1617 * Routine: lck_rw_lock
1618 */
1619void
1620lck_rw_lock(
1621 lck_rw_t *lck,
1622 lck_rw_type_t lck_rw_type)
91447636 1623{
ea3f0419 1624 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
91447636 1625 lck_rw_lock_shared(lck);
ea3f0419 1626 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
91447636 1627 lck_rw_lock_exclusive(lck);
ea3f0419 1628 } else {
91447636 1629 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
ea3f0419 1630 }
1631}
1632
1633/*
1634 * Routine: lck_rw_lock_shared
1635 */
1636void
1637lck_rw_lock_shared(lck_rw_t *lock)
1638{
ea3f0419 1639 uint32_t data, prev;
1640
1641 current_thread()->rwlock_count++;
ea3f0419 1642 for (;;) {
1643 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1644 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1645 atomic_exchange_abort();
1646 if (lock->lck_rw_can_sleep) {
1647 lck_rw_lock_shared_gen(lock);
1648 } else {
1649 cpu_pause();
1650 continue;
1651 }
1652 break;
1653 }
1654 data += LCK_RW_SHARED_READER;
ea3f0419 1655 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1656 break;
ea3f0419 1657 }
1658 cpu_pause();
1659 }
ea3f0419 1660#if CONFIG_DTRACE
5ba3f43e 1661 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
ea3f0419 1662#endif /* CONFIG_DTRACE */
1663 return;
1664}
1665
1666/*
2d21ac55 1667 * Routine: lck_rw_lock_shared_gen
1668 * Function:
1669 * assembly fast path code has determined that this lock
1670 * is held exclusively... this is where we spin/block
1671 * until we can acquire the lock in the shared mode
91447636 1672 */
5ba3f43e 1673static void
2d21ac55 1674lck_rw_lock_shared_gen(
ea3f0419 1675 lck_rw_t *lck)
91447636 1676{
1677 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1678 uint64_t deadline = 0;
1679 int gotlock = 0;
1680 int slept = 0;
1681 wait_result_t res = 0;
1682 boolean_t istate = -1;
3e170ce0 1683
ea3f0419 1684#if CONFIG_DTRACE
2d21ac55 1685 uint64_t wait_interval = 0;
1686 int readers_at_sleep = 0;
1687 boolean_t dtrace_ls_initialized = FALSE;
1688 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
2d21ac55 1689#endif
91447636 1690
1691 while (!lck_rw_grab_shared(lck)) {
1692#if CONFIG_DTRACE
1693 if (dtrace_ls_initialized == FALSE) {
1694 dtrace_ls_initialized = TRUE;
1695 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1696 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1697 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1698 if (dtrace_ls_enabled) {
1699 /*
1700 * Either sleeping or spinning is happening,
1701 * start a timing of our delay interval now.
1702 */
1703 readers_at_sleep = lck->lck_rw_shared_count;
1704 wait_interval = mach_absolute_time();
1705 }
1706 }
2d21ac55 1707#endif
ea3f0419 1708 if (istate == -1) {
b0d623f7 1709 istate = ml_get_interrupts_enabled();
ea3f0419 1710 }
91447636 1711
b0d623f7 1712 deadline = lck_rw_deadline_for_spin(lck);
0c530ab8 1713
b0d623f7 1714 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
ea3f0419 1715 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
91447636 1716
ea3f0419 1717 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
b0d623f7 1718 lck_rw_lock_pause(istate);
ea3f0419 1719 }
1720
1721 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
ea3f0419 1722 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
b0d623f7 1723
ea3f0419 1724 if (gotlock) {
b0d623f7 1725 break;
ea3f0419 1726 }
1727 /*
1728 * if we get here, the deadline has expired w/o us
1729 * being able to grab the lock for read
1730 * check to see if we're allowed to do a thread_block
1731 */
1732 if (lck->lck_rw_can_sleep) {
91447636 1733 istate = lck_interlock_lock(lck);
91447636 1734
b0d623f7
A
1735 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1736 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
b0d623f7 1737 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
ea3f0419 1738 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
b0d623f7
A
1739
1740 lck->lck_r_waiting = TRUE;
1741
813fb2f6 1742 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
d9a64523 1743 res = assert_wait(RW_LOCK_READER_EVENT(lck),
ea3f0419 1744 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1745 lck_interlock_unlock(lck, istate);
1746
1747 if (res == THREAD_WAITING) {
1748 res = thread_block(THREAD_CONTINUE_NULL);
1749 slept++;
1750 }
1751 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
ea3f0419 1752 trace_lck, res, slept, 0, 0);
1753 } else {
1754 lck->lck_rw_shared_count++;
1755 lck_interlock_unlock(lck, istate);
1756 break;
1757 }
1758 }
1759 }
1760
ea3f0419 1761#if CONFIG_DTRACE
b0d623f7 1762 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1763 if (slept == 0) {
0a7de745 1764 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2d21ac55 1765 } else {
0a7de745 1766 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
2d21ac55
A
1767 mach_absolute_time() - wait_interval, 0,
1768 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1769 }
1770 }
1771 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1772#endif
91447636
A
1773}
1774
f427ee49
A
1775#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->data, \
1776 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1777 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1778
1779/*
1780 * Routine: lck_rw_lock_exclusive_check_contended
1781 */
1782
1783bool
1784lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
1785{
1786 bool contended = false;
1787 current_thread()->rwlock_count++;
1788 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1789#if CONFIG_DTRACE
1790 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1791#endif /* CONFIG_DTRACE */
1792 } else {
1793 contended = true;
1794 lck_rw_lock_exclusive_gen(lock);
1795 }
1796 return contended;
1797}
91447636 1798
5ba3f43e
A
1799/*
1800 * Routine: lck_rw_lock_exclusive
1801 */
1802
1803void
1804lck_rw_lock_exclusive(lck_rw_t *lock)
1805{
1806 current_thread()->rwlock_count++;
f427ee49 1807 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
ea3f0419 1808#if CONFIG_DTRACE
5ba3f43e 1809 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
ea3f0419
A
1810#endif /* CONFIG_DTRACE */
1811 } else {
5ba3f43e 1812 lck_rw_lock_exclusive_gen(lock);
ea3f0419 1813 }
5ba3f43e
A
1814}
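
/*
 * Illustrative sketch (editorial addition, not part of the original XNU
 * source): a minimal user-space model of the LCK_RW_LOCK_EXCLUSIVE_TAS
 * fast path above, written with C11 atomics.  The names rw_word_t and the
 * RW_* masks are hypothetical stand-ins for the lck_rw_t data word and its
 * bit layout; atomic_test_and_set32() in the macro behaves roughly like
 * this "check the tested bits, then CAS in the new bit" sequence.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define RW_SHARED_MASK   0x0000ffffu   /* reader count field (assumed layout) */
#define RW_WANT_UPGRADE  0x00010000u
#define RW_WANT_EXCL     0x00020000u
#define RW_INTERLOCK     0x00040000u

typedef _Atomic uint32_t rw_word_t;

/* Try to move the word from "no readers/writers/interlock" to WANT_EXCL. */
static bool
rw_exclusive_tas(rw_word_t *word)
{
	uint32_t old = atomic_load_explicit(word, memory_order_relaxed);

	/* Any reader, writer, upgrader or interlock holder defeats the fast path. */
	if (old & (RW_SHARED_MASK | RW_WANT_EXCL | RW_WANT_UPGRADE | RW_INTERLOCK)) {
		return false;
	}
	return atomic_compare_exchange_strong_explicit(word, &old,
	    old | RW_WANT_EXCL, memory_order_acquire, memory_order_relaxed);
}
/*
 * On failure the caller falls back to the contended path, just as
 * lck_rw_lock_exclusive() above falls back to lck_rw_lock_exclusive_gen().
 */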
1815
1816
1817/*
1818 * Routine: lck_rw_lock_shared_to_exclusive
cb323159
A
1819 *
 1820 * FALSE is returned upon failure; in this case the shared lock is dropped.
5ba3f43e
A
1821 */
1822
1823boolean_t
1824lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1825{
ea3f0419 1826 uint32_t data, prev;
5ba3f43e 1827
ea3f0419 1828 for (;;) {
5ba3f43e
A
1829 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1830 if (data & LCK_RW_INTERLOCK) {
1831 atomic_exchange_abort();
1832 lck_rw_interlock_spin(lock);
1833 continue;
1834 }
1835 if (data & LCK_RW_WANT_UPGRADE) {
1836 data -= LCK_RW_SHARED_READER;
ea3f0419
A
1837 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1838 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1839 }
1840 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1841 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
ea3f0419 1842 }
5ba3f43e 1843 } else {
ea3f0419
A
1844 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1845 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1846 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1847 break;
ea3f0419 1848 }
5ba3f43e
A
1849 }
1850 cpu_pause();
1851 }
ea3f0419
A
1852 /* we now own the WANT_UPGRADE */
1853 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1854 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1855 }
1856#if CONFIG_DTRACE
5ba3f43e
A
1857 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1858#endif
1859 return TRUE;
1860}
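
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source) of the upgrade contract documented above: when
 * lck_rw_lock_shared_to_exclusive() returns FALSE the shared hold has
 * already been dropped, so the caller owns nothing and must take the lock
 * again before retrying.  update_cache() and cache_lock are hypothetical;
 * the lck_rw_* calls are the public interfaces implemented in this file.
 */
static void
update_cache(lck_rw_t *cache_lock)
{
	lck_rw_lock_shared(cache_lock);

	/* ... read-side checks under the shared hold ... */

	if (!lck_rw_lock_shared_to_exclusive(cache_lock)) {
		/* upgrade failed: the lock is no longer held at all */
		lck_rw_lock_exclusive(cache_lock);
		/* ... revalidate, since others may have run in the gap ... */
	}

	/* ... modify the structure while held exclusively ... */

	lck_rw_unlock_exclusive(cache_lock);
}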
1861
1862
91447636 1863/*
b0d623f7 1864 * Routine: lck_rw_lock_shared_to_exclusive_failure
91447636 1865 * Function:
b0d623f7
A
1866 * assembly fast path code has already dropped our read
1867 * count and determined that someone else owns 'lck_rw_want_upgrade'
 1868 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1869 * all we need to do here is determine if a wakeup is needed
91447636 1870 */
5ba3f43e 1871static boolean_t
b0d623f7 1872lck_rw_lock_shared_to_exclusive_failure(
ea3f0419
A
1873 lck_rw_t *lck,
1874 uint32_t prior_lock_state)
91447636 1875{
ea3f0419
A
1876 lck_rw_t *fake_lck;
1877 thread_t thread = current_thread();
1878 uint32_t rwlock_count;
39236c6e
A
1879
1880 /* Check if dropping the lock means that we need to unpromote */
1881 rwlock_count = thread->rwlock_count--;
1882#if MACH_LDEBUG
1883 if (rwlock_count == 0) {
1884 panic("rw lock count underflow for thread %p", thread);
1885 }
1886#endif
b0d623f7 1887 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1888
b0d623f7 1889 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
91447636
A
1890 /*
1891 * Someone else has requested upgrade.
b0d623f7
A
1892 * Since we've released the read lock, wake
 1893 * it up if it's blocked waiting
91447636 1894 */
b0d623f7
A
1895 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1896 }
5ba3f43e
A
1897
1898 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1899 /* sched_flags checked without lock, but will be rechecked while clearing */
d9a64523 1900 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
5ba3f43e
A
1901 }
1902
b0d623f7 1903 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
ea3f0419 1904 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
91447636 1905
ea3f0419 1906 return FALSE;
b0d623f7 1907}
91447636 1908
91447636 1909
b0d623f7
A
1910/*
 1911 * Routine: lck_rw_lock_shared_to_exclusive_success
1912 * Function:
1913 * assembly fast path code has already dropped our read
1914 * count and successfully acquired 'lck_rw_want_upgrade'
1915 * we just need to wait for the rest of the readers to drain
1916 * and then we can return as the exclusive holder of this lock
1917 */
5ba3f43e 1918static boolean_t
b0d623f7 1919lck_rw_lock_shared_to_exclusive_success(
ea3f0419 1920 lck_rw_t *lck)
b0d623f7 1921{
ea3f0419
A
1922 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1923 uint64_t deadline = 0;
1924 int slept = 0;
1925 int still_shared = 0;
1926 wait_result_t res;
1927 boolean_t istate = -1;
91447636 1928
ea3f0419 1929#if CONFIG_DTRACE
b0d623f7
A
1930 uint64_t wait_interval = 0;
1931 int readers_at_sleep = 0;
1932 boolean_t dtrace_ls_initialized = FALSE;
1933 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1934#endif
91447636 1935
2d21ac55 1936 while (lck->lck_rw_shared_count != 0) {
ea3f0419 1937#if CONFIG_DTRACE
b0d623f7
A
1938 if (dtrace_ls_initialized == FALSE) {
1939 dtrace_ls_initialized = TRUE;
1940 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1941 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1942 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1943 if (dtrace_ls_enabled) {
1944 /*
1945 * Either sleeping or spinning is happening,
 1946 * start timing our delay interval now.
1947 */
1948 readers_at_sleep = lck->lck_rw_shared_count;
1949 wait_interval = mach_absolute_time();
1950 }
2d21ac55
A
1951 }
1952#endif
ea3f0419 1953 if (istate == -1) {
b0d623f7 1954 istate = ml_get_interrupts_enabled();
ea3f0419 1955 }
b0d623f7
A
1956
1957 deadline = lck_rw_deadline_for_spin(lck);
1958
1959 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
ea3f0419 1960 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1961
ea3f0419 1962 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
b0d623f7 1963 lck_rw_lock_pause(istate);
ea3f0419 1964 }
b0d623f7
A
1965
1966 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
ea3f0419 1967 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1968
ea3f0419 1969 if (!still_shared) {
b0d623f7 1970 break;
ea3f0419 1971 }
b0d623f7
A
1972 /*
1973 * if we get here, the deadline has expired w/o
1974 * the rw_shared_count having drained to 0
1975 * check to see if we're allowed to do a thread_block
1976 */
1977 if (lck->lck_rw_can_sleep) {
91447636 1978 istate = lck_interlock_lock(lck);
0a7de745 1979
b0d623f7
A
1980 if (lck->lck_rw_shared_count != 0) {
1981 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
ea3f0419 1982 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1983
1984 lck->lck_w_waiting = TRUE;
91447636 1985
813fb2f6 1986 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
d9a64523 1987 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
ea3f0419 1988 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1989 lck_interlock_unlock(lck, istate);
b0d623f7
A
1990
1991 if (res == THREAD_WAITING) {
1992 res = thread_block(THREAD_CONTINUE_NULL);
1993 slept++;
1994 }
1995 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
ea3f0419 1996 trace_lck, res, slept, 0, 0);
b0d623f7
A
1997 } else {
1998 lck_interlock_unlock(lck, istate);
1999 break;
91447636
A
2000 }
2001 }
91447636 2002 }
ea3f0419 2003#if CONFIG_DTRACE
2d21ac55
A
2004 /*
2005 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
2006 */
b0d623f7 2007 if (dtrace_ls_enabled == TRUE) {
2d21ac55 2008 if (slept == 0) {
0a7de745 2009 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2d21ac55 2010 } else {
0a7de745 2011 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
2d21ac55
A
2012 mach_absolute_time() - wait_interval, 1,
2013 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
2014 }
2015 }
2d21ac55
A
2016 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
2017#endif
ea3f0419 2018 return TRUE;
91447636
A
2019}
2020
5ba3f43e
A
2021/*
2022 * Routine: lck_rw_lock_exclusive_to_shared
2023 */
2024
ea3f0419
A
2025void
2026lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
5ba3f43e 2027{
ea3f0419 2028 uint32_t data, prev;
5ba3f43e 2029
ea3f0419 2030 for (;;) {
5ba3f43e
A
2031 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
2032 if (data & LCK_RW_INTERLOCK) {
2033 atomic_exchange_abort();
ea3f0419 2034 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
5ba3f43e
A
2035 continue;
2036 }
2037 data += LCK_RW_SHARED_READER;
ea3f0419 2038 if (data & LCK_RW_WANT_UPGRADE) {
5ba3f43e 2039 data &= ~(LCK_RW_WANT_UPGRADE);
ea3f0419 2040 } else {
5ba3f43e 2041 data &= ~(LCK_RW_WANT_EXCL);
ea3f0419
A
2042 }
2043 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
5ba3f43e 2044 data &= ~(LCK_RW_W_WAITING);
ea3f0419
A
2045 }
2046 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
5ba3f43e 2047 break;
ea3f0419 2048 }
5ba3f43e
A
2049 cpu_pause();
2050 }
2051 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
2052}
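
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source) of the downgrade path above: holding the lock exclusively
 * for the update and then downgrading lets the thread keep reading what it
 * just wrote without opening a window in which another writer could run.
 * table_lock is hypothetical.
 */
static void
insert_then_verify(lck_rw_t *table_lock)
{
	lck_rw_lock_exclusive(table_lock);
	/* ... mutate the shared structure ... */
	lck_rw_lock_exclusive_to_shared(table_lock);
	/* ... still held for read; other readers may now enter as well ... */
	lck_rw_unlock_shared(table_lock);
}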
2053
b0d623f7 2054
91447636 2055/*
5ba3f43e 2056 * Routine: lck_rw_lock_exclusive_to_shared_gen
ea3f0419 2057 * Function:
b0d623f7
A
2058 * assembly fast path has already dropped
2059 * our exclusive state and bumped lck_rw_shared_count
2060 * all we need to do here is determine if anyone
2061 * needs to be awakened.
91447636 2062 */
5ba3f43e 2063static void
b0d623f7 2064lck_rw_lock_exclusive_to_shared_gen(
ea3f0419
A
2065 lck_rw_t *lck,
2066 uint32_t prior_lock_state)
91447636 2067{
ea3f0419
A
2068 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
2069 lck_rw_t *fake_lck;
91447636 2070
b0d623f7 2071 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 2072
b0d623f7 2073 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
ea3f0419 2074 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
91447636 2075
b0d623f7
A
2076 /*
2077 * don't wake up anyone waiting to take the lock exclusively
2078 * since we hold a read count... when the read count drops to 0,
2079 * the writers will be woken.
2080 *
2081 * wake up any waiting readers if we don't have any writers waiting,
2082 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
2083 */
ea3f0419 2084 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
2d21ac55 2085 thread_wakeup(RW_LOCK_READER_EVENT(lck));
ea3f0419 2086 }
91447636
A
2087
2088 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
ea3f0419 2089 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
91447636 2090
2d21ac55
A
2091#if CONFIG_DTRACE
2092 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
2093#endif
91447636
A
2094}
2095
2096
2097/*
2098 * Routine: lck_rw_try_lock
2099 */
2100boolean_t
2101lck_rw_try_lock(
ea3f0419
A
2102 lck_rw_t *lck,
2103 lck_rw_type_t lck_rw_type)
2104{
2105 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
2106 return lck_rw_try_lock_shared(lck);
2107 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
2108 return lck_rw_try_lock_exclusive(lck);
2109 } else {
91447636 2110 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
ea3f0419
A
2111 }
2112 return FALSE;
91447636
A
2113}
2114
5ba3f43e
A
2115/*
2116 * Routine: lck_rw_try_lock_shared
2117 */
2118
ea3f0419
A
2119boolean_t
2120lck_rw_try_lock_shared(lck_rw_t *lock)
5ba3f43e 2121{
ea3f0419 2122 uint32_t data, prev;
5ba3f43e 2123
ea3f0419 2124 for (;;) {
5ba3f43e
A
2125 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
2126 if (data & LCK_RW_INTERLOCK) {
2127 atomic_exchange_abort();
2128 lck_rw_interlock_spin(lock);
2129 continue;
2130 }
2131 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2132 atomic_exchange_abort();
ea3f0419 2133 return FALSE; /* lock is busy */
5ba3f43e 2134 }
ea3f0419
A
2135 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
2136 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 2137 break;
ea3f0419 2138 }
5ba3f43e
A
2139 cpu_pause();
2140 }
2141 current_thread()->rwlock_count++;
2142 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
ea3f0419 2143#if CONFIG_DTRACE
5ba3f43e 2144 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
ea3f0419 2145#endif /* CONFIG_DTRACE */
5ba3f43e
A
2146 return TRUE;
2147}
2148
2149
2150/*
2151 * Routine: lck_rw_try_lock_exclusive
2152 */
2153
ea3f0419
A
2154boolean_t
2155lck_rw_try_lock_exclusive(lck_rw_t *lock)
5ba3f43e 2156{
ea3f0419 2157 uint32_t data, prev;
5ba3f43e 2158
ea3f0419 2159 for (;;) {
5ba3f43e
A
2160 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
2161 if (data & LCK_RW_INTERLOCK) {
2162 atomic_exchange_abort();
2163 lck_rw_interlock_spin(lock);
2164 continue;
2165 }
2166 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2167 atomic_exchange_abort();
ea3f0419 2168 return FALSE; /* can't get it */
5ba3f43e
A
2169 }
2170 data |= LCK_RW_WANT_EXCL;
ea3f0419 2171 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 2172 break;
ea3f0419 2173 }
5ba3f43e
A
2174 cpu_pause();
2175 }
2176
2177 current_thread()->rwlock_count++;
ea3f0419 2178#if CONFIG_DTRACE
5ba3f43e 2179 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
ea3f0419 2180#endif /* CONFIG_DTRACE */
5ba3f43e
A
2181 return TRUE;
2182}
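
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source): the classic use of the try-lock paths above to avoid a
 * lock-ordering reversal.  While holding object_lock, parent_lock may only
 * be taken opportunistically; if the try fails, back off and re-take both
 * in the canonical parent-then-object order.  Both lock pointers are
 * hypothetical.
 */
static void
lock_parent_and_object(lck_rw_t *parent_lock, lck_rw_t *object_lock)
{
	lck_rw_lock_exclusive(object_lock);

	if (!lck_rw_try_lock_exclusive(parent_lock)) {
		/* taking parent_lock here would invert the ordering: back off */
		lck_rw_unlock_exclusive(object_lock);
		lck_rw_lock_exclusive(parent_lock);
		lck_rw_lock_exclusive(object_lock);
	}
	/* ... both locks held ... */
}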
2183
91447636 2184
2d21ac55
A
2185void
2186lck_rw_assert(
ea3f0419
A
2187 lck_rw_t *lck,
2188 unsigned int type)
2d21ac55
A
2189{
2190 switch (type) {
2191 case LCK_RW_ASSERT_SHARED:
2192 if (lck->lck_rw_shared_count != 0) {
2193 return;
2194 }
2195 break;
2196 case LCK_RW_ASSERT_EXCLUSIVE:
2197 if ((lck->lck_rw_want_write ||
ea3f0419 2198 lck->lck_rw_want_upgrade) &&
2d21ac55
A
2199 lck->lck_rw_shared_count == 0) {
2200 return;
2201 }
2202 break;
2203 case LCK_RW_ASSERT_HELD:
2204 if (lck->lck_rw_want_write ||
2205 lck->lck_rw_want_upgrade ||
2206 lck->lck_rw_shared_count != 0) {
2207 return;
2208 }
2209 break;
39236c6e
A
2210 case LCK_RW_ASSERT_NOTHELD:
2211 if (!(lck->lck_rw_want_write ||
ea3f0419
A
2212 lck->lck_rw_want_upgrade ||
2213 lck->lck_rw_shared_count != 0)) {
39236c6e
A
2214 return;
2215 }
2216 break;
2d21ac55
A
2217 default:
2218 break;
2219 }
2220
39236c6e
A
2221 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2222}
2223
2224/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
cb323159
A
2225#if MACH_LDEBUG
2226__dead2
2227#endif
39236c6e
A
2228void
2229lck_rw_clear_promotions_x86(thread_t thread)
2230{
2231#if MACH_LDEBUG
2232 /* It's fatal to leave a RW lock locked and return to userspace */
2233 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2234#else
2235 /* Paper over the issue */
2236 thread->rwlock_count = 0;
d9a64523 2237 lck_rw_clear_promotion(thread, 0);
39236c6e 2238#endif
2d21ac55
A
2239}
2240
5ba3f43e
A
2241boolean_t
2242lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2243{
2244 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2245
2246 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2247 lck_rw_unlock_shared(lck);
2248 mutex_pause(2);
2249 lck_rw_lock_shared(lck);
2250 return TRUE;
2251 }
2252
2253 return FALSE;
2254}
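
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source): using lck_rw_lock_yield_shared() from a long read-side scan
 * so that waiting writers are not starved.  A yield drops and re-takes the
 * shared lock, so the scan must restart when TRUE is returned.  The list
 * type and iteration helpers are hypothetical.
 */
struct scan_entry;
extern struct scan_entry *scan_first(void);                     /* hypothetical */
extern struct scan_entry *scan_next(struct scan_entry *);       /* hypothetical */

static void
scan_list(lck_rw_t *list_lock)
{
	struct scan_entry *e;

	lck_rw_lock_shared(list_lock);
restart:
	for (e = scan_first(); e != NULL; e = scan_next(e)) {
		/* ... examine the entry under the shared hold ... */

		if (lck_rw_lock_yield_shared(list_lock, FALSE)) {
			/* lock was dropped and re-acquired; list may have changed */
			goto restart;
		}
	}
	lck_rw_unlock_shared(list_lock);
}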
39236c6e 2255
3e170ce0
A
2256/*
2257 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2258 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2259 */
2260boolean_t
ea3f0419
A
2261kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2262{
3e170ce0
A
2263 if (not_in_kdp) {
2264 panic("panic: rw lock exclusive check done outside of kernel debugger");
2265 }
2266 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2267}
2268
d9a64523
A
2269/*
2270 * Slow path routines for lck_mtx locking and unlocking functions.
2271 *
2272 * These functions were previously implemented in x86 assembly,
 2273 * and some optimizations are in place in this C code so that the compiled code
 2274 * is as performant and compact as the assembly version.
 2275 *
 2276 * To avoid inlining these functions on the fast path, all functions directly called by
 2277 * the fast paths have __attribute__((noinline)) specified. They are also implemented
 2278 * in such a way that the fast path can tail call into them. This way the return address
 2279 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
 2280 *
 2281 * Slow path code is structured so that there are no calls to functions that will return
 2282 * in the context of the caller function, i.e. all functions called are either tail call functions
 2283 * or inline functions. The tail call functions take fewer than six arguments,
 2284 * so that they can be passed in registers and do not need to be pushed on the stack.
 2285 * This allows the compiler to not create a stack frame for those functions.
 2286 *
 2287 * __improbable and __probable are used to compile the slow path code in such a way
 2288 * that the fast path case lands on a sequence of instructions with as few jumps as possible,
 2289 * making that case the most optimized even when falling through to the slow path.
2290 */
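
/*
 * Illustrative sketch (editorial addition, not part of the original XNU
 * source): the general shape of the fast-path/slow-path split described
 * above, modeled on a hypothetical counter_lock_t with C11 atomics.  The
 * fast path is a small inline that attempts one atomic operation and, on
 * failure, tail calls the noinline slow path, so the uncontended case
 * never grows a stack frame or a call.
 */
#include <stdatomic.h>

typedef struct {
	_Atomic int owned;
} counter_lock_t;

__attribute__((noinline))
static void
counter_lock_slow(counter_lock_t *l)
{
	int expected = 0;

	/* contended case: spin (a real lock would eventually block) */
	while (!atomic_compare_exchange_weak_explicit(&l->owned, &expected, 1,
	    memory_order_acquire, memory_order_relaxed)) {
		expected = 0;
	}
}

static inline void
counter_lock(counter_lock_t *l)
{
	int expected = 0;

	if (__builtin_expect(atomic_compare_exchange_strong_explicit(&l->owned,
	    &expected, 1, memory_order_acquire, memory_order_relaxed), 1)) {
		return;                         /* uncontended fast path */
	}
	return counter_lock_slow(l);            /* tail call into the slow path */
}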
2291
2292/*
2293 * Intel lock invariants:
2294 *
2295 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
d9a64523
A
2296 *
2297 * The lock owner is promoted to the max priority of all its waiters only if it
2298 * was a lower priority when it acquired or was an owner when a waiter waited.
2299 * Max priority is capped at MAXPRI_PROMOTE.
2300 *
2301 * The last waiter will not be promoted as it is woken up, but the last
2302 * lock owner may not have been the last thread to have been woken up depending on the
2303 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2304 * flag set.
2305 *
2306 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2307 * priority from dropping priority in the future without having to take thread lock
2308 * on acquire.
2309 */
3e170ce0 2310
91447636
A
2311/*
2312 * Routine: lck_mtx_alloc_init
2313 */
2314lck_mtx_t *
2315lck_mtx_alloc_init(
ea3f0419
A
2316 lck_grp_t *grp,
2317 lck_attr_t *attr)
91447636 2318{
f427ee49
A
2319 lck_mtx_t *lck;
2320
2321 lck = zalloc(ZV_LCK_MTX);
2322 lck_mtx_init(lck, grp, attr);
ea3f0419 2323 return lck;
91447636
A
2324}
2325
2326/*
2327 * Routine: lck_mtx_free
2328 */
2329void
2330lck_mtx_free(
ea3f0419
A
2331 lck_mtx_t *lck,
2332 lck_grp_t *grp)
91447636
A
2333{
2334 lck_mtx_destroy(lck, grp);
f427ee49 2335 zfree(ZV_LCK_MTX, lck);
91447636
A
2336}
2337
2338/*
2339 * Routine: lck_mtx_ext_init
2340 */
2341static void
2342lck_mtx_ext_init(
ea3f0419
A
2343 lck_mtx_ext_t *lck,
2344 lck_grp_t *grp,
2345 lck_attr_t *attr)
91447636 2346{
2d21ac55 2347 bzero((void *)lck, sizeof(lck_mtx_ext_t));
91447636
A
2348
2349 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636
A
2350 lck->lck_mtx_deb.type = MUTEX_TAG;
2351 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2352 }
2353
2354 lck->lck_mtx_grp = grp;
2d21ac55 2355
ea3f0419 2356 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
6d2010ae 2357 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
ea3f0419 2358 }
b0d623f7 2359
6d2010ae 2360 lck->lck_mtx.lck_mtx_is_ext = 1;
39037602 2361 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2362}
2363
2364/*
2365 * Routine: lck_mtx_init
2366 */
2367void
2368lck_mtx_init(
ea3f0419
A
2369 lck_mtx_t *lck,
2370 lck_grp_t *grp,
2371 lck_attr_t *attr)
91447636 2372{
ea3f0419
A
2373 lck_mtx_ext_t *lck_ext;
2374 lck_attr_t *lck_attr;
2d21ac55 2375
ea3f0419 2376 if (attr != LCK_ATTR_NULL) {
2d21ac55 2377 lck_attr = attr;
ea3f0419 2378 } else {
2d21ac55 2379 lck_attr = &LockDefaultLckAttr;
ea3f0419 2380 }
91447636 2381
2d21ac55 2382 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
f427ee49
A
2383 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2384 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2385 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2386 lck->lck_mtx_ptr = lck_ext;
91447636 2387 } else {
b0d623f7 2388 lck->lck_mtx_owner = 0;
6d2010ae 2389 lck->lck_mtx_state = 0;
91447636 2390 }
39037602 2391 lck->lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2392 lck_grp_reference(grp);
2393 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2394}
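
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source): the typical lifecycle of a mutex built on the routines
 * above.  A subsystem allocates one lock group, then allocates mutexes
 * against it; passing an attribute with LCK_ATTR_DEBUG set would make the
 * init path take the indirect lck_mtx_ext_t branch shown above.  The
 * my_subsys_* names are hypothetical.
 */
static lck_grp_t        *my_subsys_lck_grp;
static lck_mtx_t        *my_subsys_mtx;

static void
my_subsys_locks_init(void)
{
	my_subsys_lck_grp = lck_grp_alloc_init("my_subsys", LCK_GRP_ATTR_NULL);
	my_subsys_mtx = lck_mtx_alloc_init(my_subsys_lck_grp, LCK_ATTR_NULL);
}

static void
my_subsys_locks_fini(void)
{
	lck_mtx_free(my_subsys_mtx, my_subsys_lck_grp);
	lck_grp_free(my_subsys_lck_grp);
}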
2395
2d21ac55
A
2396/*
2397 * Routine: lck_mtx_init_ext
2398 */
2399void
2400lck_mtx_init_ext(
ea3f0419
A
2401 lck_mtx_t *lck,
2402 lck_mtx_ext_t *lck_ext,
2403 lck_grp_t *grp,
2404 lck_attr_t *attr)
2d21ac55 2405{
ea3f0419 2406 lck_attr_t *lck_attr;
2d21ac55 2407
ea3f0419 2408 if (attr != LCK_ATTR_NULL) {
2d21ac55 2409 lck_attr = attr;
ea3f0419 2410 } else {
2d21ac55 2411 lck_attr = &LockDefaultLckAttr;
ea3f0419 2412 }
2d21ac55
A
2413
2414 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2415 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2416 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2417 lck->lck_mtx_ptr = lck_ext;
2418 } else {
b0d623f7 2419 lck->lck_mtx_owner = 0;
6d2010ae 2420 lck->lck_mtx_state = 0;
2d21ac55 2421 }
39037602 2422 lck->lck_mtx_pad32 = 0xFFFFFFFF;
6d2010ae 2423
2d21ac55
A
2424 lck_grp_reference(grp);
2425 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2426}
2427
d9a64523
A
2428static void
2429lck_mtx_lock_mark_destroyed(
2430 lck_mtx_t *mutex,
2431 boolean_t indirect)
2432{
2433 uint32_t state;
2434
2435 if (indirect) {
2436 /* convert to destroyed state */
2437 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2438 return;
2439 }
2440
2441 state = ordered_load_mtx_state(mutex);
2442 lck_mtx_interlock_lock(mutex, &state);
2443
2444 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2445
2446 enable_preemption();
2447}
2448
91447636
A
2449/*
2450 * Routine: lck_mtx_destroy
2451 */
2452void
2453lck_mtx_destroy(
ea3f0419
A
2454 lck_mtx_t *lck,
2455 lck_grp_t *grp)
91447636 2456{
d9a64523 2457 boolean_t indirect;
0a7de745 2458
ea3f0419 2459 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
91447636 2460 return;
ea3f0419 2461 }
39236c6e
A
2462#if MACH_LDEBUG
2463 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2464#endif
d9a64523 2465 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
b0d623f7 2466
d9a64523 2467 lck_mtx_lock_mark_destroyed(lck, indirect);
b0d623f7 2468
ea3f0419 2469 if (indirect) {
f427ee49 2470 zfree(ZV_LCK_MTX_EXT, lck->lck_mtx_ptr);
ea3f0419 2471 }
91447636
A
2472 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2473 lck_grp_deallocate(grp);
2474 return;
2475}
2476
b0d623f7 2477
d9a64523
A
2478#if DEVELOPMENT | DEBUG
2479__attribute__((noinline))
2480void
2481lck_mtx_owner_check_panic(
2482 lck_mtx_t *lock)
2483{
2484 thread_t owner = (thread_t)lock->lck_mtx_owner;
2485 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2486}
2487#endif
2488
2489__attribute__((always_inline))
2490static boolean_t
2491get_indirect_mutex(
2492 lck_mtx_t **lock,
ea3f0419 2493 uint32_t *state)
d9a64523
A
2494{
2495 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2496 *state = ordered_load_mtx_state(*lock);
2497 return TRUE;
2498}
2499
2500/*
ea3f0419 2501 * Routine: lck_mtx_unlock_slow
d9a64523
A
2502 *
2503 * Unlocks a mutex held by current thread.
2504 *
cb323159 2505 * It will wake up waiters if necessary.
d9a64523
A
2506 *
2507 * Interlock can be held.
2508 */
2509__attribute__((noinline))
2510void
2511lck_mtx_unlock_slow(
ea3f0419 2512 lck_mtx_t *lock)
d9a64523 2513{
ea3f0419
A
2514 thread_t thread;
2515 uint32_t state, prev;
2516 boolean_t indirect = FALSE;
d9a64523
A
2517
2518 state = ordered_load_mtx_state(lock);
2519
2520 /* Is this an indirect mutex? */
2521 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2522 indirect = get_indirect_mutex(&lock, &state);
2523 }
2524
2525 thread = current_thread();
2526
2527#if DEVELOPMENT | DEBUG
2528 thread_t owner = (thread_t)lock->lck_mtx_owner;
ea3f0419 2529 if (__improbable(owner != thread)) {
cb323159 2530 lck_mtx_owner_check_panic(lock);
ea3f0419 2531 }
d9a64523
A
2532#endif
2533
2534 /* check if it is held as a spinlock */
ea3f0419 2535 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
d9a64523 2536 goto unlock;
ea3f0419 2537 }
d9a64523
A
2538
2539 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2540
2541unlock:
2542 /* preemption disabled, interlock held and mutex not held */
2543
2544 /* clear owner */
2545 ordered_store_mtx_owner(lock, 0);
2546 /* keep original state in prev for later evaluation */
2547 prev = state;
d9a64523 2548
cb323159 2549 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
0a7de745 2550#if MACH_LDEBUG
ea3f0419 2551 if (thread) {
cb323159 2552 thread->mutex_count--;
ea3f0419 2553 }
cb323159
A
2554#endif
2555 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
0a7de745 2556 }
d9a64523 2557
cb323159
A
2558 /* release interlock, promotion and clear spin flag */
2559 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
ea3f0419 2560 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
cb323159 2561
ea3f0419 2562#if MACH_LDEBUG
cb323159 2563 /* perform lock statistics after drop to prevent delay */
ea3f0419
A
2564 if (thread) {
2565 thread->mutex_count--; /* lock statistic */
2566 }
2567#endif /* MACH_LDEBUG */
d9a64523
A
2568
2569 /* re-enable preemption */
2570 lck_mtx_unlock_finish_inline(lock, FALSE);
2571
2572 return;
2573}
2574
ea3f0419
A
2575#define LCK_MTX_LCK_WAIT_CODE 0x20
2576#define LCK_MTX_LCK_WAKEUP_CODE 0x21
2577#define LCK_MTX_LCK_SPIN_CODE 0x22
2578#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2579#define LCK_MTX_LCK_DEMOTE_CODE 0x24
b0d623f7 2580
b0d623f7 2581/*
d9a64523 2582 * Routine: lck_mtx_unlock_wakeup_tail
b0d623f7 2583 *
d9a64523
A
2584 * Invoked on unlock when there is
2585 * contention, i.e. the assembly routine sees
cb323159 2586 * that mutex->lck_mtx_waiters != 0
b0d623f7 2587 *
6d2010ae 2588 * neither the mutex or interlock is held
d9a64523
A
2589 *
2590 * Note that this routine might not be called if there are pending
2591 * waiters which have previously been woken up, and they didn't
2592 * end up boosting the old owner.
2593 *
2594 * assembly routine previously did the following to mutex:
2595 * (after saving the state in prior_lock_state)
d9a64523
A
2596 * decremented lck_mtx_waiters if nonzero
2597 *
2598 * This function needs to be called as a tail call
2599 * to optimize the compiled code.
b0d623f7 2600 */
d9a64523
A
2601__attribute__((noinline))
2602static void
ea3f0419
A
2603lck_mtx_unlock_wakeup_tail(
2604 lck_mtx_t *mutex,
cb323159 2605 uint32_t state,
ea3f0419 2606 boolean_t indirect)
b0d623f7 2607{
cb323159 2608 struct turnstile *ts;
6d2010ae 2609
ea3f0419 2610 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
cb323159 2611 kern_return_t did_wake;
6d2010ae
A
2612
2613 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
ea3f0419 2614 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2615
cb323159 2616 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
d9a64523 2617
cb323159
A
2618 if (mutex->lck_mtx_waiters > 1) {
2619 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */
2620 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2621 } else {
2622 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2623 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
6d2010ae 2624 }
cb323159 2625 assert(did_wake == KERN_SUCCESS);
b0d623f7 2626
cb323159
A
2627 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2628 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
b0d623f7 2629
cb323159 2630 state -= LCK_MTX_WAITER;
ea3f0419 2631 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
cb323159 2632 ordered_store_mtx_state_release(mutex, state);
b0d623f7 2633
cb323159 2634 assert(current_thread()->turnstile != NULL);
b0d623f7 2635
cb323159 2636 turnstile_cleanup();
d9a64523 2637
6d2010ae 2638 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
ea3f0419 2639 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2640
d9a64523
A
2641 lck_mtx_unlock_finish_inline(mutex, indirect);
2642}
b0d623f7
A
2643
2644/*
ea3f0419 2645 * Routine: lck_mtx_lock_acquire_x86
b0d623f7
A
2646 *
2647 * Invoked on acquiring the mutex when there is
6d2010ae 2648 * contention (i.e. the assembly routine sees that
cb323159 2649 * mutex->lck_mtx_waiters != 0)
6d2010ae
A
2650 *
2651 * mutex is owned... interlock is held... preemption is disabled
b0d623f7 2652 */
d9a64523
A
2653__attribute__((always_inline))
2654static void
2655lck_mtx_lock_acquire_inline(
ea3f0419 2656 lck_mtx_t *mutex,
cb323159 2657 struct turnstile *ts)
b0d623f7 2658{
ea3f0419 2659 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
b0d623f7 2660
6d2010ae 2661 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
ea3f0419 2662 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2663
d9a64523 2664 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
cb323159 2665 assert(thread->waiting_for_mutex == NULL);
b0d623f7 2666
cb323159
A
2667 if (mutex->lck_mtx_waiters > 0) {
2668 if (ts == NULL) {
2669 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
0a7de745 2670 }
d9a64523 2671
cb323159
A
2672 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2673 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2674 }
d9a64523 2675
cb323159
A
2676 if (ts != NULL) {
2677 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2678 }
d9a64523 2679
cb323159 2680 assert(current_thread()->turnstile != NULL);
d9a64523 2681
6d2010ae 2682 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
ea3f0419 2683 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7
A
2684}
2685
d9a64523
A
2686void
2687lck_mtx_lock_acquire_x86(
ea3f0419 2688 lck_mtx_t *mutex)
d9a64523 2689{
cb323159 2690 return lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2691}
2692
2693/*
2694 * Tail call helpers for lock functions that perform
2695 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2696 * the caller's compiled code.
2697 */
b0d623f7 2698
d9a64523
A
2699__attribute__((noinline))
2700static void
2701lck_mtx_lock_acquire_tail(
ea3f0419
A
2702 lck_mtx_t *mutex,
2703 boolean_t indirect,
cb323159 2704 struct turnstile *ts)
d9a64523 2705{
cb323159
A
2706 lck_mtx_lock_acquire_inline(mutex, ts);
2707 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
d9a64523
A
2708}
2709
2710__attribute__((noinline))
2711static boolean_t
2712lck_mtx_try_lock_acquire_tail(
ea3f0419 2713 lck_mtx_t *mutex)
d9a64523 2714{
cb323159 2715 lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2716 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2717
2718 return TRUE;
2719}
2720
2721__attribute__((noinline))
2722static void
2723lck_mtx_convert_spin_acquire_tail(
ea3f0419 2724 lck_mtx_t *mutex)
d9a64523 2725{
cb323159 2726 lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2727 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2728}
2729
2730boolean_t
2731lck_mtx_ilk_unlock(
2732 lck_mtx_t *mutex)
2733{
2734 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2735 return TRUE;
2736}
2737
2738static inline void
2739lck_mtx_interlock_lock_set_and_clear_flags(
2740 lck_mtx_t *mutex,
2741 uint32_t xor_flags,
2742 uint32_t and_flags,
2743 uint32_t *new_state)
3e170ce0 2744{
d9a64523
A
2745 uint32_t state, prev;
2746 state = *new_state;
2747
ea3f0419 2748 for (;;) {
d9a64523
A
2749 /* have to wait for interlock to clear */
2750 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2751 cpu_pause();
2752 state = ordered_load_mtx_state(mutex);
2753 }
2754 prev = state; /* prev contains snapshot for exchange */
2755 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
ea3f0419 2756 state &= ~and_flags; /* clear flags */
d9a64523
A
2757
2758 disable_preemption();
ea3f0419 2759 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
d9a64523 2760 break;
ea3f0419 2761 }
d9a64523
A
2762 enable_preemption();
2763 cpu_pause();
2764 state = ordered_load_mtx_state(mutex);
2765 }
2766 *new_state = state;
2767 return;
2768}
2769
2770static inline void
2771lck_mtx_interlock_lock_clear_flags(
2772 lck_mtx_t *mutex,
2773 uint32_t and_flags,
2774 uint32_t *new_state)
2775{
2776 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2777}
2778
2779static inline void
2780lck_mtx_interlock_lock(
2781 lck_mtx_t *mutex,
2782 uint32_t *new_state)
2783{
2784 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2785}
2786
2787static inline int
2788lck_mtx_interlock_try_lock_set_flags(
2789 lck_mtx_t *mutex,
2790 uint32_t or_flags,
2791 uint32_t *new_state)
2792{
2793 uint32_t state, prev;
2794 state = *new_state;
2795
2796 /* have to wait for interlock to clear */
2797 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2798 return 0;
2799 }
ea3f0419
A
2800 prev = state; /* prev contains snapshot for exchange */
2801 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
d9a64523 2802 disable_preemption();
cb323159 2803 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
ea3f0419
A
2804 *new_state = state;
2805 return 1;
d9a64523
A
2806 }
2807
2808 enable_preemption();
2809 return 0;
2810}
2811
d9a64523
A
2812__attribute__((noinline))
2813static void
2814lck_mtx_lock_contended(
2815 lck_mtx_t *lock,
2816 boolean_t indirect,
2817 boolean_t *first_miss)
2818{
2819 lck_mtx_spinwait_ret_type_t ret;
2820 uint32_t state;
2821 thread_t thread;
cb323159 2822 struct turnstile *ts = NULL;
d9a64523
A
2823
2824try_again:
2825
2826 if (indirect) {
0a7de745 2827 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523
A
2828 }
2829
2830 ret = lck_mtx_lock_spinwait_x86(lock);
2831 state = ordered_load_mtx_state(lock);
2832 switch (ret) {
2833 case LCK_MTX_SPINWAIT_NO_SPIN:
2834 /*
2835 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2836 * try to spin.
2837 */
2838 if (indirect) {
0a7de745 2839 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
d9a64523
A
2840 }
2841
f427ee49
A
2842 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
2843 OS_FALLTHROUGH;
ea3f0419
A
2844 case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
2845 case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
2846 case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
2847 case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
d9a64523
A
2848 /*
2849 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2850 * interlock not held
2851 */
2852 lck_mtx_interlock_lock(lock, &state);
2853 assert(state & LCK_MTX_ILOCKED_MSK);
2854
2855 if (state & LCK_MTX_MLOCKED_MSK) {
2856 if (indirect) {
0a7de745 2857 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523 2858 }
cb323159 2859 lck_mtx_lock_wait_x86(lock, &ts);
d9a64523
A
2860 /*
2861 * interlock is not held here.
2862 */
2863 goto try_again;
2864 } else {
d9a64523
A
2865 /* grab the mutex */
2866 state |= LCK_MTX_MLOCKED_MSK;
2867 ordered_store_mtx_state_release(lock, state);
2868 thread = current_thread();
2869 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2870#if MACH_LDEBUG
2871 if (thread) {
2872 thread->mutex_count++;
2873 }
2874#endif /* MACH_LDEBUG */
2875 }
2876
2877 break;
2878 case LCK_MTX_SPINWAIT_ACQUIRED:
2879 /*
2880 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2881 * interlock is held and preemption disabled
2882 * owner is set and mutex marked as locked
2883 * statistics updated too
2884 */
2885 break;
2886 default:
2887 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2888 }
2889
2890 /*
2891 * interlock is already acquired here
2892 */
2893
2894 /* mutex has been acquired */
2895 thread = (thread_t)lock->lck_mtx_owner;
cb323159
A
2896 if (state & LCK_MTX_WAITERS_MSK) {
2897 /*
2898 * lck_mtx_lock_acquire_tail will call
2899 * turnstile_complete.
2900 */
2901 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
d9a64523
A
2902 }
2903
cb323159
A
2904 if (ts != NULL) {
2905 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2906 }
2907
2908 assert(current_thread()->turnstile != NULL);
2909
d9a64523 2910 /* release the interlock */
cb323159 2911 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
d9a64523
A
2912}
2913
2914/*
2915 * Helper noinline functions for calling
2916 * panic to optimize compiled code.
2917 */
2918
cb323159 2919__attribute__((noinline)) __abortlike
d9a64523
A
2920static void
2921lck_mtx_destroyed(
2922 lck_mtx_t *lock)
2923{
2924 panic("trying to interlock destroyed mutex (%p)", lock);
2925}
2926
2927__attribute__((noinline))
2928static boolean_t
2929lck_mtx_try_destroyed(
2930 lck_mtx_t *lock)
2931{
2932 panic("trying to interlock destroyed mutex (%p)", lock);
2933 return FALSE;
2934}
2935
2936__attribute__((always_inline))
2937static boolean_t
2938lck_mtx_lock_wait_interlock_to_clear(
2939 lck_mtx_t *lock,
2940 uint32_t* new_state)
2941{
2942 uint32_t state;
2943
ea3f0419 2944 for (;;) {
d9a64523
A
2945 cpu_pause();
2946 state = ordered_load_mtx_state(lock);
2947 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2948 *new_state = state;
2949 return TRUE;
2950 }
2951 if (state & LCK_MTX_MLOCKED_MSK) {
2952 /* if it is held as mutex, just fail */
2953 return FALSE;
2954 }
2955 }
2956}
2957
2958__attribute__((always_inline))
2959static boolean_t
2960lck_mtx_try_lock_wait_interlock_to_clear(
2961 lck_mtx_t *lock,
2962 uint32_t* new_state)
2963{
2964 uint32_t state;
2965
ea3f0419 2966 for (;;) {
d9a64523
A
2967 cpu_pause();
2968 state = ordered_load_mtx_state(lock);
2969 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2970 /* if it is held as mutex or spin, just fail */
2971 return FALSE;
2972 }
2973 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2974 *new_state = state;
2975 return TRUE;
2976 }
2977 }
2978}
2979
2980/*
2981 * Routine: lck_mtx_lock_slow
2982 *
2983 * Locks a mutex for current thread.
2984 * If the lock is contended this function might
2985 * sleep.
2986 *
2987 * Called with interlock not held.
2988 */
2989__attribute__((noinline))
2990void
2991lck_mtx_lock_slow(
2992 lck_mtx_t *lock)
2993{
ea3f0419
A
2994 boolean_t indirect = FALSE;
2995 uint32_t state;
2996 int first_miss = 0;
d9a64523
A
2997
2998 state = ordered_load_mtx_state(lock);
2999
3000 /* is the interlock or mutex held */
3001 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3002 /*
3003 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3004 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3005 * set in state (state == lck_mtx_tag)
3006 */
3007
3008
3009 /* is the mutex already held and not indirect */
ea3f0419 3010 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3011 /* no, must have been the mutex */
3012 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3013 }
3014
3015 /* check to see if it is marked destroyed */
3016 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3017 lck_mtx_destroyed(lock);
d9a64523
A
3018 }
3019
3020 /* Is this an indirect mutex? */
3021 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3022 indirect = get_indirect_mutex(&lock, &state);
3023
3024 first_miss = 0;
0a7de745 3025 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3026
3027 if (state & LCK_MTX_SPIN_MSK) {
ea3f0419 3028 /* LCK_MTX_SPIN_MSK was set, so LCK_MTX_ILOCKED_MSK must also be present */
d9a64523 3029 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 3030 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
3031 }
3032 }
3033
3034 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3035 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3036 }
3037 }
3038
3039 /* no - can't be INDIRECT, DESTROYED or locked */
3040 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3041 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3042 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3043 }
3044 }
3045
3046 /* lock and interlock acquired */
3047
3048 thread_t thread = current_thread();
3049 /* record owner of mutex */
3050 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3051
3052#if MACH_LDEBUG
3053 if (thread) {
ea3f0419 3054 thread->mutex_count++; /* lock statistic */
d9a64523
A
3055 }
3056#endif
3057 /*
3058 * Check if there are waiters to
3059 * inherit their priority.
3060 */
3061 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
cb323159 3062 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
d9a64523
A
3063 }
3064
3065 /* release the interlock */
3066 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
3067
3068 return;
3069}
3070
3071__attribute__((noinline))
3072boolean_t
3073lck_mtx_try_lock_slow(
3074 lck_mtx_t *lock)
3075{
3076 boolean_t indirect = FALSE;
3077 uint32_t state;
3078 int first_miss = 0;
3079
3080 state = ordered_load_mtx_state(lock);
3081
3082 /* is the interlock or mutex held */
3083 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3084 /*
3085 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3086 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3087 * set in state (state == lck_mtx_tag)
3088 */
3089
3090 /* is the mutex already held and not indirect */
ea3f0419 3091 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3092 return FALSE;
3093 }
3094
3095 /* check to see if it is marked destroyed */
3096 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3097 lck_mtx_try_destroyed(lock);
d9a64523
A
3098 }
3099
3100 /* Is this an indirect mutex? */
3101 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3102 indirect = get_indirect_mutex(&lock, &state);
3103
3104 first_miss = 0;
0a7de745 3105 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3106 }
3107
3108 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3109 if (indirect) {
0a7de745 3110 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3111 }
d9a64523
A
3112 return FALSE;
3113 }
3114 }
3115
3116 /* no - can't be INDIRECT, DESTROYED or locked */
3117 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3118 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3119 if (indirect) {
0a7de745 3120 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3121 }
d9a64523
A
3122 return FALSE;
3123 }
3124 }
3125
3126 /* lock and interlock acquired */
3127
3128 thread_t thread = current_thread();
3129 /* record owner of mutex */
3130 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3131
3132#if MACH_LDEBUG
3133 if (thread) {
ea3f0419 3134 thread->mutex_count++; /* lock statistic */
d9a64523
A
3135 }
3136#endif
3137 /*
3138 * Check if there are waiters to
3139 * inherit their priority.
3140 */
3141 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3142 return lck_mtx_try_lock_acquire_tail(lock);
3143 }
3144
3145 /* release the interlock */
3146 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3147
3148 return TRUE;
d9a64523
A
3149}
3150
3151__attribute__((noinline))
3152void
3153lck_mtx_lock_spin_slow(
ea3f0419 3154 lck_mtx_t *lock)
d9a64523
A
3155{
3156 boolean_t indirect = FALSE;
3157 uint32_t state;
3158 int first_miss = 0;
3159
3160 state = ordered_load_mtx_state(lock);
3161
3162 /* is the interlock or mutex held */
3163 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3164 /*
3165 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3166 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3167 * set in state (state == lck_mtx_tag)
3168 */
3169
3170
3171 /* is the mutex already held and not indirect */
ea3f0419 3172 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3173 /* no, must have been the mutex */
3174 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3175 }
3176
3177 /* check to see if it is marked destroyed */
3178 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3179 lck_mtx_destroyed(lock);
d9a64523
A
3180 }
3181
3182 /* Is this an indirect mutex? */
3183 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3184 indirect = get_indirect_mutex(&lock, &state);
3185
3186 first_miss = 0;
0a7de745 3187 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3188
3189 if (state & LCK_MTX_SPIN_MSK) {
ea3f0419 3190 /* LCK_MTX_SPIN_MSK was set, so LCK_MTX_ILOCKED_MSK must also be present */
d9a64523 3191 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 3192 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
3193 }
3194 }
3195
3196 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3197 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3198 }
3199 }
3200
3201 /* no - can't be INDIRECT, DESTROYED or locked */
ea3f0419 3202 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
d9a64523
A
3203 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3204 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3205 }
3206 }
3207
3208 /* lock as spinlock and interlock acquired */
3209
3210 thread_t thread = current_thread();
3211 /* record owner of mutex */
3212 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3213
3214#if MACH_LDEBUG
3215 if (thread) {
3216 thread->mutex_count++; /* lock statistic */
3217 }
3218#endif
3219
ea3f0419 3220#if CONFIG_DTRACE
d9a64523
A
3221 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3222#endif
3223 /* return with the interlock held and preemption disabled */
3224 return;
3225}
3226
3227__attribute__((noinline))
3228boolean_t
3229lck_mtx_try_lock_spin_slow(
3230 lck_mtx_t *lock)
3231{
3232 boolean_t indirect = FALSE;
3233 uint32_t state;
3234 int first_miss = 0;
3235
3236 state = ordered_load_mtx_state(lock);
3237
3238 /* is the interlock or mutex held */
3239 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3240 /*
3241 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3242 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3243 * set in state (state == lck_mtx_tag)
3244 */
3245
3246 /* is the mutex already held and not indirect */
ea3f0419 3247 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3248 return FALSE;
3249 }
3250
3251 /* check to see if it is marked destroyed */
3252 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3253 lck_mtx_try_destroyed(lock);
d9a64523
A
3254 }
3255
3256 /* Is this an indirect mutex? */
3257 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3258 indirect = get_indirect_mutex(&lock, &state);
3259
3260 first_miss = 0;
0a7de745 3261 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3262 }
3263
3264 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3265 if (indirect) {
0a7de745 3266 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3267 }
d9a64523
A
3268 return FALSE;
3269 }
3270 }
3271
3272 /* no - can't be INDIRECT, DESTROYED or locked */
3273 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3274 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3275 if (indirect) {
0a7de745 3276 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3277 }
d9a64523
A
3278 return FALSE;
3279 }
3280 }
3281
3282 /* lock and interlock acquired */
3283
3284 thread_t thread = current_thread();
3285 /* record owner of mutex */
3286 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3287
3288#if MACH_LDEBUG
3289 if (thread) {
ea3f0419 3290 thread->mutex_count++; /* lock statistic */
d9a64523
A
3291 }
3292#endif
3293
3294#if CONFIG_DTRACE
3295 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3296#endif
3297 return TRUE;
d9a64523
A
3298}
3299
3300__attribute__((noinline))
3301void
3302lck_mtx_convert_spin(
ea3f0419 3303 lck_mtx_t *lock)
d9a64523
A
3304{
3305 uint32_t state;
3306
3307 state = ordered_load_mtx_state(lock);
3308
3309 /* Is this an indirect mutex? */
3310 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3311 /* If so, take indirection */
3312 get_indirect_mutex(&lock, &state);
3313 }
3314
3315 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3316
3317 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3318 /* already owned as a mutex, just return */
3319 return;
3320 }
3321
3322 assert(get_preemption_level() > 0);
3323 assert(state & LCK_MTX_ILOCKED_MSK);
3324 assert(state & LCK_MTX_SPIN_MSK);
3325
3326 /*
3327 * Check if there are waiters to
3328 * inherit their priority.
3329 */
3330 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3331 return lck_mtx_convert_spin_acquire_tail(lock);
3332 }
3333
3334 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3335
3336 return;
3337}
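
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * XNU source) of the spin-then-convert pattern that lck_mtx_convert_spin()
 * above supports: take the mutex in spin mode for a short critical section
 * and convert it to a full mutex only when the code discovers it must do
 * something that can block.  device_softc_t and its fields are
 * hypothetical; its lock is assumed to have been set up with lck_mtx_init().
 */
typedef struct {
	lck_mtx_t       lock;
	boolean_t       needs_slow_work;
} device_softc_t;

static void
device_poll(device_softc_t *sc)
{
	lck_mtx_lock_spin(&sc->lock);            /* cheap: interlock + preemption off */

	if (sc->needs_slow_work) {
		lck_mtx_convert_spin(&sc->lock); /* now safe to block while holding it */
		/* ... work that may sleep, under the full mutex ... */
	}

	lck_mtx_unlock(&sc->lock);               /* works for either mode */
}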
3338
3339static inline boolean_t
3340lck_mtx_lock_grab_mutex(
ea3f0419 3341 lck_mtx_t *lock)
d9a64523
A
3342{
3343 uint32_t state;
3344
3345 state = ordered_load_mtx_state(lock);
3346
3347 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3348 return FALSE;
3349 }
3350
3351 /* lock and interlock acquired */
3352
3353 thread_t thread = current_thread();
3354 /* record owner of mutex */
3355 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3356
3357#if MACH_LDEBUG
3358 if (thread) {
ea3f0419 3359 thread->mutex_count++; /* lock statistic */
d9a64523
A
3360 }
3361#endif
3362 return TRUE;
3363}
3364
3365__attribute__((noinline))
3366void
3367lck_mtx_assert(
ea3f0419
A
3368 lck_mtx_t *lock,
3369 unsigned int type)
d9a64523
A
3370{
3371 thread_t thread, owner;
3372 uint32_t state;
3373
3374 thread = current_thread();
3375 state = ordered_load_mtx_state(lock);
3376
3377 if (state == LCK_MTX_TAG_INDIRECT) {
3378 get_indirect_mutex(&lock, &state);
3379 }
3380
3381 owner = (thread_t)lock->lck_mtx_owner;
3382
3383 if (type == LCK_MTX_ASSERT_OWNED) {
ea3f0419 3384 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
d9a64523 3385 panic("mutex (%p) not owned\n", lock);
ea3f0419 3386 }
d9a64523 3387 } else {
ea3f0419
A
3388 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3389 if (owner == thread) {
d9a64523 3390 panic("mutex (%p) owned\n", lock);
ea3f0419 3391 }
d9a64523
A
3392 }
3393}
b0d623f7 3394
91447636 3395/*
ea3f0419 3396 * Routine: lck_mtx_lock_spinwait_x86
0c530ab8
A
3397 *
3398 * Invoked trying to acquire a mutex when there is contention but
3399 * the holder is running on another processor. We spin for up to a maximum
3400 * time waiting for the lock to be released.
3401 *
3402 * Called with the interlock unlocked.
d9a64523
A
3403 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3404 * returns LCK_MTX_SPINWAIT_SPUN if we spun
3405 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
0c530ab8 3406 */
d9a64523
A
3407__attribute__((noinline))
3408lck_mtx_spinwait_ret_type_t
b0d623f7 3409lck_mtx_lock_spinwait_x86(
ea3f0419 3410 lck_mtx_t *mutex)
0c530ab8 3411{
ea3f0419
A
3412 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3413 thread_t owner, prev_owner;
3414 uint64_t window_deadline, sliding_deadline, high_deadline;
3415 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
3416 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3417 int loopcount = 0;
3418 int total_hold_time_samples, window_hold_time_samples, unfairness;
3419 uint i, prev_owner_cpu;
3420 bool owner_on_core, adjust;
0c530ab8 3421
6d2010ae 3422 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
ea3f0419 3423 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
0c530ab8 3424
ea3f0419
A
3425 start_time = mach_absolute_time();
3426 /*
3427 * window_deadline represents the "learning" phase.
3428 * The thread collects statistics about the lock during
3429 * window_deadline and then it makes a decision on whether to spin more
3430 * or block according to the concurrency behavior
3431 * observed.
3432 *
3433 * Every thread can spin at least low_MutexSpin.
3434 */
3435 window_deadline = start_time + low_MutexSpin;
3436 /*
3437 * Sliding_deadline is the adjusted spin deadline
3438 * computed after the "learning" phase.
3439 */
3440 sliding_deadline = window_deadline;
3441 /*
3442 * High_deadline is a hard deadline. No thread
3443 * can spin more than this deadline.
3444 */
3445 if (high_MutexSpin >= 0) {
3446 high_deadline = start_time + high_MutexSpin;
3447 } else {
3448 high_deadline = start_time + low_MutexSpin * real_ncpus;
3449 }
b0d623f7 3450
ea3f0419
A
3451 /*
3452 * Do not know yet which is the owner cpu.
3453 * Initialize prev_owner_cpu with next cpu.
3454 */
3455 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
3456 total_hold_time_samples = 0;
3457 window_hold_time_samples = 0;
3458 avg_hold_time = 0;
3459 adjust = TRUE;
3460 bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
3461
3462 prev_owner = (thread_t) mutex->lck_mtx_owner;
0c530ab8
A
3463 /*
3464 * Spin while:
3465 * - mutex is locked, and
ea3f0419 3466 * - it's locked as a spin lock, and
0c530ab8 3467 * - owner is running on another processor, and
0c530ab8
A
3468 * - we haven't spun for long enough.
3469 */
b0d623f7 3470 do {
ea3f0419
A
3471 /*
3472 * Try to acquire the lock.
3473 */
6d2010ae 3474 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
d9a64523 3475 retval = LCK_MTX_SPINWAIT_ACQUIRED;
b0d623f7 3476 break;
2d21ac55 3477 }
ea3f0419 3478
3e170ce0 3479 cur_time = mach_absolute_time();
b0d623f7 3480
ea3f0419
A
3481 /*
3482 * Never spin past high_deadline.
3483 */
3484 if (cur_time >= high_deadline) {
3485 retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3e170ce0 3486 break;
ea3f0419 3487 }
3e170ce0 3488
ea3f0419
A
3489 /*
3490 * Check if owner is on core. If not block.
3491 */
3492 owner = (thread_t) mutex->lck_mtx_owner;
3493 if (owner) {
3494 i = prev_owner_cpu;
3495 owner_on_core = FALSE;
3496
3497 disable_preemption();
3498 owner = (thread_t) mutex->lck_mtx_owner;
3e170ce0 3499
d9a64523 3500 /*
ea3f0419
A
3501 * For scalability we want to check if the owner is on core
3502 * without locking the mutex interlock.
3503 * If we do not lock the mutex interlock, the owner that we see might be
3504 * invalid, so we cannot dereference it. Therefore we cannot check
3505 * any field of the thread to tell us if it is on core.
3506 * Check if the thread that is running on the other cpus matches the owner.
3507 */
3508 if (owner) {
3509 do {
3510 if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
3511 owner_on_core = TRUE;
3512 break;
3513 }
3514 if (++i >= real_ncpus) {
3515 i = 0;
3516 }
3517 } while (i != prev_owner_cpu);
3518 enable_preemption();
3519
3520 if (owner_on_core) {
3521 prev_owner_cpu = i;
3522 } else {
3523 prev_owner = owner;
3524 owner = (thread_t) mutex->lck_mtx_owner;
3525 if (owner == prev_owner) {
3526 /*
3527 * Owner is not on core.
3528 * Stop spinning.
3529 */
3530 if (loopcount == 0) {
d9a64523 3531 retval = LCK_MTX_SPINWAIT_NO_SPIN;
ea3f0419
A
3532 } else {
3533 retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
3534 }
3e170ce0
A
3535 break;
3536 }
ea3f0419
A
3537 /*
3538 * Fall through if the owner changed while we were scanning.
3539 * The new owner could potentially be on core, so loop
3540 * again.
3541 */
3e170ce0 3542 }
ea3f0419
A
3543 } else {
3544 enable_preemption();
3545 }
3546 }
3e170ce0 3547
ea3f0419
A
3548 /*
3549 * Record how many times we see the owner change.
3550 * From that we can roughly estimate the mutex hold
3551 * time and the fairness of the lock.
3552 */
3553 if (owner != prev_owner) {
3554 prev_owner = owner;
3555 total_hold_time_samples++;
3556 window_hold_time_samples++;
3557 }
3558
3559 /*
3560 * Learning window expired.
3561 * Try to adjust the sliding_deadline.
3562 */
3563 if (cur_time >= window_deadline) {
3564 /*
3565 * If there was no contention during the window,
3566 * stop spinning.
3567 */
3568 if (window_hold_time_samples < 1) {
3569 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
3570 break;
b0d623f7 3571 }
ea3f0419
A
3572
3573 if (adjust) {
3574 /*
3575 * For a fair lock, we'd wait for at most (NCPU-1) periods,
3576 * but the lock is unfair, so let's try to estimate by how much.
3577 */
3578 unfairness = total_hold_time_samples / real_ncpus;
3579
3580 if (unfairness == 0) {
3581 /*
3582 * We observed the owner changing `total_hold_time_samples` times, which
3583 * lets us estimate the average hold time of this mutex for the duration
3584 * of the spin time.
3585 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
3586 *
3587 * In this case spin for at most avg_hold_time * (real_ncpus - 1).
3588 */
3589 delta = cur_time - start_time;
3590 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
3591 } else {
3592 /*
3593 * In this case at least one of the other cpus was able to get the lock twice
3594 * while I was spinning.
3595 * We could spin longer but it won't necessarily help if the system is unfair.
3596 * Try to randomize the wait to reduce contention.
3597 *
3598 * We compute how much time we could potentially spin
3599 * and distribute it over the cpus.
3600 *
3601 * bias is an integer between 0 and real_ncpus - 1.
3602 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
3603 */
3604 delta = high_deadline - cur_time;
3605 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
3606 adjust = FALSE;
3607 }
3608 }
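 /*
  * Worked example for the two cases above (numbers are assumed and
  * purely illustrative). With real_ncpus = 4, start_time = 1000 and
  * cur_time = 1100:
  *   - unfairness == 0, total_hold_time_samples = 2:
  *       delta = 100, sliding_deadline = 1000 + (100 * 3) / 2 = 1150,
  *       i.e. roughly avg_hold_time * (real_ncpus - 1) past start_time.
  *   - unfairness > 0, high_deadline = 1400, bias = 1:
  *       delta = 300, sliding_deadline = 1100 + (300 * 1) / 4 = 1175,
  *       so threads with different bias values give up at staggered times.
  */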
3609
3610 window_deadline += low_MutexSpin;
3611 window_hold_time_samples = 0;
b0d623f7 3612 }
b0d623f7 3613
ea3f0419
A
3614 /*
3615 * Stop spinning if we are past
3616 * the adjusted deadline.
3617 */
3618 if (cur_time >= sliding_deadline) {
3619 retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
3620 break;
3621 }
3622
3623 if ((thread_t) mutex->lck_mtx_owner != NULL) {
3624 cpu_pause();
3625 }
cb323159 3626
ea3f0419 3627 loopcount++;
3e170ce0 3628 } while (TRUE);
b0d623f7 3629
ea3f0419 3630#if CONFIG_DTRACE
2d21ac55 3631 /*
2d21ac55 3632 * Note that we record a different probe id depending on whether
ea3f0419 3633 * this is a direct or indirect mutex. This allows us to
2d21ac55
A
3634 * penalize only lock groups that have debug/stats enabled
3635 * with dtrace processing if desired.
3636 */
6d2010ae 3637 if (__probable(mutex->lck_mtx_is_ext == 0)) {
b0d623f7 3638 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
ea3f0419 3639 mach_absolute_time() - start_time);
2d21ac55 3640 } else {
b0d623f7 3641 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
ea3f0419 3642 mach_absolute_time() - start_time);
2d21ac55
A
3643 }
3644 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3645#endif
b0d623f7 3646
6d2010ae 3647 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
ea3f0419 3648 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
b0d623f7
A
3649
3650 return retval;
0c530ab8
A
3651}
3652
b0d623f7
A
3653
3654
0c530ab8 3655/*
ea3f0419 3656 * Routine: lck_mtx_lock_wait_x86
b0d623f7
A
3657 *
3658 * Invoked in order to wait on contention.
3659 *
3660 * Called with the interlock locked and
d9a64523 3661 * preemption disabled...
6d2010ae 3662 * returns it unlocked and with preemption enabled
d9a64523
A
3663 *
3664 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3665 * A runnable waiter can exist between wait and acquire
3666 * without a waiters count being set.
3667 * This allows us to never make a spurious wakeup call.
3668 *
3669 * Priority:
3670 * This avoids taking the thread lock if the owning thread is the same priority.
3671 * This optimizes the case of same-priority threads contending on a lock.
3672 * However, that allows the owning thread to drop in priority while holding the lock,
3673 * because there is no state that the priority change can notice that
3674 * says that the targeted thread holds a contended mutex.
3675 *
3676 * One possible solution: priority changes could look for some atomic tag
3677 * on the thread saying 'holding contended lock', and then set up a promotion.
3678 * Needs a story for dropping that promotion - the last contended unlock
3679 * has to notice that this has happened.
0c530ab8 3680 */
d9a64523 3681__attribute__((noinline))
0c530ab8 3682void
ea3f0419
A
3683lck_mtx_lock_wait_x86(
3684 lck_mtx_t *mutex,
cb323159 3685 struct turnstile **ts)
0c530ab8 3686{
cb323159
A
3687 thread_t self = current_thread();
3688
ea3f0419 3689#if CONFIG_DTRACE
d9a64523 3690 uint64_t sleep_start = 0;
b0d623f7
A
3691
3692 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3693 sleep_start = mach_absolute_time();
3694 }
3695#endif
d9a64523
A
3696 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3697
6d2010ae 3698 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
ea3f0419
A
3699 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3700 mutex->lck_mtx_waiters, 0, 0);
b0d623f7 3701
cb323159
A
3702 assert(self->waiting_for_mutex == NULL);
3703 self->waiting_for_mutex = mutex;
3704 mutex->lck_mtx_waiters++;
39236c6e 3705
d9a64523 3706 thread_t holder = (thread_t)mutex->lck_mtx_owner;
d9a64523
A
3707 assert(holder != NULL);
3708
3709 /*
cb323159
A
3710 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and reuse
3711 * the same turnstile while looping; the matching turnstile_complete will be called
3712 * by lck_mtx_lock_contended when the lock is finally acquired.
d9a64523 3713 */
cb323159
A
3714 if (*ts == NULL) {
3715 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
b0d623f7 3716 }
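 /*
  * Caller-side shape of that contract (an illustrative sketch, not the
  * actual lck_mtx_lock_contended implementation; the helper names below
  * are placeholders):
  *
  *   struct turnstile *ts = NULL;
  *   for (;;) {
  *       take_mutex_interlock(mutex);          // placeholder: interlock held, preemption disabled
  *       if (try_to_grab_mutex(mutex)) {       // placeholder: acquire path
  *           break;
  *       }
  *       lck_mtx_lock_wait_x86(mutex, &ts);    // first call prepares ts, later calls reuse it
  *   }
  *   if (ts != NULL) {
  *       // the single matching turnstile completion happens here
  *   }
  */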
d9a64523 3717
cb323159 3718 struct turnstile *turnstile = *ts;
813fb2f6 3719 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
cb323159
A
3720 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3721
3722 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
b0d623f7
A
3723
3724 lck_mtx_ilk_unlock(mutex);
3725
cb323159
A
3726 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3727
b0d623f7
A
3728 thread_block(THREAD_CONTINUE_NULL);
3729
d9a64523
A
3730 self->waiting_for_mutex = NULL;
3731
6d2010ae 3732 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
ea3f0419
A
3733 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3734 mutex->lck_mtx_waiters, 0, 0);
b0d623f7 3735
ea3f0419 3736#if CONFIG_DTRACE
b0d623f7
A
3737 /*
3738 * Record the Dtrace lockstat probe for blocking, block time
3739 * measured from when we were entered.
3740 */
3741 if (sleep_start) {
6d2010ae 3742 if (mutex->lck_mtx_is_ext == 0) {
b0d623f7
A
3743 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3744 mach_absolute_time() - sleep_start);
3745 } else {
3746 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3747 mach_absolute_time() - sleep_start);
3748 }
3749 }
3750#endif
0c530ab8 3751}
3e170ce0
A
3752
3753/*
3754 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3755 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3756 * Returns: TRUE if lock is acquired.
3757 */
3758boolean_t
ea3f0419 3759kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3e170ce0
A
3760{
3761 if (not_in_kdp) {
3762 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3763 }
3764
39037602 3765 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3e170ce0
A
3766 return TRUE;
3767 }
3768
3769 return FALSE;
3770}
3771
813fb2f6
A
3772void
3773kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3774{
3775 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3776 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3777 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3778 waitinfo->owner = thread_tid(holder);
3779}
3780
3781void
3782kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3783{
3784 lck_rw_t *rwlck = NULL;
ea3f0419
A
3785 switch (waitinfo->wait_type) {
3786 case kThreadWaitKernelRWLockRead:
3787 rwlck = READ_EVENT_TO_RWLOCK(event);
3788 break;
3789 case kThreadWaitKernelRWLockWrite:
3790 case kThreadWaitKernelRWLockUpgrade:
3791 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3792 break;
3793 default:
3794 panic("%s was called with an invalid blocking type", __FUNCTION__);
3795 break;
813fb2f6
A
3796 }
3797 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3798 waitinfo->owner = 0;
3799}