/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 * File: kern/lock.c
 * Author: Avadis Tevanian, Jr., Michael Wayne Young
 * Date: 1985
 *
 * Locking primitives implementation
 */

#define LOCK_PRIVATE 1

#include <mach_ldebug.h>

#include <kern/lock_stat.h>
#include <kern/locks.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <string.h>

#include <i386/machine_routines.h> /* machine_timeout_suspended() */
#include <machine/atomic.h>
#include <machine/machine_cpu.h>
#include <i386/mp.h>
#include <sys/kdebug.h>
#include <i386/locks_i386_inlines.h>
#include <os/hash.h>

#if CONFIG_DTRACE
#define DTRACE_RW_SHARED 0x0 //reader
#define DTRACE_RW_EXCL 0x1 //writer
#define DTRACE_NO_FLAG 0x0 //not applicable
#endif /* CONFIG_DTRACE */

#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
#define LCK_RW_LCK_SHARED_CODE 0x102
#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
#define LCK_RW_LCK_EX_TO_SH_CODE 0x105

#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113


#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

/* Forwards */

#if USLOCK_DEBUG
/*
 * Perform simple lock checks.
 */
int uslock_check = 1;
int max_lock_loops = 100000000;
decl_simple_lock_data(extern, printf_lock);
decl_simple_lock_data(extern, panic_lock);
#endif /* USLOCK_DEBUG */

extern unsigned int not_in_kdp;

/*
 * We often want to know the addresses of the callers
 * of the various lock routines. However, this information
 * is only used for debugging and statistics.
 */
typedef void *pc_t;
#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
#if ANY_LOCK_DEBUG
#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
#define DECL_PC(pc) pc_t pc;
#else /* ANY_LOCK_DEBUG */
#define DECL_PC(pc)
#ifdef lint
/*
 * Eliminate lint complaints about unused local pc variables.
 */
#define OBTAIN_PC(pc) ++pc
#else /* lint */
#define OBTAIN_PC(pc)
#endif /* lint */
#endif /* ANY_LOCK_DEBUG */

ZONE_VIEW_DEFINE(ZV_LCK_SPIN, "lck_spin",
    KHEAP_ID_DEFAULT, sizeof(lck_spin_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX, "lck_mtx",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_t));

ZONE_VIEW_DEFINE(ZV_LCK_MTX_EXT, "lck_mtx_ext",
    KHEAP_ID_DEFAULT, sizeof(lck_mtx_ext_t));

ZONE_VIEW_DEFINE(ZV_LCK_RW, "lck_rw",
    KHEAP_ID_DEFAULT, sizeof(lck_rw_t));

/*
 * atomic exchange API is a low level abstraction of the operations
 * to atomically read, modify, and write a pointer. This abstraction works
 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
 * well as the ARM exclusive instructions.
 *
 * atomic_exchange_begin() - begin exchange and retrieve current value
 * atomic_exchange_complete() - conclude an exchange
 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
 */
static uint32_t
atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
{
	uint32_t val;

	(void)ord;                      // Memory order not used
	val = os_atomic_load(target, relaxed);
	*previous = val;
	return val;
}

static boolean_t
atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
{
	return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
}

static void
atomic_exchange_abort(void)
{
}

static boolean_t
atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	uint32_t value, prev;

	for (;;) {
		value = atomic_exchange_begin32(target, &prev, ord);
		if (value & test_mask) {
			if (wait) {
				cpu_pause();
			} else {
				atomic_exchange_abort();
			}
			return FALSE;
		}
		value |= set_mask;
		if (atomic_exchange_complete32(target, prev, value, ord)) {
			return TRUE;
		}
	}
}

inline boolean_t
hw_atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
{
	return atomic_test_and_set32(target, test_mask, set_mask, ord, wait);
}

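/*
 * Illustrative sketch of the begin/complete/abort protocol above.
 * set_flag_once() is a hypothetical example, not part of this file:
 *
 *	static boolean_t
 *	set_flag_once(uint32_t *word, uint32_t flag)
 *	{
 *		uint32_t data, prev;
 *
 *		for (;;) {
 *			data = atomic_exchange_begin32(word, &prev, memory_order_relaxed);
 *			if (data & flag) {
 *				atomic_exchange_abort();	// nothing to change, cancel the exchange
 *				return FALSE;
 *			}
 *			data |= flag;
 *			if (atomic_exchange_complete32(word, prev, data, memory_order_relaxed)) {
 *				return TRUE;			// exchange committed
 *			}
 *			cpu_pause();				// lost the race, retry
 *		}
 *	}
 */
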
/*
 * Portable lock package implementation of usimple_locks.
 */

#if USLOCK_DEBUG
#define USLDBG(stmt) stmt
void usld_lock_init(usimple_lock_t, unsigned short);
void usld_lock_pre(usimple_lock_t, pc_t);
void usld_lock_post(usimple_lock_t, pc_t);
void usld_unlock(usimple_lock_t, pc_t);
void usld_lock_try_pre(usimple_lock_t, pc_t);
void usld_lock_try_post(usimple_lock_t, pc_t);
int usld_lock_common_checks(usimple_lock_t, char *);
#else /* USLOCK_DEBUG */
#define USLDBG(stmt)
#endif /* USLOCK_DEBUG */

/*
 * Forward definitions
 */

static void lck_rw_lock_shared_gen(lck_rw_t *lck);
static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
void lck_rw_clear_promotions_x86(thread_t thread);
static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
static boolean_t lck_rw_grab_want(lck_rw_t *lock);
static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, uint32_t state, boolean_t indirect);
static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);


/*
 * Routine: lck_spin_alloc_init
 */
lck_spin_t *
lck_spin_alloc_init(
	lck_grp_t *grp,
	lck_attr_t *attr)
{
	lck_spin_t *lck;

	lck = zalloc(ZV_LCK_SPIN);
	lck_spin_init(lck, grp, attr);
	return lck;
}

/*
 * Routine: lck_spin_free
 */
void
lck_spin_free(
	lck_spin_t *lck,
	lck_grp_t *grp)
{
	lck_spin_destroy(lck, grp);
	zfree(ZV_LCK_SPIN, lck);
}

/*
 * Routine: lck_spin_init
 */
void
lck_spin_init(
	lck_spin_t *lck,
	lck_grp_t *grp,
	__unused lck_attr_t *attr)
{
	usimple_lock_init((usimple_lock_t) lck, 0);
	if (grp) {
		lck_grp_reference(grp);
		lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
	}
}

/*
 * Routine: lck_spin_destroy
 */
void
lck_spin_destroy(
	lck_spin_t *lck,
	lck_grp_t *grp)
{
	if (lck->interlock == LCK_SPIN_TAG_DESTROYED) {
		return;
	}
	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	if (grp) {
		lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
		lck_grp_deallocate(grp);
	}
	return;
}

/*
 * Routine: lck_spin_lock
 */
void
lck_spin_lock_grp(
	lck_spin_t *lck,
	lck_grp_t *grp)
{
#pragma unused(grp)
	usimple_lock((usimple_lock_t) lck, grp);
}

void
lck_spin_lock(
	lck_spin_t *lck)
{
	usimple_lock((usimple_lock_t) lck, NULL);
}

/*
 * Routine: lck_spin_unlock
 */
void
lck_spin_unlock(
	lck_spin_t *lck)
{
	usimple_unlock((usimple_lock_t) lck);
}

boolean_t
lck_spin_try_lock_grp(
	lck_spin_t *lck,
	lck_grp_t *grp)
{
#pragma unused(grp)
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, grp);
#if DEVELOPMENT || DEBUG
	if (lrval) {
		pltrace(FALSE);
	}
#endif
	return lrval;
}


/*
 * Routine: lck_spin_try_lock
 */
boolean_t
lck_spin_try_lock(
	lck_spin_t *lck)
{
	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck, LCK_GRP_NULL);
#if DEVELOPMENT || DEBUG
	if (lrval) {
		pltrace(FALSE);
	}
#endif
	return lrval;
}

/*
 * Routine: lck_spin_assert
 */
void
lck_spin_assert(lck_spin_t *lock, unsigned int type)
{
	thread_t thread, holder;
	uintptr_t state;

	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
		panic("lck_spin_assert(): invalid arg (%u)", type);
	}

	state = lock->interlock;
	holder = (thread_t)state;
	thread = current_thread();
	if (type == LCK_ASSERT_OWNED) {
		if (__improbable(holder == THREAD_NULL)) {
			panic("Lock not owned %p = %lx", lock, state);
		}
		if (__improbable(holder != thread)) {
			panic("Lock not owned by current thread %p = %lx", lock, state);
		}
	} else if (type == LCK_ASSERT_NOTOWNED) {
		if (__improbable(holder != THREAD_NULL)) {
			if (holder == thread) {
				panic("Lock owned by current thread %p = %lx", lock, state);
			}
		}
	}
}

/*
 * Routine: kdp_lck_spin_is_acquired
 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
 * Returns: TRUE if lock is acquired.
 */
boolean_t
kdp_lck_spin_is_acquired(lck_spin_t *lck)
{
	if (not_in_kdp) {
		panic("panic: spinlock acquired check done outside of kernel debugger");
	}
	return (lck->interlock != 0)? TRUE : FALSE;
}

/*
 * Initialize a usimple_lock.
 *
 * No change in preemption state.
 */
void
usimple_lock_init(
	usimple_lock_t l,
	__unused unsigned short tag)
{
#ifndef MACHINE_SIMPLE_LOCK
	USLDBG(usld_lock_init(l, tag));
	hw_lock_init(&l->interlock);
#else
	simple_lock_init((simple_lock_t)l, tag);
#endif
}

volatile uint32_t spinlock_owner_cpu = ~0;
volatile usimple_lock_t spinlock_timed_out;

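/*
 * Scan the active CPUs for the one whose current thread matches
 * thread_addr (the apparent owner of a timed-out spinlock), record it
 * in spinlock_owner_cpu, and NMI that CPU so it panics with its own
 * backtrace.  Returns the owner's CPU number, or ~0 if no owner was found.
 */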
uint32_t
spinlock_timeout_NMI(uintptr_t thread_addr)
{
	uint32_t i;

	for (i = 0; i < real_ncpus; i++) {
		if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
			spinlock_owner_cpu = i;
			if ((uint32_t) cpu_number() != i) {
				/* Cause NMI and panic on the owner's cpu */
				NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
			}
			break;
		}
	}

	return spinlock_owner_cpu;
}

/*
 * Acquire a usimple_lock.
 *
 * Returns with preemption disabled. Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
(usimple_lock)(
	usimple_lock_t l
	LCK_GRP_ARG(lck_grp_t *grp))
{
#ifndef MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_pre(l, pc));

	if (__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC, grp) == 0)) {
		boolean_t uslock_acquired = FALSE;
		while (machine_timeout_suspended()) {
			enable_preemption();
			if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC, grp))) {
				break;
			}
		}

		if (uslock_acquired == FALSE) {
			uint32_t lock_cpu;
			uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
			spinlock_timed_out = l;
			lock_cpu = spinlock_timeout_NMI(lowner);
			panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
			    l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
		}
	}
#if DEVELOPMENT || DEBUG
	pltrace(FALSE);
#endif

	USLDBG(usld_lock_post(l, pc));
#else
	simple_lock((simple_lock_t)l, grp);
#endif
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0, (uintptr_t)LCK_GRP_PROBEARG(grp));
#endif
}


/*
 * Release a usimple_lock.
 *
 * Returns with preemption enabled. Note
 * that the hw_lock routines are responsible for
 * maintaining preemption state.
 */
void
usimple_unlock(
	usimple_lock_t l)
{
#ifndef MACHINE_SIMPLE_LOCK
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_unlock(l, pc));
#if DEVELOPMENT || DEBUG
	pltrace(TRUE);
#endif
	hw_lock_unlock(&l->interlock);
#else
	simple_unlock_rwmb((simple_lock_t)l);
#endif
}


/*
 * Conditionally acquire a usimple_lock.
 *
 * On success, returns with preemption disabled.
 * On failure, returns with preemption in the same state
 * as when first invoked. Note that the hw_lock routines
 * are responsible for maintaining preemption state.
 *
 * XXX No stats are gathered on a miss; I preserved this
 * behavior from the original assembly-language code, but
 * doesn't it make sense to log misses? XXX
 */
unsigned int
usimple_lock_try(
	usimple_lock_t l,
	lck_grp_t *grp)
{
#ifndef MACHINE_SIMPLE_LOCK
	unsigned int success;
	DECL_PC(pc);

	OBTAIN_PC(pc);
	USLDBG(usld_lock_try_pre(l, pc));
	if ((success = hw_lock_try(&l->interlock, grp))) {
#if DEVELOPMENT || DEBUG
		pltrace(FALSE);
#endif
		USLDBG(usld_lock_try_post(l, pc));
	}
	return success;
#else
	return simple_lock_try((simple_lock_t)l, grp);
#endif
}

/*
 * Acquire a usimple_lock while polling for pending cpu signals
 * and spinning on a lock.
 *
 */
unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_deadline)(usimple_lock_t l,
    uint64_t deadline
    LCK_GRP_ARG(lck_grp_t *grp))
{
	boolean_t istate = ml_get_interrupts_enabled();

	if (deadline < mach_absolute_time()) {
		return 0;
	}

	while (!simple_lock_try(l, grp)) {
		if (!istate) {
			cpu_signal_handler(NULL);
		}

		if (deadline < mach_absolute_time()) {
			return 0;
		}

		cpu_pause();
	}

	return 1;
}

void
(usimple_lock_try_lock_loop)(usimple_lock_t l
    LCK_GRP_ARG(lck_grp_t *grp))
{
	/* When the lock is not contended, grab the lock and go. */
	if (!simple_lock_try(l, grp)) {
		usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, ULLONG_MAX, grp);
	}
}

unsigned
int
(usimple_lock_try_lock_mp_signal_safe_loop_duration)(usimple_lock_t l,
    uint64_t duration
    LCK_GRP_ARG(lck_grp_t *grp))
{
	uint64_t deadline;
	uint64_t base_at;
	uint64_t duration_at;

	/* Fast track for uncontended locks */
	if (simple_lock_try(l, grp)) {
		return 1;
	}

	base_at = mach_absolute_time();

	nanoseconds_to_absolutetime(duration, &duration_at);
	deadline = base_at + duration_at;
	if (deadline < base_at) {
		/* deadline has overflowed, make it saturate */
		deadline = ULLONG_MAX;
	}

	return usimple_lock_try_lock_mp_signal_safe_loop_deadline(l, deadline, grp);
}

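/*
 * Illustrative sketch of how the bounded try-lock helpers above can be
 * used.  my_lock and MY_TIMEOUT_NS are hypothetical names, not part of
 * this file, and the lock-group argument depends on configuration:
 *
 *	decl_simple_lock_data(static, my_lock);
 *
 *	if (usimple_lock_try_lock_mp_signal_safe_loop_duration(&my_lock,
 *	    MY_TIMEOUT_NS, LCK_GRP_NULL)) {
 *		// ... critical section ...
 *		usimple_unlock(&my_lock);
 *	} else {
 *		// timed out without acquiring the lock
 *	}
 */
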
#if USLOCK_DEBUG
/*
 * States of a usimple_lock. The default when initializing
 * a usimple_lock is setting it up for debug checking.
 */
#define USLOCK_CHECKED 0x0001 /* lock is being checked */
#define USLOCK_TAKEN 0x0002 /* lock has been taken */
#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
#define USLOCK_CHECKING(l) (uslock_check && \
	((l)->debug.state & USLOCK_CHECKED))

/*
 * Initialize the debugging information contained
 * in a usimple_lock.
 */
void
usld_lock_init(
	usimple_lock_t l,
	__unused unsigned short tag)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("lock initialization: null lock pointer");
	}
	l->lock_type = USLOCK_TAG;
	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
	l->debug.duration[0] = l->debug.duration[1] = 0;
	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
}


/*
 * These checks apply to all usimple_locks, not just
 * those with USLOCK_CHECKED turned on.
 */
int
usld_lock_common_checks(
	usimple_lock_t l,
	char *caller)
{
	if (l == USIMPLE_LOCK_NULL) {
		panic("%s: null lock pointer", caller);
	}
	if (l->lock_type != USLOCK_TAG) {
		panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	}
	if (!(l->debug.state & USLOCK_INIT)) {
		panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	}
	return USLOCK_CHECKING(l);
}


/*
 * Debug checks on a usimple_lock just before attempting
 * to acquire it.
 */
/* ARGSUSED */
void
usld_lock_pre(
	usimple_lock_t l,
	pc_t pc)
{
	char caller[] = "usimple_lock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

/*
 * Note that we have a weird case where we are getting a lock when we are
 * in the process of putting the system to sleep. We are running with no
 * current threads, therefore we can't tell if we are trying to retake a lock
 * we have or someone on the other processor has it. Therefore we just
 * ignore this test if the locking thread is 0.
 */

	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	    l->debug.lock_thread == (void *) current_thread()) {
		printf("%s: lock %p already locked (at %p) by",
		    caller, l, l->debug.lock_pc);
		printf(" current thread %p (new attempt at pc %p)\n",
		    l->debug.lock_thread, pc);
		panic("%s", caller);
	}
	mp_disable_preemption();
	mp_enable_preemption();
}


/*
 * Debug checks on a usimple_lock just after acquiring it.
 *
 * Pre-emption has been disabled at this point,
 * so we are safe in using cpu_number.
 */
void
usld_lock_post(
	usimple_lock_t l,
	pc_t pc)
{
	unsigned int mycpu;
	char caller[] = "successful usimple_lock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock %p became uninitialized",
		    caller, l);
	}
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = (unsigned int)cpu_number();
	assert(mycpu <= UCHAR_MAX);

	l->debug.lock_thread = (void *)current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
}


/*
 * Debug checks on a usimple_lock just before
 * releasing it. Note that the caller has not
 * yet released the hardware lock.
 *
 * Preemption is still disabled, so there's
 * no problem using cpu_number.
 */
void
usld_unlock(
	usimple_lock_t l,
	pc_t pc)
{
	unsigned int mycpu;
	char caller[] = "usimple_unlock";


	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	mycpu = cpu_number();
	assert(mycpu <= UCHAR_MAX);

	if (!(l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p hasn't been taken",
		    caller, l);
	}
	if (l->debug.lock_thread != (void *) current_thread()) {
		panic("%s: unlocking lock 0x%p, owned by thread %p",
		    caller, l, l->debug.lock_thread);
	}
	if (l->debug.lock_cpu != mycpu) {
		printf("%s: unlocking lock 0x%p on cpu 0x%x",
		    caller, l, mycpu);
		printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
		panic("%s", caller);
	}

	l->debug.unlock_thread = l->debug.lock_thread;
	l->debug.lock_thread = INVALID_PC;
	l->debug.state &= ~USLOCK_TAKEN;
	l->debug.unlock_pc = pc;
	l->debug.unlock_cpu = (unsigned char)mycpu;
}


/*
 * Debug checks on a usimple_lock just before
 * attempting to acquire it.
 *
 * Preemption isn't guaranteed to be disabled.
 */
void
usld_lock_try_pre(
	usimple_lock_t l,
	__unused pc_t pc)
{
	char caller[] = "usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}
}


/*
 * Debug checks on a usimple_lock just after
 * successfully attempting to acquire it.
 *
 * Preemption has been disabled by the
 * lock acquisition attempt, so it's safe
 * to use cpu_number.
 */
void
usld_lock_try_post(
	usimple_lock_t l,
	pc_t pc)
{
	unsigned int mycpu;
	char caller[] = "successful usimple_lock_try";

	if (!usld_lock_common_checks(l, caller)) {
		return;
	}

	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED)) {
		panic("%s: lock 0x%p became uninitialized",
		    caller, l);
	}
	if ((l->debug.state & USLOCK_TAKEN)) {
		panic("%s: lock 0x%p became TAKEN by someone else",
		    caller, l);
	}

	mycpu = cpu_number();
	assert(mycpu <= UCHAR_MAX);

	l->debug.lock_thread = (void *) current_thread();
	l->debug.state |= USLOCK_TAKEN;
	l->debug.lock_pc = pc;
	l->debug.lock_cpu = (unsigned char)mycpu;
}
#endif /* USLOCK_DEBUG */

/*
 * Routine: lck_rw_alloc_init
 */
lck_rw_t *
lck_rw_alloc_init(
	lck_grp_t *grp,
	lck_attr_t *attr)
{
	lck_rw_t *lck;

	lck = zalloc_flags(ZV_LCK_RW, Z_WAITOK | Z_ZERO);
	lck_rw_init(lck, grp, attr);
	return lck;
}

/*
 * Routine: lck_rw_free
 */
void
lck_rw_free(
	lck_rw_t *lck,
	lck_grp_t *grp)
{
	lck_rw_destroy(lck, grp);
	zfree(ZV_LCK_RW, lck);
}

/*
 * Routine: lck_rw_init
 */
void
lck_rw_init(
	lck_rw_t *lck,
	lck_grp_t *grp,
	lck_attr_t *attr)
{
	lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
	    attr : &LockDefaultLckAttr;

	hw_lock_byte_init(&lck->lck_rw_interlock);
	lck->lck_rw_want_write = FALSE;
	lck->lck_rw_want_upgrade = FALSE;
	lck->lck_rw_shared_count = 0;
	lck->lck_rw_can_sleep = TRUE;
	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	lck->lck_rw_tag = 0;
	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
	    LCK_ATTR_RW_SHARED_PRIORITY) == 0);

	lck_grp_reference(grp);
	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
}

/*
 * Routine: lck_rw_destroy
 */
void
lck_rw_destroy(
	lck_rw_t *lck,
	lck_grp_t *grp)
{
	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED) {
		return;
	}
#if MACH_LDEBUG
	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
#endif
	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
	lck_grp_deallocate(grp);
	return;
}

/*
 * Sleep locks. These use the same data structure and algorithm
 * as the spin locks, but the process sleeps while it is waiting
 * for the lock. These work on uniprocessor systems.
 */

#define DECREMENTER_TIMEOUT 1000000

/*
 * We disable interrupts while holding the RW interlock to prevent an
 * interrupt from exacerbating hold time.
 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
 */
static inline boolean_t
lck_interlock_lock(lck_rw_t *lck)
{
	boolean_t istate;

	istate = ml_set_interrupts_enabled(FALSE);
	hw_lock_byte_lock(&lck->lck_rw_interlock);
	return istate;
}

static inline void
lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
{
	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	ml_set_interrupts_enabled(istate);
}

/*
 * This inline is used when busy-waiting for an rw lock.
 * If interrupts were disabled when the lock primitive was called,
 * we poll the IPI handler for pending tlb flushes.
 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
 */
static inline void
lck_rw_lock_pause(boolean_t interrupts_enabled)
{
	if (!interrupts_enabled) {
		handle_pending_TLB_flushes();
	}
	cpu_pause();
}

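/*
 * Returns TRUE while the lock still has readers, has a pending
 * shared-to-exclusive upgrade, or has its interlock held; exclusive
 * acquisition must wait until all three conditions clear.
 */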
static inline boolean_t
lck_rw_held_read_or_upgrade(lck_rw_t *lock)
{
	if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE)) {
		return TRUE;
	}
	return FALSE;
}

/*
 * compute the deadline to spin against when
 * waiting for a change of state on a lck_rw_t
 */
static inline uint64_t
lck_rw_deadline_for_spin(lck_rw_t *lck)
{
	if (lck->lck_rw_can_sleep) {
		if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
			/*
			 * there are already threads waiting on this lock... this
			 * implies that they have spun beyond their deadlines waiting for
			 * the desired state to show up so we will not bother spinning at this time...
			 * or
			 * the current number of threads sharing this lock exceeds our capacity to run them
			 * concurrently and since all states we're going to spin for require the rw_shared_count
			 * to be at 0, we'll not bother spinning since the latency for this to happen is
			 * unpredictable...
			 */
			return mach_absolute_time();
		}
		return mach_absolute_time() + MutexSpin;
	} else {
		return mach_absolute_time() + (100000LL * 1000000000LL);
	}
}


/*
 * Spin while interlock is held.
 */

static inline void
lck_rw_interlock_spin(lck_rw_t *lock)
{
	while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
		cpu_pause();
	}
}

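/*
 * Try to set LCK_RW_WANT_WRITE once the interlock is observed clear.
 * Returns TRUE if the want-write bit was atomically acquired, FALSE if
 * another writer already holds it or the compare-exchange lost a race.
 */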
static boolean_t
lck_rw_grab_want(lck_rw_t *lock)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
		if ((data & LCK_RW_INTERLOCK) == 0) {
			break;
		}
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	}
	if (data & LCK_RW_WANT_WRITE) {
		atomic_exchange_abort();
		return FALSE;
	}
	data |= LCK_RW_WANT_WRITE;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
}

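/*
 * Try to take a read reference by bumping the shared-reader count.
 * Fails if a writer or upgrader is waiting (unless readers are already
 * present and the lock is not writer-priority), or if the
 * compare-exchange loses a race.
 */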
static boolean_t
lck_rw_grab_shared(lck_rw_t *lock)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if ((data & LCK_RW_INTERLOCK) == 0) {
			break;
		}
		atomic_exchange_abort();
		lck_rw_interlock_spin(lock);
	}
	if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
		if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
			atomic_exchange_abort();
			return FALSE;
		}
	}
	data += LCK_RW_SHARED_READER;
	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
}

/*
 * Routine: lck_rw_lock_exclusive
 */
static void
lck_rw_lock_exclusive_gen(
	lck_rw_t *lck)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t deadline = 0;
	int slept = 0;
	int gotlock = 0;
	int lockheld = 0;
	wait_result_t res = 0;
	boolean_t istate = -1;

#if CONFIG_DTRACE
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
#endif

	/*
	 * Try to acquire the lck_rw_want_write bit.
	 */
	while (!lck_rw_grab_want(lck)) {
#if CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_want_write) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_want_write = TRUE;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}
	/*
	 * Wait for readers (and upgrades) to finish...
	 * the test for these conditions must be done simultaneously with
	 * a check of the interlock not being held since
	 * the rw_shared_count will drop to 0 first and then want_upgrade
	 * will be set to 1 in the shared_to_exclusive scenario... those
	 * adjustments are done behind the interlock and represent an
	 * atomic change in state and must be considered as such
	 * however, once we see the read count at 0, the want_upgrade not set
	 * and the interlock not held, we are safe to proceed
	 */
	while (lck_rw_held_read_or_upgrade(lck)) {
#if CONFIG_DTRACE
		/*
		 * Either sleeping or spinning is happening, start
		 * a timing of our delay interval now. If we set it
		 * to -1 we don't have accurate data so we cannot later
		 * decide to record a dtrace spin or sleep event.
		 */
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
			dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

		while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);

		if (!lockheld) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock exclusively
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);

				lck->lck_w_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
				res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
			} else {
				lck_interlock_unlock(lck, istate);
				/*
				 * must own the lock now, since we checked for
				 * readers or upgrade owner behind the interlock
				 * no need for a call to 'lck_rw_held_read_or_upgrade'
				 */
				break;
			}
		}
	}

#if CONFIG_DTRACE
	/*
	 * Decide what latencies we suffered that are Dtrace events.
	 * If we have set wait_interval, then we either spun or slept.
	 * At least we get out from under the interlock before we record
	 * which is the best we can do here to minimize the impact
	 * of the tracing.
	 * If we have set wait_interval to -1, then dtrace was not enabled when we
	 * started sleeping/spinning so we don't record this event.
	 */
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
			    mach_absolute_time() - wait_interval, 1);
		} else {
			/*
			 * For the blocking case, we also record if when we blocked
			 * it was held for read or write, and how many readers.
			 * Notice that above we recorded this before we dropped
			 * the interlock so the count is accurate.
			 */
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 1,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
#endif
}

/*
 * Routine: lck_rw_done
 */

lck_rw_type_t
lck_rw_done(lck_rw_t *lock)
{
	uint32_t data, prev;

	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
		if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
			atomic_exchange_abort();
			lck_rw_interlock_spin(lock);
			continue;
		}
		if (data & LCK_RW_SHARED_MASK) {
			data -= LCK_RW_SHARED_READER;
			if ((data & LCK_RW_SHARED_MASK) == 0) { /* if reader count has now gone to 0, check for waiters */
				goto check_waiters;
			}
		} else { /* if reader count == 0, must be exclusive lock */
			if (data & LCK_RW_WANT_UPGRADE) {
				data &= ~(LCK_RW_WANT_UPGRADE);
			} else {
				if (data & LCK_RW_WANT_WRITE) {
					data &= ~(LCK_RW_WANT_EXCL);
				} else { /* lock is not 'owned', panic */
					panic("Releasing non-exclusive RW lock without a reader refcount!");
				}
			}
check_waiters:
			if (prev & LCK_RW_W_WAITING) {
				data &= ~(LCK_RW_W_WAITING);
				if ((prev & LCK_RW_PRIV_EXCL) == 0) {
					data &= ~(LCK_RW_R_WAITING);
				}
			} else {
				data &= ~(LCK_RW_R_WAITING);
			}
		}
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
			break;
		}
		cpu_pause();
	}
	return lck_rw_done_gen(lock, prev);
}

/*
 * Routine: lck_rw_done_gen
 *
 * called from lck_rw_done()
 * prior_lock_state is the value in the 1st
 * word of the lock at the time of a successful
 * atomic compare and exchange with the new value...
 * it represents the state of the lock before we
 * decremented the rw_shared_count or cleared either
 * rw_want_upgrade or rw_want_write and
 * the lck_x_waiting bits... since the wrapper
 * routine has already changed the state atomically,
 * we just need to decide if we should
 * wake up anyone and what value to return... we do
 * this by examining the state of the lock before
 * we changed it
 */
static lck_rw_type_t
lck_rw_done_gen(
	lck_rw_t *lck,
	uint32_t prior_lock_state)
{
	lck_rw_t *fake_lck;
	lck_rw_type_t lock_type;
	thread_t thread;
	uint32_t rwlock_count;

	thread = current_thread();
	rwlock_count = thread->rwlock_count--;
	fake_lck = (lck_rw_t *)&prior_lock_state;

	if (lck->lck_rw_can_sleep) {
		/*
		 * prior_lock state is a snapshot of the 1st word of the
		 * lock in question... we'll fake up a pointer to it
		 * and carefully not access anything beyond whats defined
		 * in the first word of a lck_rw_t
		 */

		if (fake_lck->lck_rw_shared_count <= 1) {
			if (fake_lck->lck_w_waiting) {
				thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
			}

			if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
				thread_wakeup(RW_LOCK_READER_EVENT(lck));
			}
		}
#if MACH_LDEBUG
		if (rwlock_count == 0) {
			panic("rw lock count underflow for thread %p", thread);
		}
#endif
		/* Check if dropping the lock means that we need to unpromote */

		if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
			/* sched_flags checked without lock, but will be rechecked while clearing */
			lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
		}
	}
	if (fake_lck->lck_rw_shared_count) {
		lock_type = LCK_RW_TYPE_SHARED;
	} else {
		lock_type = LCK_RW_TYPE_EXCLUSIVE;
	}

#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
#endif

	return lock_type;
}


/*
 * Routine: lck_rw_unlock
 */
void
lck_rw_unlock(
	lck_rw_t *lck,
	lck_rw_type_t lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_unlock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_unlock_exclusive(lck);
	} else {
		panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
	}
}


/*
 * Routine: lck_rw_unlock_shared
 */
void
lck_rw_unlock_shared(
	lck_rw_t *lck)
{
	lck_rw_type_t ret;

	assertf(lck->lck_rw_shared_count > 0, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_SHARED) {
		panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
	}
}


/*
 * Routine: lck_rw_unlock_exclusive
 */
void
lck_rw_unlock_exclusive(
	lck_rw_t *lck)
{
	lck_rw_type_t ret;

	ret = lck_rw_done(lck);

	if (ret != LCK_RW_TYPE_EXCLUSIVE) {
		panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
	}
}


/*
 * Routine: lck_rw_lock
 */
void
lck_rw_lock(
	lck_rw_t *lck,
	lck_rw_type_t lck_rw_type)
{
	if (lck_rw_type == LCK_RW_TYPE_SHARED) {
		lck_rw_lock_shared(lck);
	} else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
		lck_rw_lock_exclusive(lck);
	} else {
		panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
	}
}

/*
 * Routine: lck_rw_lock_shared
 */
void
lck_rw_lock_shared(lck_rw_t *lock)
{
	uint32_t data, prev;

	current_thread()->rwlock_count++;
	for (;;) {
		data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
		if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
			atomic_exchange_abort();
			if (lock->lck_rw_can_sleep) {
				lck_rw_lock_shared_gen(lock);
			} else {
				cpu_pause();
				continue;
			}
			break;
		}
		data += LCK_RW_SHARED_READER;
		if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
			break;
		}
		cpu_pause();
	}
#if CONFIG_DTRACE
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
#endif /* CONFIG_DTRACE */
	return;
}

/*
 * Routine: lck_rw_lock_shared_gen
 * Function:
 *	assembly fast path code has determined that this lock
 *	is held exclusively... this is where we spin/block
 *	until we can acquire the lock in the shared mode
 */
static void
lck_rw_lock_shared_gen(
	lck_rw_t *lck)
{
	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
	uint64_t deadline = 0;
	int gotlock = 0;
	int slept = 0;
	wait_result_t res = 0;
	boolean_t istate = -1;

#if CONFIG_DTRACE
	uint64_t wait_interval = 0;
	int readers_at_sleep = 0;
	boolean_t dtrace_ls_initialized = FALSE;
	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
#endif

	while (!lck_rw_grab_shared(lck)) {
#if CONFIG_DTRACE
		if (dtrace_ls_initialized == FALSE) {
			dtrace_ls_initialized = TRUE;
			dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
			dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
			dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
			if (dtrace_ls_enabled) {
				/*
				 * Either sleeping or spinning is happening,
				 * start a timing of our delay interval now.
				 */
				readers_at_sleep = lck->lck_rw_shared_count;
				wait_interval = mach_absolute_time();
			}
		}
#endif
		if (istate == -1) {
			istate = ml_get_interrupts_enabled();
		}

		deadline = lck_rw_deadline_for_spin(lck);

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

		while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline) {
			lck_rw_lock_pause(istate);
		}

		KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
		    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);

		if (gotlock) {
			break;
		}
		/*
		 * if we get here, the deadline has expired w/o us
		 * being able to grab the lock for read
		 * check to see if we're allowed to do a thread_block
		 */
		if (lck->lck_rw_can_sleep) {
			istate = lck_interlock_lock(lck);

			if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
			    ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
				    trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);

				lck->lck_r_waiting = TRUE;

				thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
				res = assert_wait(RW_LOCK_READER_EVENT(lck),
				    THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
				lck_interlock_unlock(lck, istate);

				if (res == THREAD_WAITING) {
					res = thread_block(THREAD_CONTINUE_NULL);
					slept++;
				}
				KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
				    trace_lck, res, slept, 0, 0);
			} else {
				lck->lck_rw_shared_count++;
				lck_interlock_unlock(lck, istate);
				break;
			}
		}
	}

#if CONFIG_DTRACE
	if (dtrace_ls_enabled == TRUE) {
		if (slept == 0) {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
		} else {
			LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
			    mach_absolute_time() - wait_interval, 0,
			    (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
		}
	}
	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
#endif
}

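/*
 * Fast-path attempt to take the lock exclusively: atomically set
 * LCK_RW_WANT_EXCL, but only if no reader, writer, upgrade, or
 * interlock bits are currently set.
 */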
1642#define LCK_RW_LOCK_EXCLUSIVE_TAS(lck) (atomic_test_and_set32(&(lck)->data, \
1643 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK), \
1644 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE))
1645
1646/*
1647 * Routine: lck_rw_lock_exclusive_check_contended
1648 */
1649
1650bool
1651lck_rw_lock_exclusive_check_contended(lck_rw_t *lock)
1652{
1653 bool contended = false;
1654 current_thread()->rwlock_count++;
1655 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
1656#if CONFIG_DTRACE
1657 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1658#endif /* CONFIG_DTRACE */
1659 } else {
1660 contended = true;
1661 lck_rw_lock_exclusive_gen(lock);
1662 }
1663 return contended;
1664}
91447636 1665
5ba3f43e
A
1666/*
1667 * Routine: lck_rw_lock_exclusive
1668 */
1669
1670void
1671lck_rw_lock_exclusive(lck_rw_t *lock)
1672{
1673 current_thread()->rwlock_count++;
f427ee49 1674 if (LCK_RW_LOCK_EXCLUSIVE_TAS(lock)) {
ea3f0419 1675#if CONFIG_DTRACE
5ba3f43e 1676 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
ea3f0419
A
1677#endif /* CONFIG_DTRACE */
1678 } else {
5ba3f43e 1679 lck_rw_lock_exclusive_gen(lock);
ea3f0419 1680 }
5ba3f43e
A
1681}
1682
1683
1684/*
1685 * Routine: lck_rw_lock_shared_to_exclusive
cb323159
A
1686 *
1687 * False returned upon failure, in this case the shared lock is dropped.
5ba3f43e
A
1688 */
1689
1690boolean_t
1691lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1692{
ea3f0419 1693 uint32_t data, prev;
5ba3f43e 1694
ea3f0419 1695 for (;;) {
5ba3f43e
A
1696 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1697 if (data & LCK_RW_INTERLOCK) {
1698 atomic_exchange_abort();
1699 lck_rw_interlock_spin(lock);
1700 continue;
1701 }
1702 if (data & LCK_RW_WANT_UPGRADE) {
1703 data -= LCK_RW_SHARED_READER;
ea3f0419
A
1704 if ((data & LCK_RW_SHARED_MASK) == 0) { /* we were the last reader */
1705 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1706 }
1707 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1708 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
ea3f0419 1709 }
5ba3f43e 1710 } else {
ea3f0419
A
1711 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1712 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1713 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 1714 break;
ea3f0419 1715 }
5ba3f43e
A
1716 }
1717 cpu_pause();
1718 }
ea3f0419
A
1719 /* we now own the WANT_UPGRADE */
1720 if (data & LCK_RW_SHARED_MASK) { /* check to see if all of the readers are drained */
1721 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1722 }
1723#if CONFIG_DTRACE
5ba3f43e
A
1724 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1725#endif
1726 return TRUE;
1727}
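
/*
 * Illustrative caller pattern (sketch only): on upgrade failure the shared
 * hold has already been dropped, so the caller must reacquire and
 * revalidate any state it examined under the shared hold.
 */
#if 0 /* example only */
static void
example_upgrade_caller(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* shared hold was dropped; take it exclusive from scratch */
		lck_rw_lock_exclusive(lock);
		/* ... revalidate cached state here ... */
	}
	/* ... exclusive work ... */
	lck_rw_unlock_exclusive(lock);
}
#endif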
1728
1729
91447636 1730/*
b0d623f7 1731 * Routine: lck_rw_lock_shared_to_exclusive_failure
91447636 1732 * Function:
b0d623f7
A
1733 * assembly fast path code has already dropped our read
1734 * count and determined that someone else owns 'lck_rw_want_upgrade'
1735 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting';
1736 * all we need to do here is determine if a wakeup is needed
91447636 1737 */
5ba3f43e 1738static boolean_t
b0d623f7 1739lck_rw_lock_shared_to_exclusive_failure(
ea3f0419
A
1740 lck_rw_t *lck,
1741 uint32_t prior_lock_state)
91447636 1742{
ea3f0419
A
1743 lck_rw_t *fake_lck;
1744 thread_t thread = current_thread();
1745 uint32_t rwlock_count;
39236c6e
A
1746
1747 /* Check if dropping the lock means that we need to unpromote */
1748 rwlock_count = thread->rwlock_count--;
1749#if MACH_LDEBUG
1750 if (rwlock_count == 0) {
1751 panic("rw lock count underflow for thread %p", thread);
1752 }
1753#endif
b0d623f7 1754 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1755
b0d623f7 1756 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
91447636
A
1757 /*
1758 * Someone else has requested upgrade.
b0d623f7
A
1759 * Since we've released the read lock, wake
1760 * him up if he's blocked waiting
91447636 1761 */
b0d623f7
A
1762 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1763 }
5ba3f43e
A
1764
1765 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1766 /* sched_flags checked without lock, but will be rechecked while clearing */
d9a64523 1767 lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
5ba3f43e
A
1768 }
1769
b0d623f7 1770 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
ea3f0419 1771 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
91447636 1772
ea3f0419 1773 return FALSE;
b0d623f7 1774}
91447636 1775
91447636 1776
b0d623f7
A
1777/*
1778 * Routine: lck_rw_lock_shared_to_exclusive_success
1779 * Function:
1780 * assembly fast path code has already dropped our read
1781 * count and successfully acquired 'lck_rw_want_upgrade'
1782 * we just need to wait for the rest of the readers to drain
1783 * and then we can return as the exclusive holder of this lock
1784 */
5ba3f43e 1785static boolean_t
b0d623f7 1786lck_rw_lock_shared_to_exclusive_success(
ea3f0419 1787 lck_rw_t *lck)
b0d623f7 1788{
ea3f0419
A
1789 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1790 uint64_t deadline = 0;
1791 int slept = 0;
1792 int still_shared = 0;
1793 wait_result_t res;
1794 boolean_t istate = -1;
91447636 1795
ea3f0419 1796#if CONFIG_DTRACE
b0d623f7
A
1797 uint64_t wait_interval = 0;
1798 int readers_at_sleep = 0;
1799 boolean_t dtrace_ls_initialized = FALSE;
1800 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1801#endif
91447636 1802
2d21ac55 1803 while (lck->lck_rw_shared_count != 0) {
ea3f0419 1804#if CONFIG_DTRACE
b0d623f7
A
1805 if (dtrace_ls_initialized == FALSE) {
1806 dtrace_ls_initialized = TRUE;
1807 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1808 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1809 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1810 if (dtrace_ls_enabled) {
1811 /*
1812 * Either sleeping or spinning is happening,
1813 * start a timing of our delay interval now.
1814 */
1815 readers_at_sleep = lck->lck_rw_shared_count;
1816 wait_interval = mach_absolute_time();
1817 }
2d21ac55
A
1818 }
1819#endif
ea3f0419 1820 if (istate == -1) {
b0d623f7 1821 istate = ml_get_interrupts_enabled();
ea3f0419 1822 }
b0d623f7
A
1823
1824 deadline = lck_rw_deadline_for_spin(lck);
1825
1826 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
ea3f0419 1827 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1828
ea3f0419 1829 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline) {
b0d623f7 1830 lck_rw_lock_pause(istate);
ea3f0419 1831 }
b0d623f7
A
1832
1833 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
ea3f0419 1834 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7 1835
ea3f0419 1836 if (!still_shared) {
b0d623f7 1837 break;
ea3f0419 1838 }
b0d623f7
A
1839 /*
1840 * if we get here, the deadline has expired w/o
1841 * the rw_shared_count having drained to 0
1842 * check to see if we're allowed to do a thread_block
1843 */
1844 if (lck->lck_rw_can_sleep) {
91447636 1845 istate = lck_interlock_lock(lck);
0a7de745 1846
b0d623f7
A
1847 if (lck->lck_rw_shared_count != 0) {
1848 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
ea3f0419 1849 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1850
1851 lck->lck_w_waiting = TRUE;
91447636 1852
813fb2f6 1853 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
d9a64523 1854 res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
ea3f0419 1855 THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
91447636 1856 lck_interlock_unlock(lck, istate);
b0d623f7
A
1857
1858 if (res == THREAD_WAITING) {
1859 res = thread_block(THREAD_CONTINUE_NULL);
1860 slept++;
1861 }
1862 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
ea3f0419 1863 trace_lck, res, slept, 0, 0);
b0d623f7
A
1864 } else {
1865 lck_interlock_unlock(lck, istate);
1866 break;
91447636
A
1867 }
1868 }
91447636 1869 }
ea3f0419 1870#if CONFIG_DTRACE
2d21ac55
A
1871 /*
1872 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1873 */
b0d623f7 1874 if (dtrace_ls_enabled == TRUE) {
2d21ac55 1875 if (slept == 0) {
0a7de745 1876 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
2d21ac55 1877 } else {
0a7de745 1878 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
2d21ac55
A
1879 mach_absolute_time() - wait_interval, 1,
1880 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1881 }
1882 }
2d21ac55
A
1883 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1884#endif
ea3f0419 1885 return TRUE;
91447636
A
1886}
1887
5ba3f43e
A
1888/*
1889 * Routine: lck_rw_lock_exclusive_to_shared
1890 */
1891
ea3f0419
A
1892void
1893lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
5ba3f43e 1894{
ea3f0419 1895 uint32_t data, prev;
5ba3f43e 1896
ea3f0419 1897 for (;;) {
5ba3f43e
A
1898 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1899 if (data & LCK_RW_INTERLOCK) {
1900 atomic_exchange_abort();
ea3f0419 1901 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
5ba3f43e
A
1902 continue;
1903 }
1904 data += LCK_RW_SHARED_READER;
ea3f0419 1905 if (data & LCK_RW_WANT_UPGRADE) {
5ba3f43e 1906 data &= ~(LCK_RW_WANT_UPGRADE);
ea3f0419 1907 } else {
5ba3f43e 1908 data &= ~(LCK_RW_WANT_EXCL);
ea3f0419
A
1909 }
1910 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL))) {
5ba3f43e 1911 data &= ~(LCK_RW_W_WAITING);
ea3f0419
A
1912 }
1913 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp)) {
5ba3f43e 1914 break;
ea3f0419 1915 }
5ba3f43e
A
1916 cpu_pause();
1917 }
1918 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1919}
1920
b0d623f7 1921
91447636 1922/*
5ba3f43e 1923 * Routine: lck_rw_lock_exclusive_to_shared_gen
ea3f0419 1924 * Function:
b0d623f7
A
1925 * assembly fast path has already dropped
1926 * our exclusive state and bumped lck_rw_shared_count
1927 * all we need to do here is determine if anyone
1928 * needs to be awakened.
91447636 1929 */
5ba3f43e 1930static void
b0d623f7 1931lck_rw_lock_exclusive_to_shared_gen(
ea3f0419
A
1932 lck_rw_t *lck,
1933 uint32_t prior_lock_state)
91447636 1934{
ea3f0419
A
1935 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1936 lck_rw_t *fake_lck;
91447636 1937
b0d623f7 1938 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1939
b0d623f7 1940 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
ea3f0419 1941 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
91447636 1942
b0d623f7
A
1943 /*
1944 * don't wake up anyone waiting to take the lock exclusively
1945 * since we hold a read count... when the read count drops to 0,
1946 * the writers will be woken.
1947 *
1948 * wake up any waiting readers if we don't have any writers waiting,
1949 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1950 */
ea3f0419 1951 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting) {
2d21ac55 1952 thread_wakeup(RW_LOCK_READER_EVENT(lck));
ea3f0419 1953 }
91447636
A
1954
1955 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
ea3f0419 1956 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
91447636 1957
2d21ac55
A
1958#if CONFIG_DTRACE
1959 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1960#endif
91447636
A
1961}
1962
1963
1964/*
1965 * Routine: lck_rw_try_lock
1966 */
1967boolean_t
1968lck_rw_try_lock(
ea3f0419
A
1969 lck_rw_t *lck,
1970 lck_rw_type_t lck_rw_type)
1971{
1972 if (lck_rw_type == LCK_RW_TYPE_SHARED) {
1973 return lck_rw_try_lock_shared(lck);
1974 } else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE) {
1975 return lck_rw_try_lock_exclusive(lck);
1976 } else {
91447636 1977 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
ea3f0419
A
1978 }
1979 return FALSE;
91447636
A
1980}
1981
5ba3f43e
A
1982/*
1983 * Routine: lck_rw_try_lock_shared
1984 */
1985
ea3f0419
A
1986boolean_t
1987lck_rw_try_lock_shared(lck_rw_t *lock)
5ba3f43e 1988{
ea3f0419 1989 uint32_t data, prev;
5ba3f43e 1990
ea3f0419 1991 for (;;) {
5ba3f43e
A
1992 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1993 if (data & LCK_RW_INTERLOCK) {
1994 atomic_exchange_abort();
1995 lck_rw_interlock_spin(lock);
1996 continue;
1997 }
1998 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1999 atomic_exchange_abort();
ea3f0419 2000 return FALSE; /* lock is busy */
5ba3f43e 2001 }
ea3f0419
A
2002 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
2003 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 2004 break;
ea3f0419 2005 }
5ba3f43e
A
2006 cpu_pause();
2007 }
2008 current_thread()->rwlock_count++;
2009 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
ea3f0419 2010#if CONFIG_DTRACE
5ba3f43e 2011 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
ea3f0419 2012#endif /* CONFIG_DTRACE */
5ba3f43e
A
2013 return TRUE;
2014}
2015
2016
2017/*
2018 * Routine: lck_rw_try_lock_exclusive
2019 */
2020
ea3f0419
A
2021boolean_t
2022lck_rw_try_lock_exclusive(lck_rw_t *lock)
5ba3f43e 2023{
ea3f0419 2024 uint32_t data, prev;
5ba3f43e 2025
ea3f0419 2026 for (;;) {
5ba3f43e
A
2027 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
2028 if (data & LCK_RW_INTERLOCK) {
2029 atomic_exchange_abort();
2030 lck_rw_interlock_spin(lock);
2031 continue;
2032 }
2033 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
2034 atomic_exchange_abort();
ea3f0419 2035 return FALSE; /* can't get it */
5ba3f43e
A
2036 }
2037 data |= LCK_RW_WANT_EXCL;
ea3f0419 2038 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp)) {
5ba3f43e 2039 break;
ea3f0419 2040 }
5ba3f43e
A
2041 cpu_pause();
2042 }
2043
2044 current_thread()->rwlock_count++;
ea3f0419 2045#if CONFIG_DTRACE
5ba3f43e 2046 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
ea3f0419 2047#endif /* CONFIG_DTRACE */
5ba3f43e
A
2048 return TRUE;
2049}
2050
91447636 2051
2d21ac55
A
2052void
2053lck_rw_assert(
ea3f0419
A
2054 lck_rw_t *lck,
2055 unsigned int type)
2d21ac55
A
2056{
2057 switch (type) {
2058 case LCK_RW_ASSERT_SHARED:
2059 if (lck->lck_rw_shared_count != 0) {
2060 return;
2061 }
2062 break;
2063 case LCK_RW_ASSERT_EXCLUSIVE:
2064 if ((lck->lck_rw_want_write ||
ea3f0419 2065 lck->lck_rw_want_upgrade) &&
2d21ac55
A
2066 lck->lck_rw_shared_count == 0) {
2067 return;
2068 }
2069 break;
2070 case LCK_RW_ASSERT_HELD:
2071 if (lck->lck_rw_want_write ||
2072 lck->lck_rw_want_upgrade ||
2073 lck->lck_rw_shared_count != 0) {
2074 return;
2075 }
2076 break;
39236c6e
A
2077 case LCK_RW_ASSERT_NOTHELD:
2078 if (!(lck->lck_rw_want_write ||
ea3f0419
A
2079 lck->lck_rw_want_upgrade ||
2080 lck->lck_rw_shared_count != 0)) {
39236c6e
A
2081 return;
2082 }
2083 break;
2d21ac55
A
2084 default:
2085 break;
2086 }
2087
39236c6e
A
2088 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
2089}
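
/*
 * Illustrative use of lck_rw_assert (sketch only): callers that require a
 * particular hold mode can document and enforce it up front.
 */
#if 0 /* example only */
static void
example_requires_exclusive(lck_rw_t *lock)
{
	lck_rw_assert(lock, LCK_RW_ASSERT_EXCLUSIVE);
	/* ... mutate state that is protected by 'lock' ... */
}
#endif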
2090
2091/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
cb323159
A
2092#if MACH_LDEBUG
2093__dead2
2094#endif
39236c6e
A
2095void
2096lck_rw_clear_promotions_x86(thread_t thread)
2097{
2098#if MACH_LDEBUG
2099 /* It's fatal to leave a RW lock locked and return to userspace */
2100 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
2101#else
2102 /* Paper over the issue */
2103 thread->rwlock_count = 0;
d9a64523 2104 lck_rw_clear_promotion(thread, 0);
39236c6e 2105#endif
2d21ac55
A
2106}
2107
5ba3f43e
A
2108boolean_t
2109lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
2110{
2111 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
2112
2113 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
2114 lck_rw_unlock_shared(lck);
2115 mutex_pause(2);
2116 lck_rw_lock_shared(lck);
2117 return TRUE;
2118 }
2119
2120 return FALSE;
2121}
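
/*
 * Illustrative use of lck_rw_lock_yield_shared (sketch only): a long scan
 * under a shared hold can periodically yield to waiting writers. A TRUE
 * return means the lock was dropped and reacquired, so any state cached
 * under the previous hold must be revalidated.
 */
#if 0 /* example only */
static void
example_long_scan(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	for (int i = 0; i < 1024; i++) {
		/* ... examine one element ... */
		if (lck_rw_lock_yield_shared(lock, FALSE)) {
			/* lock was briefly dropped; cached pointers may be stale */
		}
	}
	lck_rw_unlock_shared(lock);
}
#endif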
39236c6e 2122
3e170ce0
A
2123/*
2124 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
2125 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2126 */
2127boolean_t
ea3f0419
A
2128kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck)
2129{
3e170ce0
A
2130 if (not_in_kdp) {
2131 panic("panic: rw lock exclusive check done outside of kernel debugger");
2132 }
2133 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
2134}
2135
d9a64523
A
2136/*
2137 * Slow path routines for lck_mtx locking and unlocking functions.
2138 *
2139 * These functions were previously implemented in x86 assembly,
2140 * and some optimizations are in place in this C code to obtain compiled code
2141 * as performant and compact as the assembly version.
2142 *
2143 * To avoid inlining these functions on the fast path, all functions directly called by
2144 * the fast paths have __attribute__((noinline)) specified. They are also all implemented
2145 * in such a way that the fast path can tail call into them. In this way the return address
2146 * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
2147 *
2148 * Slow path code is structured so that there are no calls to functions that will return
2149 * in the context of the caller function, i.e. all functions called are either tail call functions
2150 * or inline functions. The tail call functions take fewer than six arguments,
2151 * so that they can be passed in registers and do not need to be pushed on the stack.
2152 * This allows the compiler to not create a stack frame for these functions.
2153 *
2154 * __improbable and __probable are used to compile the slow path code in such a way
2155 * that the fast path case is a sequence of instructions with as few jumps as possible,
2156 * making that case the most optimized even when falling through to the slow path.
2157 */
2158
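/*
 * Minimal sketch of the structure described above (illustrative only;
 * 'example_*' and 'try_fast_acquire' are hypothetical names, not part of
 * this file): the inlined fast path tail calls a noinline slow path, so
 * the fast path needs no stack frame of its own.
 */
#if 0 /* example only */
__attribute__((noinline)) static void example_lock_slow(lck_mtx_t *lock);

static inline void
example_lock(lck_mtx_t *lock)
{
	if (__probable(try_fast_acquire(lock))) {	/* hypothetical fast-path helper */
		return;
	}
	/* tail call: fewer than six args, passed in registers, no frame needed */
	return example_lock_slow(lock);
}
#endif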
2159/*
2160 * Intel lock invariants:
2161 *
2162 * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
d9a64523
A
2163 *
2164 * The lock owner is promoted to the max priority of all its waiters only if it
2165 * was at a lower priority when it acquired, or was already the owner when a waiter waited.
2166 * Max priority is capped at MAXPRI_PROMOTE.
2167 *
2168 * The last waiter will not be promoted as it is woken up, but the last
2169 * lock owner may not have been the last thread to have been woken up depending on the
2170 * luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2171 * flag set.
2172 *
2173 * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2174 * priority from dropping priority in the future without having to take thread lock
2175 * on acquire.
2176 */
3e170ce0 2177
91447636
A
2178/*
2179 * Routine: lck_mtx_alloc_init
2180 */
2181lck_mtx_t *
2182lck_mtx_alloc_init(
ea3f0419
A
2183 lck_grp_t *grp,
2184 lck_attr_t *attr)
91447636 2185{
f427ee49
A
2186 lck_mtx_t *lck;
2187
2188 lck = zalloc(ZV_LCK_MTX);
2189 lck_mtx_init(lck, grp, attr);
ea3f0419 2190 return lck;
91447636
A
2191}
2192
2193/*
2194 * Routine: lck_mtx_free
2195 */
2196void
2197lck_mtx_free(
ea3f0419
A
2198 lck_mtx_t *lck,
2199 lck_grp_t *grp)
91447636
A
2200{
2201 lck_mtx_destroy(lck, grp);
f427ee49 2202 zfree(ZV_LCK_MTX, lck);
91447636
A
2203}
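
/*
 * Illustrative lifecycle of a dynamically allocated mutex (sketch only):
 * allocate from a group, use it, then free it back.
 */
#if 0 /* example only */
static void
example_mtx_lifecycle(lck_grp_t *grp)
{
	lck_mtx_t *m = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);

	lck_mtx_lock(m);
	/* ... critical section ... */
	lck_mtx_unlock(m);

	lck_mtx_free(m, grp);
}
#endif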
2204
2205/*
2206 * Routine: lck_mtx_ext_init
2207 */
2208static void
2209lck_mtx_ext_init(
ea3f0419
A
2210 lck_mtx_ext_t *lck,
2211 lck_grp_t *grp,
2212 lck_attr_t *attr)
91447636 2213{
2d21ac55 2214 bzero((void *)lck, sizeof(lck_mtx_ext_t));
91447636
A
2215
2216 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636
A
2217 lck->lck_mtx_deb.type = MUTEX_TAG;
2218 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2219 }
2220
2221 lck->lck_mtx_grp = grp;
2d21ac55 2222
ea3f0419 2223 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT) {
6d2010ae 2224 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
ea3f0419 2225 }
b0d623f7 2226
6d2010ae 2227 lck->lck_mtx.lck_mtx_is_ext = 1;
39037602 2228 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2229}
2230
2231/*
2232 * Routine: lck_mtx_init
2233 */
2234void
2235lck_mtx_init(
ea3f0419
A
2236 lck_mtx_t *lck,
2237 lck_grp_t *grp,
2238 lck_attr_t *attr)
91447636 2239{
ea3f0419
A
2240 lck_mtx_ext_t *lck_ext;
2241 lck_attr_t *lck_attr;
2d21ac55 2242
ea3f0419 2243 if (attr != LCK_ATTR_NULL) {
2d21ac55 2244 lck_attr = attr;
ea3f0419 2245 } else {
2d21ac55 2246 lck_attr = &LockDefaultLckAttr;
ea3f0419 2247 }
91447636 2248
2d21ac55 2249 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
f427ee49
A
2250 lck_ext = zalloc(ZV_LCK_MTX_EXT);
2251 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2252 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2253 lck->lck_mtx_ptr = lck_ext;
91447636 2254 } else {
b0d623f7 2255 lck->lck_mtx_owner = 0;
6d2010ae 2256 lck->lck_mtx_state = 0;
91447636 2257 }
39037602 2258 lck->lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2259 lck_grp_reference(grp);
2260 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2261}
2262
2d21ac55
A
2263/*
2264 * Routine: lck_mtx_init_ext
2265 */
2266void
2267lck_mtx_init_ext(
ea3f0419
A
2268 lck_mtx_t *lck,
2269 lck_mtx_ext_t *lck_ext,
2270 lck_grp_t *grp,
2271 lck_attr_t *attr)
2d21ac55 2272{
ea3f0419 2273 lck_attr_t *lck_attr;
2d21ac55 2274
ea3f0419 2275 if (attr != LCK_ATTR_NULL) {
2d21ac55 2276 lck_attr = attr;
ea3f0419 2277 } else {
2d21ac55 2278 lck_attr = &LockDefaultLckAttr;
ea3f0419 2279 }
2d21ac55
A
2280
2281 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2282 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2283 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2284 lck->lck_mtx_ptr = lck_ext;
2285 } else {
b0d623f7 2286 lck->lck_mtx_owner = 0;
6d2010ae 2287 lck->lck_mtx_state = 0;
2d21ac55 2288 }
39037602 2289 lck->lck_mtx_pad32 = 0xFFFFFFFF;
6d2010ae 2290
2d21ac55
A
2291 lck_grp_reference(grp);
2292 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2293}
2294
d9a64523
A
2295static void
2296lck_mtx_lock_mark_destroyed(
2297 lck_mtx_t *mutex,
2298 boolean_t indirect)
2299{
2300 uint32_t state;
2301
2302 if (indirect) {
2303 /* convert to destroyed state */
2304 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2305 return;
2306 }
2307
2308 state = ordered_load_mtx_state(mutex);
2309 lck_mtx_interlock_lock(mutex, &state);
2310
2311 ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2312
2313 enable_preemption();
2314}
2315
91447636
A
2316/*
2317 * Routine: lck_mtx_destroy
2318 */
2319void
2320lck_mtx_destroy(
ea3f0419
A
2321 lck_mtx_t *lck,
2322 lck_grp_t *grp)
91447636 2323{
d9a64523 2324 boolean_t indirect;
0a7de745 2325
ea3f0419 2326 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED) {
91447636 2327 return;
ea3f0419 2328 }
39236c6e
A
2329#if MACH_LDEBUG
2330 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2331#endif
d9a64523 2332 indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
b0d623f7 2333
d9a64523 2334 lck_mtx_lock_mark_destroyed(lck, indirect);
b0d623f7 2335
ea3f0419 2336 if (indirect) {
f427ee49 2337 zfree(ZV_LCK_MTX_EXT, lck->lck_mtx_ptr);
ea3f0419 2338 }
91447636
A
2339 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2340 lck_grp_deallocate(grp);
2341 return;
2342}
2343
b0d623f7 2344
d9a64523
A
2345#if DEVELOPMENT | DEBUG
2346__attribute__((noinline))
2347void
2348lck_mtx_owner_check_panic(
2349 lck_mtx_t *lock)
2350{
2351 thread_t owner = (thread_t)lock->lck_mtx_owner;
2352 panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
2353}
2354#endif
2355
2356__attribute__((always_inline))
2357static boolean_t
2358get_indirect_mutex(
2359 lck_mtx_t **lock,
ea3f0419 2360 uint32_t *state)
d9a64523
A
2361{
2362 *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
2363 *state = ordered_load_mtx_state(*lock);
2364 return TRUE;
2365}
2366
2367/*
ea3f0419 2368 * Routine: lck_mtx_unlock_slow
d9a64523
A
2369 *
2370 * Unlocks a mutex held by current thread.
2371 *
cb323159 2372 * It will wake up waiters if necessary.
d9a64523
A
2373 *
2374 * Interlock can be held.
2375 */
2376__attribute__((noinline))
2377void
2378lck_mtx_unlock_slow(
ea3f0419 2379 lck_mtx_t *lock)
d9a64523 2380{
ea3f0419
A
2381 thread_t thread;
2382 uint32_t state, prev;
2383 boolean_t indirect = FALSE;
d9a64523
A
2384
2385 state = ordered_load_mtx_state(lock);
2386
2387 /* Is this an indirect mutex? */
2388 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2389 indirect = get_indirect_mutex(&lock, &state);
2390 }
2391
2392 thread = current_thread();
2393
2394#if DEVELOPMENT | DEBUG
2395 thread_t owner = (thread_t)lock->lck_mtx_owner;
ea3f0419 2396 if (__improbable(owner != thread)) {
cb323159 2397 lck_mtx_owner_check_panic(lock);
ea3f0419 2398 }
d9a64523
A
2399#endif
2400
2401 /* check if it is held as a spinlock */
ea3f0419 2402 if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0)) {
d9a64523 2403 goto unlock;
ea3f0419 2404 }
d9a64523
A
2405
2406 lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2407
2408unlock:
2409 /* preemption disabled, interlock held and mutex not held */
2410
2411 /* clear owner */
2412 ordered_store_mtx_owner(lock, 0);
2413 /* keep original state in prev for later evaluation */
2414 prev = state;
d9a64523 2415
cb323159 2416 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
0a7de745 2417#if MACH_LDEBUG
ea3f0419 2418 if (thread) {
cb323159 2419 thread->mutex_count--;
ea3f0419 2420 }
cb323159
A
2421#endif
2422 return lck_mtx_unlock_wakeup_tail(lock, state, indirect);
0a7de745 2423 }
d9a64523 2424
cb323159
A
2425 /* release interlock, promotion and clear spin flag */
2426 state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK));
ea3f0419 2427 ordered_store_mtx_state_release(lock, state); /* since I own the interlock, I don't need an atomic update */
cb323159 2428
ea3f0419 2429#if MACH_LDEBUG
cb323159 2430 /* perform lock statistics after drop to prevent delay */
ea3f0419
A
2431 if (thread) {
2432 thread->mutex_count--; /* lock statistic */
2433 }
2434#endif /* MACH_LDEBUG */
d9a64523
A
2435
2436 /* re-enable preemption */
2437 lck_mtx_unlock_finish_inline(lock, FALSE);
2438
2439 return;
2440}
2441
ea3f0419
A
2442#define LCK_MTX_LCK_WAIT_CODE 0x20
2443#define LCK_MTX_LCK_WAKEUP_CODE 0x21
2444#define LCK_MTX_LCK_SPIN_CODE 0x22
2445#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2446#define LCK_MTX_LCK_DEMOTE_CODE 0x24
b0d623f7 2447
b0d623f7 2448/*
d9a64523 2449 * Routine: lck_mtx_unlock_wakeup_tail
b0d623f7 2450 *
d9a64523
A
2451 * Invoked on unlock when there is
2452 * contention, i.e. the assembly routine sees
cb323159 2453 * that mutex->lck_mtx_waiters != 0
b0d623f7 2454 *
6d2010ae 2455 * neither the mutex nor the interlock is held
d9a64523
A
2456 *
2457 * Note that this routine might not be called if there are pending
2458 * waiters which have previously been woken up, and they didn't
2459 * end up boosting the old owner.
2460 *
2461 * assembly routine previously did the following to mutex:
2462 * (after saving the state in prior_lock_state)
d9a64523
A
2463 * decremented lck_mtx_waiters if nonzero
2464 *
2465 * This function needs to be called as a tail call
2466 * to optimize the compiled code.
b0d623f7 2467 */
d9a64523
A
2468__attribute__((noinline))
2469static void
ea3f0419
A
2470lck_mtx_unlock_wakeup_tail(
2471 lck_mtx_t *mutex,
cb323159 2472 uint32_t state,
ea3f0419 2473 boolean_t indirect)
b0d623f7 2474{
cb323159 2475 struct turnstile *ts;
6d2010ae 2476
ea3f0419 2477 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
cb323159 2478 kern_return_t did_wake;
6d2010ae
A
2479
2480 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
ea3f0419 2481 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2482
cb323159 2483 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
d9a64523 2484
cb323159
A
2485 if (mutex->lck_mtx_waiters > 1) {
2486 /* WAITQ_PROMOTE_ON_WAKE will call turnstile_update_inheritor on the wokenup thread */
2487 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_PROMOTE_ON_WAKE);
2488 } else {
2489 did_wake = waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2490 turnstile_update_inheritor(ts, NULL, TURNSTILE_IMMEDIATE_UPDATE);
6d2010ae 2491 }
cb323159 2492 assert(did_wake == KERN_SUCCESS);
b0d623f7 2493
cb323159
A
2494 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2495 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
b0d623f7 2496
cb323159 2497 state -= LCK_MTX_WAITER;
ea3f0419 2498 state &= (~(LCK_MTX_SPIN_MSK | LCK_MTX_ILOCKED_MSK));
cb323159 2499 ordered_store_mtx_state_release(mutex, state);
b0d623f7 2500
cb323159 2501 assert(current_thread()->turnstile != NULL);
b0d623f7 2502
cb323159 2503 turnstile_cleanup();
d9a64523 2504
6d2010ae 2505 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
ea3f0419 2506 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2507
d9a64523
A
2508 lck_mtx_unlock_finish_inline(mutex, indirect);
2509}
b0d623f7
A
2510
2511/*
ea3f0419 2512 * Routine: lck_mtx_lock_acquire_x86
b0d623f7
A
2513 *
2514 * Invoked on acquiring the mutex when there is
6d2010ae 2515 * contention (i.e. the assembly routine sees that
cb323159 2516 * mutex->lck_mtx_waiters != 0)
6d2010ae
A
2517 *
2518 * mutex is owned... interlock is held... preemption is disabled
b0d623f7 2519 */
d9a64523
A
2520__attribute__((always_inline))
2521static void
2522lck_mtx_lock_acquire_inline(
ea3f0419 2523 lck_mtx_t *mutex,
cb323159 2524 struct turnstile *ts)
b0d623f7 2525{
ea3f0419 2526 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
b0d623f7 2527
6d2010ae 2528 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
ea3f0419 2529 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7 2530
d9a64523 2531 thread_t thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
cb323159 2532 assert(thread->waiting_for_mutex == NULL);
b0d623f7 2533
cb323159
A
2534 if (mutex->lck_mtx_waiters > 0) {
2535 if (ts == NULL) {
2536 ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
0a7de745 2537 }
d9a64523 2538
cb323159
A
2539 turnstile_update_inheritor(ts, thread, (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
2540 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
2541 }
d9a64523 2542
cb323159
A
2543 if (ts != NULL) {
2544 turnstile_complete((uintptr_t)mutex, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2545 }
d9a64523 2546
cb323159 2547 assert(current_thread()->turnstile != NULL);
d9a64523 2548
6d2010ae 2549 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
ea3f0419 2550 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7
A
2551}
2552
d9a64523
A
2553void
2554lck_mtx_lock_acquire_x86(
ea3f0419 2555 lck_mtx_t *mutex)
d9a64523 2556{
cb323159 2557 return lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2558}
2559
2560/*
2561 * Tail call helpers for lock functions that perform
2562 * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2563 * the caller's compiled code.
2564 */
b0d623f7 2565
d9a64523
A
2566__attribute__((noinline))
2567static void
2568lck_mtx_lock_acquire_tail(
ea3f0419
A
2569 lck_mtx_t *mutex,
2570 boolean_t indirect,
cb323159 2571 struct turnstile *ts)
d9a64523 2572{
cb323159
A
2573 lck_mtx_lock_acquire_inline(mutex, ts);
2574 lck_mtx_lock_finish_inline_with_cleanup(mutex, ordered_load_mtx_state(mutex), indirect);
d9a64523
A
2575}
2576
2577__attribute__((noinline))
2578static boolean_t
2579lck_mtx_try_lock_acquire_tail(
ea3f0419 2580 lck_mtx_t *mutex)
d9a64523 2581{
cb323159 2582 lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2583 lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2584
2585 return TRUE;
2586}
2587
2588__attribute__((noinline))
2589static void
2590lck_mtx_convert_spin_acquire_tail(
ea3f0419 2591 lck_mtx_t *mutex)
d9a64523 2592{
cb323159 2593 lck_mtx_lock_acquire_inline(mutex, NULL);
d9a64523
A
2594 lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2595}
2596
2597boolean_t
2598lck_mtx_ilk_unlock(
2599 lck_mtx_t *mutex)
2600{
2601 lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2602 return TRUE;
2603}
2604
2605static inline void
2606lck_mtx_interlock_lock_set_and_clear_flags(
2607 lck_mtx_t *mutex,
2608 uint32_t xor_flags,
2609 uint32_t and_flags,
2610 uint32_t *new_state)
3e170ce0 2611{
d9a64523
A
2612 uint32_t state, prev;
2613 state = *new_state;
2614
ea3f0419 2615 for (;;) {
d9a64523
A
2616 /* have to wait for interlock to clear */
2617 while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
2618 cpu_pause();
2619 state = ordered_load_mtx_state(mutex);
2620 }
2621 prev = state; /* prev contains snapshot for exchange */
2622 state |= LCK_MTX_ILOCKED_MSK | xor_flags; /* pick up interlock */
ea3f0419 2623 state &= ~and_flags; /* clear flags */
d9a64523
A
2624
2625 disable_preemption();
ea3f0419 2626 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
d9a64523 2627 break;
ea3f0419 2628 }
d9a64523
A
2629 enable_preemption();
2630 cpu_pause();
2631 state = ordered_load_mtx_state(mutex);
2632 }
2633 *new_state = state;
2634 return;
2635}
2636
2637static inline void
2638lck_mtx_interlock_lock_clear_flags(
2639 lck_mtx_t *mutex,
2640 uint32_t and_flags,
2641 uint32_t *new_state)
2642{
2643 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
2644}
2645
2646static inline void
2647lck_mtx_interlock_lock(
2648 lck_mtx_t *mutex,
2649 uint32_t *new_state)
2650{
2651 return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
2652}
2653
2654static inline int
2655lck_mtx_interlock_try_lock_set_flags(
2656 lck_mtx_t *mutex,
2657 uint32_t or_flags,
2658 uint32_t *new_state)
2659{
2660 uint32_t state, prev;
2661 state = *new_state;
2662
2663 /* have to wait for interlock to clear */
2664 if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
2665 return 0;
2666 }
ea3f0419
A
2667 prev = state; /* prev contains snapshot for exchange */
2668 state |= LCK_MTX_ILOCKED_MSK | or_flags; /* pick up interlock */
d9a64523 2669 disable_preemption();
cb323159 2670 if (os_atomic_cmpxchg(&mutex->lck_mtx_state, prev, state, acquire)) {
ea3f0419
A
2671 *new_state = state;
2672 return 1;
d9a64523
A
2673 }
2674
2675 enable_preemption();
2676 return 0;
2677}
2678
d9a64523
A
2679__attribute__((noinline))
2680static void
2681lck_mtx_lock_contended(
2682 lck_mtx_t *lock,
2683 boolean_t indirect,
2684 boolean_t *first_miss)
2685{
2686 lck_mtx_spinwait_ret_type_t ret;
2687 uint32_t state;
2688 thread_t thread;
cb323159 2689 struct turnstile *ts = NULL;
d9a64523
A
2690
2691try_again:
2692
2693 if (indirect) {
0a7de745 2694 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523
A
2695 }
2696
2697 ret = lck_mtx_lock_spinwait_x86(lock);
2698 state = ordered_load_mtx_state(lock);
2699 switch (ret) {
2700 case LCK_MTX_SPINWAIT_NO_SPIN:
2701 /*
2702 * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2703 * try to spin.
2704 */
2705 if (indirect) {
0a7de745 2706 lck_grp_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
d9a64523
A
2707 }
2708
f427ee49
A
2709 /* just fall through case LCK_MTX_SPINWAIT_SPUN */
2710 OS_FALLTHROUGH;
ea3f0419
A
2711 case LCK_MTX_SPINWAIT_SPUN_HIGH_THR:
2712 case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
2713 case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
2714 case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
d9a64523
A
2715 /*
2716 * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2717 * interlock not held
2718 */
2719 lck_mtx_interlock_lock(lock, &state);
2720 assert(state & LCK_MTX_ILOCKED_MSK);
2721
2722 if (state & LCK_MTX_MLOCKED_MSK) {
2723 if (indirect) {
0a7de745 2724 lck_grp_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
d9a64523 2725 }
cb323159 2726 lck_mtx_lock_wait_x86(lock, &ts);
d9a64523
A
2727 /*
2728 * interlock is not held here.
2729 */
2730 goto try_again;
2731 } else {
d9a64523
A
2732 /* grab the mutex */
2733 state |= LCK_MTX_MLOCKED_MSK;
2734 ordered_store_mtx_state_release(lock, state);
2735 thread = current_thread();
2736 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2737#if MACH_LDEBUG
2738 if (thread) {
2739 thread->mutex_count++;
2740 }
2741#endif /* MACH_LDEBUG */
2742 }
2743
2744 break;
2745 case LCK_MTX_SPINWAIT_ACQUIRED:
2746 /*
2747 * mutex has been acquired by lck_mtx_lock_spinwait_x86
2748 * interlock is held and preemption disabled
2749 * owner is set and mutex marked as locked
2750 * statistics updated too
2751 */
2752 break;
2753 default:
2754 panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2755 }
2756
2757 /*
2758 * interlock is already acquired here
2759 */
2760
2761 /* mutex has been acquired */
2762 thread = (thread_t)lock->lck_mtx_owner;
cb323159
A
2763 if (state & LCK_MTX_WAITERS_MSK) {
2764 /*
2765 * lck_mtx_lock_acquire_tail will call
2766 * turnstile_complete.
2767 */
2768 return lck_mtx_lock_acquire_tail(lock, indirect, ts);
d9a64523
A
2769 }
2770
cb323159
A
2771 if (ts != NULL) {
2772 turnstile_complete((uintptr_t)lock, NULL, NULL, TURNSTILE_KERNEL_MUTEX);
2773 }
2774
2775 assert(current_thread()->turnstile != NULL);
2776
d9a64523 2777 /* release the interlock */
cb323159 2778 lck_mtx_lock_finish_inline_with_cleanup(lock, ordered_load_mtx_state(lock), indirect);
d9a64523
A
2779}
2780
2781/*
2782 * Helper noinline functions for calling
2783 * panic to optimize compiled code.
2784 */
2785
cb323159 2786__attribute__((noinline)) __abortlike
d9a64523
A
2787static void
2788lck_mtx_destroyed(
2789 lck_mtx_t *lock)
2790{
2791 panic("trying to interlock destroyed mutex (%p)", lock);
2792}
2793
2794__attribute__((noinline))
2795static boolean_t
2796lck_mtx_try_destroyed(
2797 lck_mtx_t *lock)
2798{
2799 panic("trying to interlock destroyed mutex (%p)", lock);
2800 return FALSE;
2801}
2802
2803__attribute__((always_inline))
2804static boolean_t
2805lck_mtx_lock_wait_interlock_to_clear(
2806 lck_mtx_t *lock,
2807 uint32_t* new_state)
2808{
2809 uint32_t state;
2810
ea3f0419 2811 for (;;) {
d9a64523
A
2812 cpu_pause();
2813 state = ordered_load_mtx_state(lock);
2814 if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
2815 *new_state = state;
2816 return TRUE;
2817 }
2818 if (state & LCK_MTX_MLOCKED_MSK) {
2819 /* if it is held as mutex, just fail */
2820 return FALSE;
2821 }
2822 }
2823}
2824
2825__attribute__((always_inline))
2826static boolean_t
2827lck_mtx_try_lock_wait_interlock_to_clear(
2828 lck_mtx_t *lock,
2829 uint32_t* new_state)
2830{
2831 uint32_t state;
2832
ea3f0419 2833 for (;;) {
d9a64523
A
2834 cpu_pause();
2835 state = ordered_load_mtx_state(lock);
2836 if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
2837 /* if it is held as mutex or spin, just fail */
2838 return FALSE;
2839 }
2840 if (!(state & LCK_MTX_ILOCKED_MSK)) {
2841 *new_state = state;
2842 return TRUE;
2843 }
2844 }
2845}
2846
2847/*
2848 * Routine: lck_mtx_lock_slow
2849 *
2850 * Locks a mutex for current thread.
2851 * If the lock is contended this function might
2852 * sleep.
2853 *
2854 * Called with interlock not held.
2855 */
2856__attribute__((noinline))
2857void
2858lck_mtx_lock_slow(
2859 lck_mtx_t *lock)
2860{
ea3f0419
A
2861 boolean_t indirect = FALSE;
2862 uint32_t state;
2863 int first_miss = 0;
d9a64523
A
2864
2865 state = ordered_load_mtx_state(lock);
2866
2867 /* is the interlock or mutex held */
2868 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2869 /*
2870 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2871 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2872 * set in state (state == lck_mtx_tag)
2873 */
2874
2875
2876 /* is the mutex already held and not indirect */
ea3f0419 2877 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
2878 /* no, must have been the mutex */
2879 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2880 }
2881
2882 /* check to see if it is marked destroyed */
2883 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 2884 lck_mtx_destroyed(lock);
d9a64523
A
2885 }
2886
2887 /* Is this an indirect mutex? */
2888 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2889 indirect = get_indirect_mutex(&lock, &state);
2890
2891 first_miss = 0;
0a7de745 2892 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
2893
2894 if (state & LCK_MTX_SPIN_MSK) {
ea3f0419 2895 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
d9a64523 2896 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 2897 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
2898 }
2899 }
2900
2901 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2902 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2903 }
2904 }
2905
2906 /* no - can't be INDIRECT, DESTROYED or locked */
2907 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2908 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2909 return lck_mtx_lock_contended(lock, indirect, &first_miss);
2910 }
2911 }
2912
2913 /* lock and interlock acquired */
2914
2915 thread_t thread = current_thread();
2916 /* record owner of mutex */
2917 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2918
2919#if MACH_LDEBUG
2920 if (thread) {
ea3f0419 2921 thread->mutex_count++; /* lock statistic */
d9a64523
A
2922 }
2923#endif
2924 /*
2925 * Check if there are waiters to
2926 * inherit their priority.
2927 */
2928 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
cb323159 2929 return lck_mtx_lock_acquire_tail(lock, indirect, NULL);
d9a64523
A
2930 }
2931
2932 /* release the interlock */
2933 lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2934
2935 return;
2936}
2937
2938__attribute__((noinline))
2939boolean_t
2940lck_mtx_try_lock_slow(
2941 lck_mtx_t *lock)
2942{
2943 boolean_t indirect = FALSE;
2944 uint32_t state;
2945 int first_miss = 0;
2946
2947 state = ordered_load_mtx_state(lock);
2948
2949 /* is the interlock or mutex held */
2950 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
2951 /*
2952 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2953 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2954 * set in state (state == lck_mtx_tag)
2955 */
2956
2957 /* is the mutex already held and not indirect */
ea3f0419 2958 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
2959 return FALSE;
2960 }
2961
2962 /* check to see if it is marked destroyed */
2963 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 2964 lck_mtx_try_destroyed(lock);
d9a64523
A
2965 }
2966
2967 /* Is this an indirect mutex? */
2968 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2969 indirect = get_indirect_mutex(&lock, &state);
2970
2971 first_miss = 0;
0a7de745 2972 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
2973 }
2974
2975 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 2976 if (indirect) {
0a7de745 2977 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 2978 }
d9a64523
A
2979 return FALSE;
2980 }
2981 }
2982
2983 /* no - can't be INDIRECT, DESTROYED or locked */
2984 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2985 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 2986 if (indirect) {
0a7de745 2987 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 2988 }
d9a64523
A
2989 return FALSE;
2990 }
2991 }
2992
2993 /* lock and interlock acquired */
2994
2995 thread_t thread = current_thread();
2996 /* record owner of mutex */
2997 ordered_store_mtx_owner(lock, (uintptr_t)thread);
2998
2999#if MACH_LDEBUG
3000 if (thread) {
ea3f0419 3001 thread->mutex_count++; /* lock statistic */
d9a64523
A
3002 }
3003#endif
3004 /*
3005 * Check if there are waiters to
3006 * inherit their priority.
3007 */
3008 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3009 return lck_mtx_try_lock_acquire_tail(lock);
3010 }
3011
3012 /* release the interlock */
3013 lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3014
3015 return TRUE;
d9a64523
A
3016}
3017
3018__attribute__((noinline))
3019void
3020lck_mtx_lock_spin_slow(
ea3f0419 3021 lck_mtx_t *lock)
d9a64523
A
3022{
3023 boolean_t indirect = FALSE;
3024 uint32_t state;
3025 int first_miss = 0;
3026
3027 state = ordered_load_mtx_state(lock);
3028
3029 /* is the interlock or mutex held */
3030 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3031 /*
3032 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3033 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3034 * set in state (state == lck_mtx_tag)
3035 */
3036
3037
3038 /* is the mutex already held and not indirect */
ea3f0419 3039 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3040 /* no, must have been the mutex */
3041 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3042 }
3043
3044 /* check to see if it is marked destroyed */
3045 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3046 lck_mtx_destroyed(lock);
d9a64523
A
3047 }
3048
3049 /* Is this an indirect mutex? */
3050 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3051 indirect = get_indirect_mutex(&lock, &state);
3052
3053 first_miss = 0;
0a7de745 3054 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3055
3056 if (state & LCK_MTX_SPIN_MSK) {
ea3f0419 3057 /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
d9a64523 3058 assert(state & LCK_MTX_ILOCKED_MSK);
0a7de745 3059 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
d9a64523
A
3060 }
3061 }
3062
3063 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3064 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3065 }
3066 }
3067
3068 /* no - can't be INDIRECT, DESTROYED or locked */
ea3f0419 3069 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
d9a64523
A
3070 if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3071 return lck_mtx_lock_contended(lock, indirect, &first_miss);
3072 }
3073 }
3074
3075 /* lock as spinlock and interlock acquired */
3076
3077 thread_t thread = current_thread();
3078 /* record owner of mutex */
3079 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3080
3081#if MACH_LDEBUG
3082 if (thread) {
3083 thread->mutex_count++; /* lock statistic */
3084 }
3085#endif
3086
ea3f0419 3087#if CONFIG_DTRACE
d9a64523
A
3088 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
3089#endif
3090 /* return with the interlock held and preemption disabled */
3091 return;
3092}
3093
3094__attribute__((noinline))
3095boolean_t
3096lck_mtx_try_lock_spin_slow(
3097 lck_mtx_t *lock)
3098{
3099 boolean_t indirect = FALSE;
3100 uint32_t state;
3101 int first_miss = 0;
3102
3103 state = ordered_load_mtx_state(lock);
3104
3105 /* is the interlock or mutex held */
3106 if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
3107 /*
3108 * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3109 * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3110 * set in state (state == lck_mtx_tag)
3111 */
3112
3113 /* is the mutex already held and not indirect */
ea3f0419 3114 if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))) {
d9a64523
A
3115 return FALSE;
3116 }
3117
3118 /* check to see if it is marked destroyed */
3119 if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
cb323159 3120 lck_mtx_try_destroyed(lock);
d9a64523
A
3121 }
3122
3123 /* Is this an indirect mutex? */
3124 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3125 indirect = get_indirect_mutex(&lock, &state);
3126
3127 first_miss = 0;
0a7de745 3128 lck_grp_mtx_update_held((struct _lck_mtx_ext_*)lock);
d9a64523
A
3129 }
3130
3131 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3132 if (indirect) {
0a7de745 3133 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3134 }
d9a64523
A
3135 return FALSE;
3136 }
3137 }
3138
3139 /* no - can't be INDIRECT, DESTROYED or locked */
3140 while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3141 if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
ea3f0419 3142 if (indirect) {
0a7de745 3143 lck_grp_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
ea3f0419 3144 }
d9a64523
A
3145 return FALSE;
3146 }
3147 }
3148
3149 /* lock and interlock acquired */
3150
3151 thread_t thread = current_thread();
3152 /* record owner of mutex */
3153 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3154
3155#if MACH_LDEBUG
3156 if (thread) {
ea3f0419 3157 thread->mutex_count++; /* lock statistic */
d9a64523
A
3158 }
3159#endif
3160
3161#if CONFIG_DTRACE
3162 LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
3163#endif
3164 return TRUE;
d9a64523
A
3165}
3166
3167__attribute__((noinline))
3168void
3169lck_mtx_convert_spin(
ea3f0419 3170 lck_mtx_t *lock)
d9a64523
A
3171{
3172 uint32_t state;
3173
3174 state = ordered_load_mtx_state(lock);
3175
3176 /* Is this an indirect mutex? */
3177 if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3178 /* If so, take indirection */
3179 get_indirect_mutex(&lock, &state);
3180 }
3181
3182 assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3183
3184 if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3185 /* already owned as a mutex, just return */
3186 return;
3187 }
3188
3189 assert(get_preemption_level() > 0);
3190 assert(state & LCK_MTX_ILOCKED_MSK);
3191 assert(state & LCK_MTX_SPIN_MSK);
3192
3193 /*
3194 * Check if there are waiters to
3195 * inherit their priority.
3196 */
3197 if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3198 return lck_mtx_convert_spin_acquire_tail(lock);
3199 }
3200
3201 lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3202
3203 return;
3204}
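
/*
 * Illustrative use of lck_mtx_convert_spin (sketch only): take the mutex
 * in spin mode for a short non-blocking section, then convert to a full
 * mutex before doing anything that may block.
 */
#if 0 /* example only */
static void
example_spin_then_block(lck_mtx_t *m)
{
	lck_mtx_lock_spin(m);		/* held as a spin lock, preemption disabled */
	/* ... short, non-blocking bookkeeping ... */
	lck_mtx_convert_spin(m);	/* now held as a full mutex */
	/* ... work that may block ... */
	lck_mtx_unlock(m);
}
#endif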
3205
3206static inline boolean_t
3207lck_mtx_lock_grab_mutex(
ea3f0419 3208 lck_mtx_t *lock)
d9a64523
A
3209{
3210 uint32_t state;
3211
3212 state = ordered_load_mtx_state(lock);
3213
3214 if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3215 return FALSE;
3216 }
3217
3218 /* lock and interlock acquired */
3219
3220 thread_t thread = current_thread();
3221 /* record owner of mutex */
3222 ordered_store_mtx_owner(lock, (uintptr_t)thread);
3223
3224#if MACH_LDEBUG
3225 if (thread) {
ea3f0419 3226 thread->mutex_count++; /* lock statistic */
d9a64523
A
3227 }
3228#endif
3229 return TRUE;
3230}
3231
3232__attribute__((noinline))
3233void
3234lck_mtx_assert(
ea3f0419
A
3235 lck_mtx_t *lock,
3236 unsigned int type)
d9a64523
A
3237{
3238 thread_t thread, owner;
3239 uint32_t state;
3240
3241 thread = current_thread();
3242 state = ordered_load_mtx_state(lock);
3243
3244 if (state == LCK_MTX_TAG_INDIRECT) {
3245 get_indirect_mutex(&lock, &state);
3246 }
3247
3248 owner = (thread_t)lock->lck_mtx_owner;
3249
3250 if (type == LCK_MTX_ASSERT_OWNED) {
ea3f0419 3251 if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
d9a64523 3252 panic("mutex (%p) not owned\n", lock);
ea3f0419 3253 }
d9a64523 3254 } else {
ea3f0419
A
3255 assert(type == LCK_MTX_ASSERT_NOTOWNED);
3256 if (owner == thread) {
d9a64523 3257 panic("mutex (%p) owned\n", lock);
ea3f0419 3258 }
d9a64523
A
3259 }
3260}
b0d623f7 3261
91447636 3262/*
ea3f0419 3263 * Routine: lck_mtx_lock_spinwait_x86
0c530ab8
A
3264 *
3265 * Invoked trying to acquire a mutex when there is contention but
3266 * the holder is running on another processor. We spin for up to a maximum
3267 * time waiting for the lock to be released.
3268 *
3269 * Called with the interlock unlocked.
d9a64523
A
3270 * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3271 * returns LCK_MTX_SPINWAIT_SPUN_* if we spun
3272 * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
0c530ab8 3273 */
d9a64523
A
3274__attribute__((noinline))
3275lck_mtx_spinwait_ret_type_t
b0d623f7 3276lck_mtx_lock_spinwait_x86(
ea3f0419 3277 lck_mtx_t *mutex)
0c530ab8 3278{
ea3f0419
A
3279 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3280 thread_t owner, prev_owner;
3281 uint64_t window_deadline, sliding_deadline, high_deadline;
3282 uint64_t start_time, cur_time, avg_hold_time, bias, delta;
3283 lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3284 int loopcount = 0;
3285 int total_hold_time_samples, window_hold_time_samples, unfairness;
3286 uint i, prev_owner_cpu;
3287 bool owner_on_core, adjust;
0c530ab8 3288
6d2010ae 3289 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
ea3f0419 3290 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
0c530ab8 3291
ea3f0419
A
3292 start_time = mach_absolute_time();
3293 /*
3294 * window_deadline represents the "learning" phase.
3295 * The thread collects statistics about the lock during
3296 * window_deadline and then it makes a decision on whether to spin more
3297 * or block according to the concurrency behavior
3298 * observed.
3299 *
3300 * Every thread can spin at least low_MutexSpin.
3301 */
3302 window_deadline = start_time + low_MutexSpin;
3303 /*
3304 * Sliding_deadline is the adjusted spin deadline
3305 * computed after the "learning" phase.
3306 */
3307 sliding_deadline = window_deadline;
3308 /*
3309 * High_deadline is a hard deadline. No thread
3310 * can spin more than this deadline.
3311 */
3312 if (high_MutexSpin >= 0) {
3313 high_deadline = start_time + high_MutexSpin;
3314 } else {
3315 high_deadline = start_time + low_MutexSpin * real_ncpus;
3316 }
b0d623f7 3317
ea3f0419
A
3318 /*
3319 * Do not know yet which is the owner cpu.
3320 * Initialize prev_owner_cpu with next cpu.
3321 */
3322 prev_owner_cpu = (cpu_number() + 1) % real_ncpus;
3323 total_hold_time_samples = 0;
3324 window_hold_time_samples = 0;
3325 avg_hold_time = 0;
3326 adjust = TRUE;
3327 bias = (os_hash_kernel_pointer(mutex) + cpu_number()) % real_ncpus;
3328
3329 prev_owner = (thread_t) mutex->lck_mtx_owner;
0c530ab8
A
3330 /*
3331 * Spin while:
3332 * - mutex is locked, and
ea3f0419 3333 * - it's locked as a spin lock, and
0c530ab8 3334 * - owner is running on another processor, and
0c530ab8
A
3335 * - we haven't spun for long enough.
3336 */
b0d623f7 3337 do {
ea3f0419
A
3338 /*
3339 * Try to acquire the lock.
3340 */
6d2010ae 3341 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
d9a64523 3342 retval = LCK_MTX_SPINWAIT_ACQUIRED;
b0d623f7 3343 break;
2d21ac55 3344 }
ea3f0419 3345
3e170ce0 3346 cur_time = mach_absolute_time();
b0d623f7 3347
ea3f0419
A
3348 /*
3349 * Never spin past high_deadline.
3350 */
3351 if (cur_time >= high_deadline) {
3352 retval = LCK_MTX_SPINWAIT_SPUN_HIGH_THR;
3e170ce0 3353 break;
ea3f0419 3354 }
3e170ce0 3355
ea3f0419
A
3356 /*
3357 * Check if owner is on core. If not block.
3358 */
3359 owner = (thread_t) mutex->lck_mtx_owner;
3360 if (owner) {
3361 i = prev_owner_cpu;
3362 owner_on_core = FALSE;
3363
3364 disable_preemption();
3365 owner = (thread_t) mutex->lck_mtx_owner;
3e170ce0 3366
d9a64523 3367 /*
ea3f0419
A
3368 * For scalability we want to check if the owner is on core
3369 * without locking the mutex interlock.
3370 * If we do not lock the mutex interlock, the owner that we see might be
3371 * invalid, so we cannot dereference it. Therefore we cannot check
3372 * any field of the thread to tell us if it is on core.
3373 * Check if the thread that is running on the other cpus matches the owner.
3374 */
3375 if (owner) {
3376 do {
3377 if ((cpu_data_ptr[i] != NULL) && (cpu_data_ptr[i]->cpu_active_thread == owner)) {
3378 owner_on_core = TRUE;
3379 break;
3380 }
3381 if (++i >= real_ncpus) {
3382 i = 0;
3383 }
3384 } while (i != prev_owner_cpu);
3385 enable_preemption();
3386
3387 if (owner_on_core) {
3388 prev_owner_cpu = i;
3389 } else {
3390 prev_owner = owner;
3391 owner = (thread_t) mutex->lck_mtx_owner;
3392 if (owner == prev_owner) {
3393 /*
3394 * Owner is not on core.
3395 * Stop spinning.
3396 */
3397 if (loopcount == 0) {
d9a64523 3398 retval = LCK_MTX_SPINWAIT_NO_SPIN;
ea3f0419
A
3399 } else {
3400 retval = LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE;
3401 }
3e170ce0
A
3402 break;
3403 }
ea3f0419
A
3404 /*
3405 * Fall through if the owner changed while we were scanning.
3406 * The new owner could potentially be on core, so loop
3407 * again.
3408 */
3e170ce0 3409 }
ea3f0419
A
3410 } else {
3411 enable_preemption();
3412 }
3413 }
3e170ce0 3414
ea3f0419
A
3415 /*
3416 * Save how many times we see the owner changing.
3417 * We can roughly estimate the mutex hold
3418 * time and the fairness with that.
3419 */
3420 if (owner != prev_owner) {
3421 prev_owner = owner;
3422 total_hold_time_samples++;
3423 window_hold_time_samples++;
3424 }
3425
3426 /*
3427 * Learning window expired.
3428 * Try to adjust the sliding_deadline.
3429 */
3430 if (cur_time >= window_deadline) {
3431 /*
3432 * If there was not contention during the window
3433 * stop spinning.
3434 */
3435 if (window_hold_time_samples < 1) {
3436 retval = LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION;
3437 break;
b0d623f7 3438 }
ea3f0419
A
3439
3440 if (adjust) {
3441 /*
3442 * For a fair lock, we'd wait for at most (NCPU-1) periods,
3443 * but the lock is unfair, so let's try to estimate by how much.
3444 */
3445 unfairness = total_hold_time_samples / real_ncpus;
3446
3447 if (unfairness == 0) {
3448 /*
3449 * We observed the owner changing `total_hold_time_samples` times which
3450 * let us estimate the average hold time of this mutex for the duration
3451 * of the spin time.
3452 * avg_hold_time = (cur_time - start_time) / total_hold_time_samples;
3453 *
3454 * In this case spin at max avg_hold_time * (real_ncpus - 1)
3455 */
3456 delta = cur_time - start_time;
3457 sliding_deadline = start_time + (delta * (real_ncpus - 1)) / total_hold_time_samples;
3458 } else {
3459 /*
3460 * In this case at least one of the other cpus was able to get the lock twice
3461 * while I was spinning.
3462 * We could spin longer but it won't necessarily help if the system is unfair.
3463 * Try to randomize the wait to reduce contention.
3464 *
3465 * We compute how much time we could potentially spin
3466 * and distribute it over the cpus.
3467 *
3468 * bias is an integer between 0 and real_ncpus.
3469 * distributed_increment = ((high_deadline - cur_time) / real_ncpus) * bias
3470 */
3471 delta = high_deadline - cur_time;
3472 sliding_deadline = cur_time + ((delta * bias) / real_ncpus);
3473 adjust = FALSE;
3474 }
3475 }
3476
3477 window_deadline += low_MutexSpin;
3478 window_hold_time_samples = 0;
b0d623f7 3479 }
b0d623f7 3480
ea3f0419
A
3481 /*
3482 * Stop spinning if we are past
3483 * the adjusted deadline.
3484 */
3485 if (cur_time >= sliding_deadline) {
3486 retval = LCK_MTX_SPINWAIT_SPUN_SLIDING_THR;
3487 break;
3488 }
3489
3490 if ((thread_t) mutex->lck_mtx_owner != NULL) {
3491 cpu_pause();
3492 }
cb323159 3493
ea3f0419 3494 loopcount++;
3e170ce0 3495 } while (TRUE);
b0d623f7 3496
ea3f0419 3497#if CONFIG_DTRACE
2d21ac55 3498 /*
2d21ac55 3499 * Note that we record a different probe id depending on whether
ea3f0419 3500 * this is a direct or indirect mutex. This allows us to
2d21ac55 3501 * penalize only lock groups that have debug/stats enabled
3502 * with dtrace processing if desired.
3503 */
6d2010ae 3504 if (__probable(mutex->lck_mtx_is_ext == 0)) {
b0d623f7 3505 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
ea3f0419 3506 mach_absolute_time() - start_time);
2d21ac55 3507 } else {
b0d623f7 3508 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
ea3f0419 3509 mach_absolute_time() - start_time);
2d21ac55 3510 }
3511 /* The lockstat acquire event is recorded by the assembly code beneath us. */
3512#endif
b0d623f7 3513
6d2010ae 3514 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
ea3f0419 3515 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
b0d623f7 3516
3517 return retval;
0c530ab8 3518}
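/*
 * Hedged sketch only: the constants above suggest how a caller might
 * dispatch on the spin-wait result, but the real dispatch lives in
 * lck_mtx_lock_contended (not shown in this excerpt), and both the function
 * name lck_mtx_lock_spinwait_x86 and the LCK_MTX_SPINWAIT_ACQUIRED success
 * value are assumptions, not taken from the code above.
 *
 *	switch (lck_mtx_lock_spinwait_x86(mutex)) {
 *	case LCK_MTX_SPINWAIT_ACQUIRED:
 *		// spin phase won the lock; nothing left to do
 *		break;
 *	case LCK_MTX_SPINWAIT_NO_SPIN:
 *	case LCK_MTX_SPINWAIT_SPUN_OWNER_NOT_CORE:
 *	case LCK_MTX_SPINWAIT_SPUN_NO_WINDOW_CONTENTION:
 *	case LCK_MTX_SPINWAIT_SPUN_SLIDING_THR:
 *	default:
 *		// spinning gave up; fall back to blocking via
 *		// lck_mtx_lock_wait_x86()
 *		break;
 *	}
 */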
3519
b0d623f7 3520
3521
0c530ab8 3522/*
ea3f0419 3523 * Routine: lck_mtx_lock_wait_x86
b0d623f7 3524 *
3525 * Invoked in order to wait on contention.
3526 *
3527 * Called with the interlock locked and
d9a64523 3528 * preemption disabled;
6d2010ae 3529 * returns with the interlock unlocked and preemption enabled.
d9a64523 3530 *
3531 * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3532 * A runnable waiter can exist between wait and acquire
3533 * without a waiters count being set.
3534 * This allows us to never make a spurious wakeup call.
3535 *
3536 * Priority:
3537 * This avoids taking the thread lock if the owning thread is the same priority.
3538 * This optimizes the case of same-priority threads contending on a lock.
3539 * However, that allows the owning thread to drop in priority while holding the lock,
3540 * because there is no state the priority-change path can inspect
3541 * to tell that the targeted thread holds a contended mutex.
3542 *
3543 * One possible solution: priority changes could look for some atomic tag
3544 * on the thread saying 'holding contended lock', and then set up a promotion.
3545 * Needs a story for dropping that promotion - the last contended unlock
3546 * has to notice that this has happened.
0c530ab8 3547 */
d9a64523 3548__attribute__((noinline))
0c530ab8 3549void
ea3f0419 3550 lck_mtx_lock_wait_x86(
3551 lck_mtx_t *mutex,
cb323159 3552 struct turnstile **ts)
0c530ab8 3553{
cb323159 3554 thread_t self = current_thread();
3555
ea3f0419 3556#if CONFIG_DTRACE
d9a64523 3557 uint64_t sleep_start = 0;
b0d623f7 3558
3559 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3560 sleep_start = mach_absolute_time();
3561 }
3562#endif
d9a64523 3563 __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3564
6d2010ae 3565 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
ea3f0419 3566 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3567 mutex->lck_mtx_waiters, 0, 0);
b0d623f7 3568
cb323159 3569 assert(self->waiting_for_mutex == NULL);
3570 self->waiting_for_mutex = mutex;
3571 mutex->lck_mtx_waiters++;
39236c6e 3572
d9a64523 3573 thread_t holder = (thread_t)mutex->lck_mtx_owner;
d9a64523 3574 assert(holder != NULL);
3575
3576 /*
cb323159 3577 * lck_mtx_lock_wait_x86 might be called in a loop. Call turnstile_prepare just once and
3578 * reuse the same turnstile while looping; the matching turnstile_complete will be called
3579 * by lck_mtx_lock_contended when finally acquiring the lock.
d9a64523 3580 */
cb323159 3581 if (*ts == NULL) {
3582 *ts = turnstile_prepare((uintptr_t)mutex, NULL, TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
b0d623f7 3583 }
d9a64523 3584
cb323159 3585 struct turnstile *turnstile = *ts;
813fb2f6 3586 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
cb323159 3587 turnstile_update_inheritor(turnstile, holder, (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
3588
3589 waitq_assert_wait64(&turnstile->ts_waitq, CAST_EVENT64_T(LCK_MTX_EVENT(mutex)), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER, TIMEOUT_WAIT_FOREVER);
b0d623f7 3590
3591 lck_mtx_ilk_unlock(mutex);
3592
cb323159 3593 turnstile_update_inheritor_complete(turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
3594
b0d623f7 3595 thread_block(THREAD_CONTINUE_NULL);
3596
d9a64523 3597 self->waiting_for_mutex = NULL;
3598
6d2010ae 3599 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
ea3f0419 3600 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3601 mutex->lck_mtx_waiters, 0, 0);
b0d623f7 3602
ea3f0419 3603#if CONFIG_DTRACE
b0d623f7 3604 /*
3605 * Record the DTrace lockstat probe for blocking; block time is
3606 * measured from when this function was entered.
3607 */
3608 if (sleep_start) {
6d2010ae 3609 if (mutex->lck_mtx_is_ext == 0) {
b0d623f7 3610 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3611 mach_absolute_time() - sleep_start);
3612 } else {
3613 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3614 mach_absolute_time() - sleep_start);
3615 }
3616 }
3617#endif
0c530ab8 3618}
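/*
 * Hedged sketch of the caller-side turnstile lifecycle described in the
 * comment inside lck_mtx_lock_wait_x86 above: prepare once, possibly wait
 * several times, complete exactly once after the lock is finally acquired.
 * This is not the lck_mtx_lock_contended implementation; the helpers
 * take_interlock(), try_acquire_with_interlock_held() and finish_acquire()
 * are hypothetical placeholders, and the exact turnstile_complete()
 * arguments are an assumption modeled on the turnstile_prepare() call above.
 *
 *	struct turnstile *ts = TURNSTILE_NULL;
 *
 *	for (;;) {
 *		take_interlock(mutex);                          // hypothetical
 *		if (try_acquire_with_interlock_held(mutex)) {   // hypothetical
 *			break;                                  // got the mutex
 *		}
 *		// Blocks; returns with the interlock dropped and preemption
 *		// re-enabled, reusing the same turnstile on every iteration.
 *		lck_mtx_lock_wait_x86(mutex, &ts);
 *	}
 *	if (ts != TURNSTILE_NULL) {
 *		turnstile_complete((uintptr_t)mutex, &ts, NULL, TURNSTILE_KERNEL_MUTEX);
 *	}
 *	finish_acquire(mutex);                                  // hypothetical
 */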
3e170ce0 3619
3620/*
3621 * Routine: kdp_lck_mtx_lock_spin_is_acquired
3622 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3623 * Returns: TRUE if lock is acquired.
3624 */
3625boolean_t
ea3f0419 3626kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3e170ce0 3627{
3628 if (not_in_kdp) {
3629 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3630 }
3631
39037602 3632 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3e170ce0 3633 return TRUE;
3634 }
3635
3636 return FALSE;
3637}
3638
813fb2f6 3639void
3640kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3641{
3642 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3643 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3644 thread_t holder = (thread_t)mutex->lck_mtx_owner;
3645 waitinfo->owner = thread_tid(holder);
3646}
3647
3648void
3649kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3650{
3651 lck_rw_t *rwlck = NULL;
ea3f0419 3652 switch (waitinfo->wait_type) {
3653 case kThreadWaitKernelRWLockRead:
3654 rwlck = READ_EVENT_TO_RWLOCK(event);
3655 break;
3656 case kThreadWaitKernelRWLockWrite:
3657 case kThreadWaitKernelRWLockUpgrade:
3658 rwlck = WRITE_EVENT_TO_RWLOCK(event);
3659 break;
3660 default:
3661 panic("%s was called with an invalid blocking type", __FUNCTION__);
3662 break;
813fb2f6 3663 }
3664 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3665 waitinfo->owner = 0;
3666}