[apple/xnu.git] / osfmk / i386 / locks_i386.c (xnu-4570.41.2)
91447636 1/*
39236c6e 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
91447636 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
91447636 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
91447636
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
91447636
A
64#include <mach_ldebug.h>
65
91447636
A
66#include <kern/locks.h>
67#include <kern/kalloc.h>
68#include <kern/misc_protos.h>
69#include <kern/thread.h>
70#include <kern/processor.h>
71#include <kern/cpu_data.h>
72#include <kern/cpu_number.h>
73#include <kern/sched_prim.h>
74#include <kern/xpr.h>
75#include <kern/debug.h>
76#include <string.h>
77
060df5ea 78#include <i386/machine_routines.h> /* machine_timeout_suspended() */
5ba3f43e 79#include <machine/atomic.h>
b0d623f7 80#include <machine/machine_cpu.h>
060df5ea 81#include <i386/mp.h>
91447636
A
82
83#include <sys/kdebug.h>
6d2010ae 84#include <mach/branch_predicates.h>
91447636 85
2d21ac55
A
86/*
87 * We need only enough declarations from the BSD-side to be able to
88 * test if our probe is active, and to call __dtrace_probe(). Setting
89 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
90 */
91#if CONFIG_DTRACE
92#define NEED_DTRACE_DEFS
93#include <../bsd/sys/lockstat.h>
5ba3f43e
A
94
95#define DTRACE_RW_SHARED 0x0 //reader
96#define DTRACE_RW_EXCL 0x1 //writer
97#define DTRACE_NO_FLAG 0x0 //not applicable
98
2d21ac55
A
99#endif
100
91447636
A
101#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
102#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
103#define LCK_RW_LCK_SHARED_CODE 0x102
104#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
105#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
106#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
107
b0d623f7
A
108#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
109#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
110#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
111#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
112#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
113#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
114#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
115#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
116
91447636
A
117
118#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
119
120unsigned int LcksOpts=0;
91447636 121
5ba3f43e
A
122#if DEVELOPMENT || DEBUG
123unsigned int LckDisablePreemptCheck = 0;
124#endif
125
91447636
A
126/* Forwards */
127
91447636
A
128#if USLOCK_DEBUG
129/*
130 * Perform simple lock checks.
131 */
132int uslock_check = 1;
133int max_lock_loops = 100000000;
134decl_simple_lock_data(extern , printf_lock)
135decl_simple_lock_data(extern , panic_lock)
91447636
A
136#endif /* USLOCK_DEBUG */
137
fe8ab488 138extern unsigned int not_in_kdp;
91447636
A
139
140/*
141 * We often want to know the addresses of the callers
142 * of the various lock routines. However, this information
143 * is only used for debugging and statistics.
144 */
145typedef void *pc_t;
146#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
147#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
148#if ANY_LOCK_DEBUG
b0d623f7 149#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
91447636
A
150#define DECL_PC(pc) pc_t pc;
151#else /* ANY_LOCK_DEBUG */
152#define DECL_PC(pc)
153#ifdef lint
154/*
155 * Eliminate lint complaints about unused local pc variables.
156 */
b0d623f7 157#define OBTAIN_PC(pc) ++pc
91447636 158#else /* lint */
b0d623f7 159#define OBTAIN_PC(pc)
91447636
A
160#endif /* lint */
 161#endif /* ANY_LOCK_DEBUG */
162
5ba3f43e
A
163// Enforce program order of loads and stores.
164#define ordered_load(target) _Generic( (target),\
165 uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
166 uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
167#define ordered_store(target, value) _Generic( (target),\
168 uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \
169 uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) )
170
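/*
 * Example (illustrative sketch, not compiled): ordered_load()/ordered_store()
 * dispatch on the pointer type and issue a relaxed C11 atomic access, e.g.
 * polling the 32-bit lck_rw_t state word without tearing:
 */
#if 0
static boolean_t
example_rw_interlock_is_held(lck_rw_t *lock)
{
	return (ordered_load(&lock->data) & LCK_RW_INTERLOCK) != 0;
}
#endif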
171/*
172 * atomic exchange API is a low level abstraction of the operations
173 * to atomically read, modify, and write a pointer. This abstraction works
174 * for both Intel and ARMv8.1 compare and exchange atomic instructions as
175 * well as the ARM exclusive instructions.
176 *
177 * atomic_exchange_begin() - begin exchange and retrieve current value
178 * atomic_exchange_complete() - conclude an exchange
179 * atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
180 */
181static uint32_t
182atomic_exchange_begin32(uint32_t *target, uint32_t *previous, enum memory_order ord)
183{
184 uint32_t val;
185
186 (void)ord; // Memory order not used
187 val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
188 *previous = val;
189 return val;
190}
191
192static boolean_t
193atomic_exchange_complete32(uint32_t *target, uint32_t previous, uint32_t newval, enum memory_order ord)
194{
195 return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
196}
197
198static void
199atomic_exchange_abort(void) { }
200
201static boolean_t
202atomic_test_and_set32(uint32_t *target, uint32_t test_mask, uint32_t set_mask, enum memory_order ord, boolean_t wait)
203{
204 uint32_t value, prev;
205
206 for ( ; ; ) {
207 value = atomic_exchange_begin32(target, &prev, ord);
208 if (value & test_mask) {
209 if (wait)
210 cpu_pause();
211 else
212 atomic_exchange_abort();
213 return FALSE;
214 }
215 value |= set_mask;
216 if (atomic_exchange_complete32(target, prev, value, ord))
217 return TRUE;
218 }
219}
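/*
 * Example (illustrative sketch, not compiled): callers use
 * atomic_test_and_set32() for one-shot state transitions, e.g. claiming
 * exclusive ownership of an rw lock only when no reader, upgrader, writer
 * or interlock bit is set; this mirrors the lck_rw_lock_exclusive() fast
 * path further down in this file:
 */
#if 0
static boolean_t
example_try_claim_exclusive(lck_rw_t *lock)
{
	return atomic_test_and_set32(&lock->data,
	    (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
	    LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE);
}
#endif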
91447636
A
220
221/*
222 * Portable lock package implementation of usimple_locks.
223 */
224
225#if USLOCK_DEBUG
226#define USLDBG(stmt) stmt
227void usld_lock_init(usimple_lock_t, unsigned short);
228void usld_lock_pre(usimple_lock_t, pc_t);
229void usld_lock_post(usimple_lock_t, pc_t);
230void usld_unlock(usimple_lock_t, pc_t);
231void usld_lock_try_pre(usimple_lock_t, pc_t);
232void usld_lock_try_post(usimple_lock_t, pc_t);
233int usld_lock_common_checks(usimple_lock_t, char *);
234#else /* USLOCK_DEBUG */
235#define USLDBG(stmt)
236#endif /* USLOCK_DEBUG */
237
b0d623f7 238
2d21ac55
A
239/*
240 * Forward definitions
241 */
242
5ba3f43e
A
243static void lck_rw_lock_shared_gen(lck_rw_t *lck);
244static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
245static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
246static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
247static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
248static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
39236c6e 249void lck_rw_clear_promotions_x86(thread_t thread);
5ba3f43e
A
250static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
251static boolean_t lck_rw_grab_want(lck_rw_t *lock);
252static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
39236c6e 253
91447636
A
254/*
255 * Routine: lck_spin_alloc_init
256 */
257lck_spin_t *
258lck_spin_alloc_init(
259 lck_grp_t *grp,
260 lck_attr_t *attr)
261{
262 lck_spin_t *lck;
263
264 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
265 lck_spin_init(lck, grp, attr);
266
267 return(lck);
268}
269
270/*
271 * Routine: lck_spin_free
272 */
273void
274lck_spin_free(
275 lck_spin_t *lck,
276 lck_grp_t *grp)
277{
278 lck_spin_destroy(lck, grp);
279 kfree(lck, sizeof(lck_spin_t));
280}
281
282/*
283 * Routine: lck_spin_init
284 */
285void
286lck_spin_init(
287 lck_spin_t *lck,
288 lck_grp_t *grp,
289 __unused lck_attr_t *attr)
290{
291 usimple_lock_init((usimple_lock_t) lck, 0);
292 lck_grp_reference(grp);
293 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
294}
295
296/*
297 * Routine: lck_spin_destroy
298 */
299void
300lck_spin_destroy(
301 lck_spin_t *lck,
302 lck_grp_t *grp)
303{
b0d623f7 304 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
91447636 305 return;
b0d623f7 306 lck->interlock = LCK_SPIN_TAG_DESTROYED;
91447636
A
307 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
308 lck_grp_deallocate(grp);
309 return;
310}
311
312/*
313 * Routine: lck_spin_lock
314 */
315void
316lck_spin_lock(
317 lck_spin_t *lck)
318{
319 usimple_lock((usimple_lock_t) lck);
320}
321
322/*
323 * Routine: lck_spin_unlock
324 */
325void
326lck_spin_unlock(
327 lck_spin_t *lck)
328{
329 usimple_unlock((usimple_lock_t) lck);
330}
331
332
333/*
334 * Routine: lck_spin_try_lock
335 */
336boolean_t
337lck_spin_try_lock(
338 lck_spin_t *lck)
339{
39037602
A
340 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
341#if DEVELOPMENT || DEBUG
342 if (lrval) {
343 pltrace(FALSE);
344 }
345#endif
346 return(lrval);
347}
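/*
 * Example (illustrative sketch, not compiled): typical spin lock life cycle
 * using the routines above; the group and the counter being protected are
 * hypothetical.
 */
#if 0
static void
example_spin_lock_usage(lck_grp_t *grp, int *protected_counter)
{
	lck_spin_t *lock = lck_spin_alloc_init(grp, LCK_ATTR_NULL);

	lck_spin_lock(lock);			/* returns with preemption disabled */
	(*protected_counter)++;			/* keep the critical section short */
	lck_spin_unlock(lock);			/* re-enables preemption */

	if (lck_spin_try_lock(lock)) {		/* non-blocking attempt */
		(*protected_counter)++;
		lck_spin_unlock(lock);
	}
	lck_spin_free(lock, grp);
}
#endif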
348
349/*
350 * Routine: lck_spin_assert
351 */
352void
353lck_spin_assert(lck_spin_t *lock, unsigned int type)
354{
355 thread_t thread, holder;
356 uintptr_t state;
357
358 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
359 panic("lck_spin_assert(): invalid arg (%u)", type);
360 }
361
362 state = lock->interlock;
363 holder = (thread_t)state;
364 thread = current_thread();
365 if (type == LCK_ASSERT_OWNED) {
366 if (__improbable(holder == THREAD_NULL)) {
367 panic("Lock not owned %p = %lx", lock, state);
368 }
369 if (__improbable(holder != thread)) {
370 panic("Lock not owned by current thread %p = %lx", lock, state);
371 }
372 } else if (type == LCK_ASSERT_NOTOWNED) {
373 if (__improbable(holder != THREAD_NULL)) {
374 if (holder == thread) {
375 panic("Lock owned by current thread %p = %lx", lock, state);
376 } else {
377 panic("Lock %p owned by thread %p", lock, holder);
378 }
379 }
380 }
91447636
A
381}
382
fe8ab488 383/*
3e170ce0 384 * Routine: kdp_lck_spin_is_acquired
fe8ab488
A
385 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
386 * Returns: TRUE if lock is acquired.
387 */
388boolean_t
3e170ce0 389kdp_lck_spin_is_acquired(lck_spin_t *lck) {
fe8ab488
A
390 if (not_in_kdp) {
 391 panic("spinlock acquired check done outside of kernel debugger");
392 }
393 return (lck->interlock != 0)? TRUE : FALSE;
394}
395
91447636
A
396/*
397 * Initialize a usimple_lock.
398 *
399 * No change in preemption state.
400 */
401void
402usimple_lock_init(
403 usimple_lock_t l,
404 __unused unsigned short tag)
405{
406#ifndef MACHINE_SIMPLE_LOCK
407 USLDBG(usld_lock_init(l, tag));
408 hw_lock_init(&l->interlock);
409#else
410 simple_lock_init((simple_lock_t)l,tag);
411#endif
412}
413
060df5ea
A
414volatile uint32_t spinlock_owner_cpu = ~0;
415volatile usimple_lock_t spinlock_timed_out;
416
fe8ab488 417uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
060df5ea
A
418 uint32_t i;
419
420 for (i = 0; i < real_ncpus; i++) {
421 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
422 spinlock_owner_cpu = i;
5ba3f43e
A
423 if ((uint32_t) cpu_number() != i) {
424 /* Cause NMI and panic on the owner's cpu */
425 NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
426 }
060df5ea
A
427 break;
428 }
429 }
430
431 return spinlock_owner_cpu;
432}
91447636
A
433
434/*
435 * Acquire a usimple_lock.
436 *
437 * Returns with preemption disabled. Note
438 * that the hw_lock routines are responsible for
439 * maintaining preemption state.
440 */
441void
442usimple_lock(
443 usimple_lock_t l)
444{
445#ifndef MACHINE_SIMPLE_LOCK
2d21ac55 446 DECL_PC(pc);
91447636 447
b0d623f7 448 OBTAIN_PC(pc);
91447636 449 USLDBG(usld_lock_pre(l, pc));
6d2010ae
A
450
451 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
b0d623f7 452 boolean_t uslock_acquired = FALSE;
060df5ea
A
453 while (machine_timeout_suspended()) {
454 enable_preemption();
455 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
456 break;
6d2010ae
A
457 }
458
060df5ea
A
459 if (uslock_acquired == FALSE) {
460 uint32_t lock_cpu;
7ddcb079 461 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
060df5ea 462 spinlock_timed_out = l;
7ddcb079 463 lock_cpu = spinlock_timeout_NMI(lowner);
5ba3f43e
A
464 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
465 l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
060df5ea 466 }
b0d623f7 467 }
39037602
A
468#if DEVELOPMENT || DEBUG
469 pltrace(FALSE);
470#endif
471
91447636
A
472 USLDBG(usld_lock_post(l, pc));
473#else
474 simple_lock((simple_lock_t)l);
475#endif
5ba3f43e
A
476#if CONFIG_DTRACE
477 LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, 0);
478#endif
91447636
A
479}
480
481
482/*
483 * Release a usimple_lock.
484 *
485 * Returns with preemption enabled. Note
486 * that the hw_lock routines are responsible for
487 * maintaining preemption state.
488 */
489void
490usimple_unlock(
491 usimple_lock_t l)
492{
493#ifndef MACHINE_SIMPLE_LOCK
494 DECL_PC(pc);
495
b0d623f7 496 OBTAIN_PC(pc);
91447636 497 USLDBG(usld_unlock(l, pc));
39037602
A
498#if DEVELOPMENT || DEBUG
499 pltrace(TRUE);
500#endif
91447636
A
501 hw_lock_unlock(&l->interlock);
502#else
503 simple_unlock_rwmb((simple_lock_t)l);
504#endif
505}
506
507
508/*
509 * Conditionally acquire a usimple_lock.
510 *
511 * On success, returns with preemption disabled.
512 * On failure, returns with preemption in the same state
513 * as when first invoked. Note that the hw_lock routines
514 * are responsible for maintaining preemption state.
515 *
516 * XXX No stats are gathered on a miss; I preserved this
517 * behavior from the original assembly-language code, but
518 * doesn't it make sense to log misses? XXX
519 */
520unsigned int
521usimple_lock_try(
522 usimple_lock_t l)
523{
524#ifndef MACHINE_SIMPLE_LOCK
91447636 525 unsigned int success;
2d21ac55 526 DECL_PC(pc);
91447636 527
b0d623f7 528 OBTAIN_PC(pc);
91447636
A
529 USLDBG(usld_lock_try_pre(l, pc));
530 if ((success = hw_lock_try(&l->interlock))) {
39037602
A
531#if DEVELOPMENT || DEBUG
532 pltrace(FALSE);
533#endif
534 USLDBG(usld_lock_try_post(l, pc));
91447636
A
535 }
536 return success;
537#else
538 return(simple_lock_try((simple_lock_t)l));
539#endif
540}
541
39037602
A
542/*
543 * Acquire a usimple_lock while polling for pending TLB flushes
544 * and spinning on a lock.
545 *
546 */
547void
548usimple_lock_try_lock_loop(usimple_lock_t l)
549{
550 boolean_t istate = ml_get_interrupts_enabled();
551 while (!simple_lock_try((l))) {
552 if (!istate)
553 handle_pending_TLB_flushes();
554 cpu_pause();
555 }
556}
557
91447636
A
558#if USLOCK_DEBUG
559/*
560 * States of a usimple_lock. The default when initializing
561 * a usimple_lock is setting it up for debug checking.
562 */
563#define USLOCK_CHECKED 0x0001 /* lock is being checked */
564#define USLOCK_TAKEN 0x0002 /* lock has been taken */
565#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
566#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
567#define USLOCK_CHECKING(l) (uslock_check && \
568 ((l)->debug.state & USLOCK_CHECKED))
569
570/*
571 * Trace activities of a particularly interesting lock.
572 */
573void usl_trace(usimple_lock_t, int, pc_t, const char *);
574
575
576/*
577 * Initialize the debugging information contained
578 * in a usimple_lock.
579 */
580void
581usld_lock_init(
582 usimple_lock_t l,
583 __unused unsigned short tag)
584{
585 if (l == USIMPLE_LOCK_NULL)
586 panic("lock initialization: null lock pointer");
587 l->lock_type = USLOCK_TAG;
588 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
589 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
590 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
591 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
592 l->debug.duration[0] = l->debug.duration[1] = 0;
 593 l->debug.unlock_cpu = 0;
 594 l->debug.unlock_pc = INVALID_PC;
 595 l->debug.unlock_thread = INVALID_THREAD;
596}
597
598
599/*
600 * These checks apply to all usimple_locks, not just
601 * those with USLOCK_CHECKED turned on.
602 */
603int
604usld_lock_common_checks(
605 usimple_lock_t l,
606 char *caller)
607{
608 if (l == USIMPLE_LOCK_NULL)
609 panic("%s: null lock pointer", caller);
610 if (l->lock_type != USLOCK_TAG)
ebb1b9f4 611 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
91447636 612 if (!(l->debug.state & USLOCK_INIT))
ebb1b9f4 613 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
91447636
A
614 return USLOCK_CHECKING(l);
615}
616
617
618/*
619 * Debug checks on a usimple_lock just before attempting
620 * to acquire it.
621 */
622/* ARGSUSED */
623void
624usld_lock_pre(
625 usimple_lock_t l,
626 pc_t pc)
627{
628 char caller[] = "usimple_lock";
629
630
631 if (!usld_lock_common_checks(l, caller))
632 return;
633
634/*
 635 * Note that we have a weird case where we are getting a lock when we are
636 * in the process of putting the system to sleep. We are running with no
637 * current threads, therefore we can't tell if we are trying to retake a lock
638 * we have or someone on the other processor has it. Therefore we just
639 * ignore this test if the locking thread is 0.
640 */
641
642 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
643 l->debug.lock_thread == (void *) current_thread()) {
2d21ac55
A
644 printf("%s: lock %p already locked (at %p) by",
645 caller, l, l->debug.lock_pc);
646 printf(" current thread %p (new attempt at pc %p)\n",
91447636 647 l->debug.lock_thread, pc);
2d21ac55 648 panic("%s", caller);
91447636
A
649 }
650 mp_disable_preemption();
651 usl_trace(l, cpu_number(), pc, caller);
652 mp_enable_preemption();
653}
654
655
656/*
657 * Debug checks on a usimple_lock just after acquiring it.
658 *
659 * Pre-emption has been disabled at this point,
660 * so we are safe in using cpu_number.
661 */
662void
663usld_lock_post(
664 usimple_lock_t l,
665 pc_t pc)
666{
39037602 667 int mycpu;
91447636
A
668 char caller[] = "successful usimple_lock";
669
670
671 if (!usld_lock_common_checks(l, caller))
672 return;
673
674 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
b0d623f7
A
675 panic("%s: lock %p became uninitialized",
676 caller, l);
91447636 677 if ((l->debug.state & USLOCK_TAKEN))
b0d623f7
A
678 panic("%s: lock 0x%p became TAKEN by someone else",
679 caller, l);
91447636
A
680
681 mycpu = cpu_number();
682 l->debug.lock_thread = (void *)current_thread();
683 l->debug.state |= USLOCK_TAKEN;
684 l->debug.lock_pc = pc;
685 l->debug.lock_cpu = mycpu;
686
687 usl_trace(l, mycpu, pc, caller);
688}
689
690
691/*
692 * Debug checks on a usimple_lock just before
693 * releasing it. Note that the caller has not
694 * yet released the hardware lock.
695 *
696 * Preemption is still disabled, so there's
697 * no problem using cpu_number.
698 */
699void
700usld_unlock(
701 usimple_lock_t l,
702 pc_t pc)
703{
39037602 704 int mycpu;
91447636
A
705 char caller[] = "usimple_unlock";
706
707
708 if (!usld_lock_common_checks(l, caller))
709 return;
710
711 mycpu = cpu_number();
712
713 if (!(l->debug.state & USLOCK_TAKEN))
b0d623f7
A
714 panic("%s: lock 0x%p hasn't been taken",
715 caller, l);
91447636 716 if (l->debug.lock_thread != (void *) current_thread())
b0d623f7
A
717 panic("%s: unlocking lock 0x%p, owned by thread %p",
718 caller, l, l->debug.lock_thread);
91447636 719 if (l->debug.lock_cpu != mycpu) {
b0d623f7
A
720 printf("%s: unlocking lock 0x%p on cpu 0x%x",
721 caller, l, mycpu);
91447636 722 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
2d21ac55 723 panic("%s", caller);
91447636
A
724 }
725 usl_trace(l, mycpu, pc, caller);
726
727 l->debug.unlock_thread = l->debug.lock_thread;
 728 l->debug.lock_thread = INVALID_THREAD;
729 l->debug.state &= ~USLOCK_TAKEN;
730 l->debug.unlock_pc = pc;
731 l->debug.unlock_cpu = mycpu;
732}
733
734
735/*
736 * Debug checks on a usimple_lock just before
737 * attempting to acquire it.
738 *
739 * Preemption isn't guaranteed to be disabled.
740 */
741void
742usld_lock_try_pre(
743 usimple_lock_t l,
744 pc_t pc)
745{
746 char caller[] = "usimple_lock_try";
747
748 if (!usld_lock_common_checks(l, caller))
749 return;
750 mp_disable_preemption();
751 usl_trace(l, cpu_number(), pc, caller);
752 mp_enable_preemption();
753}
754
755
756/*
757 * Debug checks on a usimple_lock just after
758 * successfully attempting to acquire it.
759 *
760 * Preemption has been disabled by the
761 * lock acquisition attempt, so it's safe
762 * to use cpu_number.
763 */
764void
765usld_lock_try_post(
766 usimple_lock_t l,
767 pc_t pc)
768{
39037602 769 int mycpu;
91447636
A
770 char caller[] = "successful usimple_lock_try";
771
772 if (!usld_lock_common_checks(l, caller))
773 return;
774
775 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
b0d623f7
A
776 panic("%s: lock 0x%p became uninitialized",
777 caller, l);
91447636 778 if ((l->debug.state & USLOCK_TAKEN))
b0d623f7
A
779 panic("%s: lock 0x%p became TAKEN by someone else",
780 caller, l);
91447636
A
781
782 mycpu = cpu_number();
783 l->debug.lock_thread = (void *) current_thread();
784 l->debug.state |= USLOCK_TAKEN;
785 l->debug.lock_pc = pc;
786 l->debug.lock_cpu = mycpu;
787
788 usl_trace(l, mycpu, pc, caller);
789}
790
791
792/*
793 * For very special cases, set traced_lock to point to a
794 * specific lock of interest. The result is a series of
795 * XPRs showing lock operations on that lock. The lock_seq
796 * value is used to show the order of those operations.
797 */
798usimple_lock_t traced_lock;
799unsigned int lock_seq;
800
801void
802usl_trace(
803 usimple_lock_t l,
804 int mycpu,
805 pc_t pc,
806 const char * op_name)
807{
808 if (traced_lock == l) {
809 XPR(XPR_SLOCK,
810 "seq %d, cpu %d, %s @ %x\n",
b0d623f7
A
811 (uintptr_t) lock_seq, (uintptr_t) mycpu,
812 (uintptr_t) op_name, (uintptr_t) pc, 0);
91447636
A
813 lock_seq++;
814 }
815}
816
817
818#endif /* USLOCK_DEBUG */
819
91447636
A
820/*
821 * Routine: lck_rw_alloc_init
822 */
823lck_rw_t *
824lck_rw_alloc_init(
825 lck_grp_t *grp,
826 lck_attr_t *attr) {
827 lck_rw_t *lck;
828
b0d623f7
A
829 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
830 bzero(lck, sizeof(lck_rw_t));
91447636 831 lck_rw_init(lck, grp, attr);
b0d623f7
A
832 }
833
91447636
A
834 return(lck);
835}
836
837/*
838 * Routine: lck_rw_free
839 */
840void
841lck_rw_free(
842 lck_rw_t *lck,
843 lck_grp_t *grp) {
844 lck_rw_destroy(lck, grp);
845 kfree(lck, sizeof(lck_rw_t));
846}
847
848/*
849 * Routine: lck_rw_init
850 */
851void
852lck_rw_init(
853 lck_rw_t *lck,
854 lck_grp_t *grp,
0c530ab8
A
855 lck_attr_t *attr)
856{
857 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
858 attr : &LockDefaultLckAttr;
91447636 859
2d21ac55
A
860 hw_lock_byte_init(&lck->lck_rw_interlock);
861 lck->lck_rw_want_write = FALSE;
862 lck->lck_rw_want_upgrade = FALSE;
863 lck->lck_rw_shared_count = 0;
864 lck->lck_rw_can_sleep = TRUE;
b0d623f7 865 lck->lck_r_waiting = lck->lck_w_waiting = 0;
91447636 866 lck->lck_rw_tag = 0;
2d21ac55
A
867 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
868 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
91447636
A
869
870 lck_grp_reference(grp);
871 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
872}
873
874/*
875 * Routine: lck_rw_destroy
876 */
877void
878lck_rw_destroy(
879 lck_rw_t *lck,
b0d623f7
A
880 lck_grp_t *grp)
881{
91447636
A
882 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
883 return;
39236c6e
A
884#if MACH_LDEBUG
885 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
886#endif
91447636
A
887 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
888 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
889 lck_grp_deallocate(grp);
890 return;
891}
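/*
 * Example (illustrative sketch, not compiled): typical read-write lock life
 * cycle using the routines in this file; the lock group is assumed to have
 * been set up elsewhere (e.g. with lck_grp_alloc_init()).
 */
#if 0
static void
example_rw_lock_usage(lck_grp_t *grp)
{
	lck_rw_t *lock = lck_rw_alloc_init(grp, LCK_ATTR_NULL);

	lck_rw_lock(lock, LCK_RW_TYPE_SHARED);		/* many readers may hold this concurrently */
	/* ... read the protected state ... */
	lck_rw_unlock(lock, LCK_RW_TYPE_SHARED);

	lck_rw_lock(lock, LCK_RW_TYPE_EXCLUSIVE);	/* one writer, excludes all readers */
	/* ... modify the protected state ... */
	lck_rw_done(lock);				/* lck_rw_done() infers the held mode */

	lck_rw_free(lock, grp);
}
#endif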
892
893/*
894 * Sleep locks. These use the same data structure and algorithm
895 * as the spin locks, but the process sleeps while it is waiting
896 * for the lock. These work on uniprocessor systems.
897 */
898
899#define DECREMENTER_TIMEOUT 1000000
900
91447636 901/*
6d2010ae
A
902 * We disable interrupts while holding the RW interlock to prevent an
903 * interrupt from exacerbating hold time.
91447636
A
904 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
905 */
5ba3f43e 906static inline boolean_t
91447636
A
907lck_interlock_lock(lck_rw_t *lck)
908{
909 boolean_t istate;
910
911 istate = ml_set_interrupts_enabled(FALSE);
2d21ac55 912 hw_lock_byte_lock(&lck->lck_rw_interlock);
91447636
A
913 return istate;
914}
915
5ba3f43e 916static inline void
91447636
A
917lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
918{
2d21ac55 919 hw_lock_byte_unlock(&lck->lck_rw_interlock);
91447636
A
920 ml_set_interrupts_enabled(istate);
921}
922
0c530ab8
A
923/*
924 * This inline is used when busy-waiting for an rw lock.
925 * If interrupts were disabled when the lock primitive was called,
926 * we poll the IPI handler for pending tlb flushes.
927 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
928 */
929static inline void
930lck_rw_lock_pause(boolean_t interrupts_enabled)
931{
932 if (!interrupts_enabled)
933 handle_pending_TLB_flushes();
934 cpu_pause();
935}
936
5ba3f43e
A
937static inline boolean_t
938lck_rw_held_read_or_upgrade(lck_rw_t *lock)
939{
940 if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK | LCK_RW_INTERLOCK | LCK_RW_WANT_UPGRADE))
941 return TRUE;
942 return FALSE;
943}
b0d623f7
A
944
945/*
946 * compute the deadline to spin against when
947 * waiting for a change of state on a lck_rw_t
948 */
949static inline uint64_t
950lck_rw_deadline_for_spin(lck_rw_t *lck)
951{
952 if (lck->lck_rw_can_sleep) {
953 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
954 /*
955 * there are already threads waiting on this lock... this
956 * implies that they have spun beyond their deadlines waiting for
957 * the desired state to show up so we will not bother spinning at this time...
958 * or
959 * the current number of threads sharing this lock exceeds our capacity to run them
960 * concurrently and since all states we're going to spin for require the rw_shared_count
961 * to be at 0, we'll not bother spinning since the latency for this to happen is
962 * unpredictable...
963 */
964 return (mach_absolute_time());
965 }
966 return (mach_absolute_time() + MutexSpin);
967 } else
968 return (mach_absolute_time() + (100000LL * 1000000000LL));
969}
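/*
 * Illustrative sketch (not compiled): the slow paths below spin against the
 * deadline computed here before deciding to block, roughly as follows.
 */
#if 0
static boolean_t
example_spin_until_unheld(lck_rw_t *lck)
{
	uint64_t deadline = lck_rw_deadline_for_spin(lck);

	while (mach_absolute_time() < deadline) {
		if (!lck_rw_held_read_or_upgrade(lck))
			return TRUE;		/* desired state appeared while spinning */
		cpu_pause();
	}
	return FALSE;				/* deadline expired; caller should block */
}
#endif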
970
971
5ba3f43e
A
972/*
973 * Spin while interlock is held.
974 */
975
976static inline void
977lck_rw_interlock_spin(lck_rw_t *lock)
978{
979 while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
980 cpu_pause();
981 }
982}
983
984static boolean_t
985lck_rw_grab_want(lck_rw_t *lock)
986{
987 uint32_t data, prev;
988
989 for ( ; ; ) {
990 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
991 if ((data & LCK_RW_INTERLOCK) == 0)
992 break;
993 atomic_exchange_abort();
994 lck_rw_interlock_spin(lock);
995 }
996 if (data & LCK_RW_WANT_WRITE) {
997 atomic_exchange_abort();
998 return FALSE;
999 }
1000 data |= LCK_RW_WANT_WRITE;
1001 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1002}
1003
1004static boolean_t
1005lck_rw_grab_shared(lck_rw_t *lock)
1006{
1007 uint32_t data, prev;
1008
1009 for ( ; ; ) {
1010 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1011 if ((data & LCK_RW_INTERLOCK) == 0)
1012 break;
1013 atomic_exchange_abort();
1014 lck_rw_interlock_spin(lock);
1015 }
1016 if (data & (LCK_RW_WANT_WRITE | LCK_RW_WANT_UPGRADE)) {
1017 if (((data & LCK_RW_SHARED_MASK) == 0) || (data & LCK_RW_PRIV_EXCL)) {
1018 atomic_exchange_abort();
1019 return FALSE;
1020 }
1021 }
1022 data += LCK_RW_SHARED_READER;
1023 return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1024}
1025
91447636
A
1026/*
1027 * Routine: lck_rw_lock_exclusive
1028 */
5ba3f43e 1029static void
b0d623f7 1030lck_rw_lock_exclusive_gen(
91447636
A
1031 lck_rw_t *lck)
1032{
3e170ce0 1033 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
b0d623f7
A
1034 uint64_t deadline = 0;
1035 int slept = 0;
1036 int gotlock = 0;
1037 int lockheld = 0;
1038 wait_result_t res = 0;
1039 boolean_t istate = -1;
91447636 1040
2d21ac55 1041#if CONFIG_DTRACE
b0d623f7
A
1042 boolean_t dtrace_ls_initialized = FALSE;
1043 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1044 uint64_t wait_interval = 0;
1045 int readers_at_sleep = 0;
2d21ac55 1046#endif
91447636 1047
91447636 1048 /*
2d21ac55 1049 * Try to acquire the lck_rw_want_write bit.
91447636 1050 */
b0d623f7 1051 while ( !lck_rw_grab_want(lck)) {
91447636 1052
2d21ac55 1053#if CONFIG_DTRACE
b0d623f7
A
1054 if (dtrace_ls_initialized == FALSE) {
1055 dtrace_ls_initialized = TRUE;
1056 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1057 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1058 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1059 if (dtrace_ls_enabled) {
1060 /*
1061 * Either sleeping or spinning is happening,
1062 * start a timing of our delay interval now.
1063 */
1064 readers_at_sleep = lck->lck_rw_shared_count;
1065 wait_interval = mach_absolute_time();
1066 }
91447636 1067 }
2d21ac55 1068#endif
b0d623f7
A
1069 if (istate == -1)
1070 istate = ml_get_interrupts_enabled();
91447636 1071
b0d623f7
A
1072 deadline = lck_rw_deadline_for_spin(lck);
1073
3e170ce0 1074 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7
A
1075
1076 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
1077 lck_rw_lock_pause(istate);
1078
3e170ce0 1079 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
b0d623f7
A
1080
1081 if (gotlock)
1082 break;
1083 /*
1084 * if we get here, the deadline has expired w/o us
1085 * being able to grab the lock exclusively
1086 * check to see if we're allowed to do a thread_block
1087 */
1088 if (lck->lck_rw_can_sleep) {
2d21ac55 1089
91447636 1090 istate = lck_interlock_lock(lck);
91447636 1091
b0d623f7 1092 if (lck->lck_rw_want_write) {
91447636 1093
3e170ce0 1094 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
91447636 1095
b0d623f7 1096 lck->lck_w_waiting = TRUE;
91447636 1097
813fb2f6 1098 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
b0d623f7
A
1099 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1100 lck_interlock_unlock(lck, istate);
91447636 1101
b0d623f7
A
1102 if (res == THREAD_WAITING) {
1103 res = thread_block(THREAD_CONTINUE_NULL);
1104 slept++;
1105 }
3e170ce0 1106 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
b0d623f7
A
1107 } else {
1108 lck->lck_rw_want_write = TRUE;
1109 lck_interlock_unlock(lck, istate);
1110 break;
1111 }
1112 }
1113 }
1114 /*
1115 * Wait for readers (and upgrades) to finish...
1116 * the test for these conditions must be done simultaneously with
1117 * a check of the interlock not being held since
1118 * the rw_shared_count will drop to 0 first and then want_upgrade
1119 * will be set to 1 in the shared_to_exclusive scenario... those
1120 * adjustments are done behind the interlock and represent an
1121 * atomic change in state and must be considered as such
1122 * however, once we see the read count at 0, the want_upgrade not set
1123 * and the interlock not held, we are safe to proceed
1124 */
1125 while (lck_rw_held_read_or_upgrade(lck)) {
2d21ac55
A
1126
1127#if CONFIG_DTRACE
1128 /*
1129 * Either sleeping or spinning is happening, start
1130 * a timing of our delay interval now. If we set it
1131 * to -1 we don't have accurate data so we cannot later
1132 * decide to record a dtrace spin or sleep event.
1133 */
b0d623f7
A
1134 if (dtrace_ls_initialized == FALSE) {
1135 dtrace_ls_initialized = TRUE;
1136 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1137 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1138 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1139 if (dtrace_ls_enabled) {
1140 /*
1141 * Either sleeping or spinning is happening,
1142 * start a timing of our delay interval now.
1143 */
1144 readers_at_sleep = lck->lck_rw_shared_count;
1145 wait_interval = mach_absolute_time();
1146 }
2d21ac55
A
1147 }
1148#endif
b0d623f7
A
1149 if (istate == -1)
1150 istate = ml_get_interrupts_enabled();
1151
1152 deadline = lck_rw_deadline_for_spin(lck);
1153
3e170ce0 1154 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7
A
1155
1156 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1157 lck_rw_lock_pause(istate);
1158
3e170ce0 1159 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
b0d623f7
A
1160
1161 if ( !lockheld)
1162 break;
1163 /*
1164 * if we get here, the deadline has expired w/o us
1165 * being able to grab the lock exclusively
1166 * check to see if we're allowed to do a thread_block
1167 */
1168 if (lck->lck_rw_can_sleep) {
91447636 1169
91447636 1170 istate = lck_interlock_lock(lck);
91447636 1171
b0d623f7 1172 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
3e170ce0 1173 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
b0d623f7
A
1174
1175 lck->lck_w_waiting = TRUE;
1176
813fb2f6 1177 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
b0d623f7 1178 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
91447636 1179 lck_interlock_unlock(lck, istate);
b0d623f7
A
1180
1181 if (res == THREAD_WAITING) {
1182 res = thread_block(THREAD_CONTINUE_NULL);
1183 slept++;
1184 }
3e170ce0 1185 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
b0d623f7
A
1186 } else {
1187 lck_interlock_unlock(lck, istate);
1188 /*
1189 * must own the lock now, since we checked for
1190 * readers or upgrade owner behind the interlock
1191 * no need for a call to 'lck_rw_held_read_or_upgrade'
1192 */
1193 break;
91447636
A
1194 }
1195 }
91447636
A
1196 }
1197
2d21ac55
A
1198#if CONFIG_DTRACE
1199 /*
1200 * Decide what latencies we suffered that are Dtrace events.
1201 * If we have set wait_interval, then we either spun or slept.
1202 * At least we get out from under the interlock before we record
1203 * which is the best we can do here to minimize the impact
1204 * of the tracing.
1205 * If we have set wait_interval to -1, then dtrace was not enabled when we
1206 * started sleeping/spinning so we don't record this event.
1207 */
b0d623f7 1208 if (dtrace_ls_enabled == TRUE) {
2d21ac55
A
1209 if (slept == 0) {
1210 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1211 mach_absolute_time() - wait_interval, 1);
1212 } else {
1213 /*
1214 * For the blocking case, we also record if when we blocked
1215 * it was held for read or write, and how many readers.
1216 * Notice that above we recorded this before we dropped
1217 * the interlock so the count is accurate.
1218 */
1219 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1220 mach_absolute_time() - wait_interval, 1,
1221 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1222 }
1223 }
1224 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1225#endif
91447636
A
1226}
1227
5ba3f43e
A
1228/*
1229 * Routine: lck_rw_done
1230 */
1231
1232lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1233{
1234 uint32_t data, prev;
1235
1236 for ( ; ; ) {
1237 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1238 if (data & LCK_RW_INTERLOCK) { /* wait for interlock to clear */
1239 atomic_exchange_abort();
1240 lck_rw_interlock_spin(lock);
1241 continue;
1242 }
1243 if (data & LCK_RW_SHARED_MASK) {
1244 data -= LCK_RW_SHARED_READER;
1245 if ((data & LCK_RW_SHARED_MASK) == 0) /* if reader count has now gone to 0, check for waiters */
1246 goto check_waiters;
1247 } else { /* if reader count == 0, must be exclusive lock */
1248 if (data & LCK_RW_WANT_UPGRADE) {
1249 data &= ~(LCK_RW_WANT_UPGRADE);
1250 } else {
1251 if (data & LCK_RW_WANT_WRITE)
1252 data &= ~(LCK_RW_WANT_EXCL);
1253 else /* lock is not 'owned', panic */
1254 panic("Releasing non-exclusive RW lock without a reader refcount!");
1255 }
1256check_waiters:
1257 if (prev & LCK_RW_W_WAITING) {
1258 data &= ~(LCK_RW_W_WAITING);
1259 if ((prev & LCK_RW_PRIV_EXCL) == 0)
1260 data &= ~(LCK_RW_R_WAITING);
1261 } else
1262 data &= ~(LCK_RW_R_WAITING);
1263 }
1264 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1265 break;
1266 cpu_pause();
1267 }
1268 return lck_rw_done_gen(lock, prev);
1269}
91447636
A
1270
1271/*
2d21ac55 1272 * Routine: lck_rw_done_gen
b0d623f7 1273 *
5ba3f43e 1274 * called from lck_rw_done()
b0d623f7
A
1275 * prior_lock_state is the value in the 1st
1276 * word of the lock at the time of a successful
1277 * atomic compare and exchange with the new value...
1278 * it represents the state of the lock before we
1279 * decremented the rw_shared_count or cleared either
1280 * rw_want_upgrade or rw_want_write and
1281 * the lck_x_waiting bits... since the wrapper
1282 * routine has already changed the state atomically,
1283 * we just need to decide if we should
1284 * wake up anyone and what value to return... we do
1285 * this by examining the state of the lock before
1286 * we changed it
91447636 1287 */
5ba3f43e 1288static lck_rw_type_t
2d21ac55 1289lck_rw_done_gen(
b0d623f7 1290 lck_rw_t *lck,
5ba3f43e 1291 uint32_t prior_lock_state)
91447636 1292{
b0d623f7
A
1293 lck_rw_t *fake_lck;
1294 lck_rw_type_t lock_type;
fe8ab488 1295 thread_t thread;
39236c6e
A
1296 uint32_t rwlock_count;
1297
91447636 1298 /*
b0d623f7
A
1299 * prior_lock state is a snapshot of the 1st word of the
1300 * lock in question... we'll fake up a pointer to it
 1301 * and carefully not access anything beyond what's defined
1302 * in the first word of a lck_rw_t
91447636 1303 */
b0d623f7 1304 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1305
b0d623f7
A
1306 if (fake_lck->lck_rw_shared_count <= 1) {
1307 if (fake_lck->lck_w_waiting)
1308 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
91447636 1309
b0d623f7
A
1310 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1311 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1312 }
1313 if (fake_lck->lck_rw_shared_count)
1314 lock_type = LCK_RW_TYPE_SHARED;
1315 else
1316 lock_type = LCK_RW_TYPE_EXCLUSIVE;
2d21ac55 1317
fe8ab488
A
1318 /* Check if dropping the lock means that we need to unpromote */
1319 thread = current_thread();
1320 rwlock_count = thread->rwlock_count--;
1321#if MACH_LDEBUG
1322 if (rwlock_count == 0) {
1323 panic("rw lock count underflow for thread %p", thread);
1324 }
1325#endif
1326 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1327 /* sched_flags checked without lock, but will be rechecked while clearing */
1328 lck_rw_clear_promotion(thread);
1329 }
1330
2d21ac55 1331#if CONFIG_DTRACE
b0d623f7 1332 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
2d21ac55
A
1333#endif
1334
b0d623f7 1335 return(lock_type);
91447636
A
1336}
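/*
 * Example (illustrative sketch, not compiled): the fake_lck cast above works
 * because prior_lock_state is a snapshot of the same 32-bit word that the
 * LCK_RW_* masks describe, so the returned type could equivalently be derived
 * with an explicit mask test:
 */
#if 0
static lck_rw_type_t
example_decode_prior_state(uint32_t prior_lock_state)
{
	if (prior_lock_state & LCK_RW_SHARED_MASK)
		return LCK_RW_TYPE_SHARED;	/* we released one of the read holds */
	return LCK_RW_TYPE_EXCLUSIVE;		/* we held want_write or want_upgrade */
}
#endif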
1337
1338
91447636
A
1339/*
1340 * Routine: lck_rw_unlock
1341 */
1342void
1343lck_rw_unlock(
1344 lck_rw_t *lck,
1345 lck_rw_type_t lck_rw_type)
1346{
1347 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1348 lck_rw_unlock_shared(lck);
1349 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1350 lck_rw_unlock_exclusive(lck);
1351 else
1352 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1353}
1354
1355
1356/*
1357 * Routine: lck_rw_unlock_shared
1358 */
1359void
1360lck_rw_unlock_shared(
1361 lck_rw_t *lck)
1362{
1363 lck_rw_type_t ret;
1364
1365 ret = lck_rw_done(lck);
1366
1367 if (ret != LCK_RW_TYPE_SHARED)
39037602 1368 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
91447636
A
1369}
1370
1371
1372/*
1373 * Routine: lck_rw_unlock_exclusive
1374 */
1375void
1376lck_rw_unlock_exclusive(
1377 lck_rw_t *lck)
1378{
1379 lck_rw_type_t ret;
1380
1381 ret = lck_rw_done(lck);
1382
1383 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1384 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1385}
1386
1387
1388/*
1389 * Routine: lck_rw_lock
1390 */
1391void
1392lck_rw_lock(
1393 lck_rw_t *lck,
1394 lck_rw_type_t lck_rw_type)
1395{
1396 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1397 lck_rw_lock_shared(lck);
1398 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1399 lck_rw_lock_exclusive(lck);
1400 else
1401 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1402}
1403
5ba3f43e
A
1404/*
1405 * Routine: lck_rw_lock_shared
1406 */
1407void
1408lck_rw_lock_shared(lck_rw_t *lock)
1409{
1410 uint32_t data, prev;
1411
1412 current_thread()->rwlock_count++;
1413 for ( ; ; ) {
1414 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1415 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK)) {
1416 atomic_exchange_abort();
1417 lck_rw_lock_shared_gen(lock);
1418 break;
1419 }
1420 data += LCK_RW_SHARED_READER;
1421 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1422 break;
1423 cpu_pause();
1424 }
1425#if CONFIG_DTRACE
1426 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1427#endif /* CONFIG_DTRACE */
1428 return;
1429}
91447636
A
1430
1431/*
2d21ac55 1432 * Routine: lck_rw_lock_shared_gen
b0d623f7
A
1433 * Function:
 1434 * the fast path code (lck_rw_lock_shared) has determined that this lock
1435 * is held exclusively... this is where we spin/block
1436 * until we can acquire the lock in the shared mode
91447636 1437 */
5ba3f43e 1438static void
2d21ac55 1439lck_rw_lock_shared_gen(
91447636
A
1440 lck_rw_t *lck)
1441{
3e170ce0 1442 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
b0d623f7
A
1443 uint64_t deadline = 0;
1444 int gotlock = 0;
1445 int slept = 0;
1446 wait_result_t res = 0;
1447 boolean_t istate = -1;
3e170ce0 1448
2d21ac55
A
1449#if CONFIG_DTRACE
1450 uint64_t wait_interval = 0;
b0d623f7
A
1451 int readers_at_sleep = 0;
1452 boolean_t dtrace_ls_initialized = FALSE;
1453 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
2d21ac55 1454#endif
91447636 1455
b0d623f7
A
1456 while ( !lck_rw_grab_shared(lck)) {
1457
2d21ac55 1458#if CONFIG_DTRACE
b0d623f7
A
1459 if (dtrace_ls_initialized == FALSE) {
1460 dtrace_ls_initialized = TRUE;
1461 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1462 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1463 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1464 if (dtrace_ls_enabled) {
1465 /*
1466 * Either sleeping or spinning is happening,
1467 * start a timing of our delay interval now.
1468 */
1469 readers_at_sleep = lck->lck_rw_shared_count;
1470 wait_interval = mach_absolute_time();
1471 }
1472 }
2d21ac55 1473#endif
b0d623f7
A
1474 if (istate == -1)
1475 istate = ml_get_interrupts_enabled();
91447636 1476
b0d623f7 1477 deadline = lck_rw_deadline_for_spin(lck);
0c530ab8 1478
b0d623f7 1479 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
3e170ce0 1480 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
91447636 1481
b0d623f7
A
1482 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1483 lck_rw_lock_pause(istate);
1484
1485 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
3e170ce0 1486 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
b0d623f7
A
1487
1488 if (gotlock)
1489 break;
1490 /*
1491 * if we get here, the deadline has expired w/o us
1492 * being able to grab the lock for read
1493 * check to see if we're allowed to do a thread_block
1494 */
1495 if (lck->lck_rw_can_sleep) {
91447636 1496
91447636 1497 istate = lck_interlock_lock(lck);
91447636 1498
b0d623f7
A
1499 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1500 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1501
1502 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
3e170ce0 1503 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
b0d623f7
A
1504
1505 lck->lck_r_waiting = TRUE;
1506
813fb2f6 1507 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
b0d623f7 1508 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
91447636 1509 lck_interlock_unlock(lck, istate);
b0d623f7
A
1510
1511 if (res == THREAD_WAITING) {
1512 res = thread_block(THREAD_CONTINUE_NULL);
1513 slept++;
1514 }
1515 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
3e170ce0 1516 trace_lck, res, slept, 0, 0);
b0d623f7
A
1517 } else {
1518 lck->lck_rw_shared_count++;
1519 lck_interlock_unlock(lck, istate);
1520 break;
91447636
A
1521 }
1522 }
91447636
A
1523 }
1524
2d21ac55 1525#if CONFIG_DTRACE
b0d623f7 1526 if (dtrace_ls_enabled == TRUE) {
2d21ac55
A
1527 if (slept == 0) {
1528 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1529 } else {
1530 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1531 mach_absolute_time() - wait_interval, 0,
1532 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1533 }
1534 }
1535 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1536#endif
91447636
A
1537}
1538
1539
5ba3f43e
A
1540/*
1541 * Routine: lck_rw_lock_exclusive
1542 */
1543
1544void
1545lck_rw_lock_exclusive(lck_rw_t *lock)
1546{
1547 current_thread()->rwlock_count++;
1548 if (atomic_test_and_set32(&lock->data,
1549 (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE | LCK_RW_INTERLOCK),
1550 LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1551#if CONFIG_DTRACE
1552 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1553#endif /* CONFIG_DTRACE */
1554 } else
1555 lck_rw_lock_exclusive_gen(lock);
1556}
1557
1558
1559/*
1560 * Routine: lck_rw_lock_shared_to_exclusive
1561 */
1562
1563boolean_t
1564lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1565{
1566 uint32_t data, prev;
1567
1568 for ( ; ; ) {
1569 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1570 if (data & LCK_RW_INTERLOCK) {
1571 atomic_exchange_abort();
1572 lck_rw_interlock_spin(lock);
1573 continue;
1574 }
1575 if (data & LCK_RW_WANT_UPGRADE) {
1576 data -= LCK_RW_SHARED_READER;
1577 if ((data & LCK_RW_SHARED_MASK) == 0) /* we were the last reader */
1578 data &= ~(LCK_RW_W_WAITING); /* so clear the wait indicator */
1579 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1580 return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1581 } else {
1582 data |= LCK_RW_WANT_UPGRADE; /* ask for WANT_UPGRADE */
1583 data -= LCK_RW_SHARED_READER; /* and shed our read count */
1584 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1585 break;
1586 }
1587 cpu_pause();
1588 }
1589 /* we now own the WANT_UPGRADE */
1590 if (data & LCK_RW_SHARED_MASK) /* check to see if all of the readers are drained */
1591 lck_rw_lock_shared_to_exclusive_success(lock); /* if not, we need to go wait */
1592#if CONFIG_DTRACE
1593 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, 0);
1594#endif
1595 return TRUE;
1596}
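/*
 * Example (illustrative sketch, not compiled): an upgrade can fail, and on
 * failure the shared hold has already been dropped, so the caller must
 * reacquire the lock exclusively and re-validate whatever it read:
 */
#if 0
static void
example_upgrade_pattern(lck_rw_t *lock)
{
	lck_rw_lock_shared(lock);
	/* ... decide the protected data needs to change ... */
	if (!lck_rw_lock_shared_to_exclusive(lock)) {
		/* lost the race: we no longer hold the lock at all */
		lck_rw_lock_exclusive(lock);
		/* ... re-check the data, it may have changed while unlocked ... */
	}
	/* ... modify the data while held exclusively ... */
	lck_rw_done(lock);
}
#endif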
1597
1598
91447636 1599/*
b0d623f7 1600 * Routine: lck_rw_lock_shared_to_exclusive_failure
91447636 1601 * Function:
b0d623f7
A
 1602 * the fast path code (lck_rw_lock_shared_to_exclusive) has already dropped our read
 1603 * count and determined that someone else owns 'lck_rw_want_upgrade'
 1604 * if 'lck_rw_shared_count' == 0, it has also already dropped 'lck_w_waiting'
1605 * all we need to do here is determine if a wakeup is needed
91447636 1606 */
5ba3f43e 1607static boolean_t
b0d623f7
A
1608lck_rw_lock_shared_to_exclusive_failure(
1609 lck_rw_t *lck,
5ba3f43e 1610 uint32_t prior_lock_state)
91447636 1611{
b0d623f7 1612 lck_rw_t *fake_lck;
39236c6e
A
1613 thread_t thread = current_thread();
1614 uint32_t rwlock_count;
1615
1616 /* Check if dropping the lock means that we need to unpromote */
1617 rwlock_count = thread->rwlock_count--;
1618#if MACH_LDEBUG
1619 if (rwlock_count == 0) {
1620 panic("rw lock count underflow for thread %p", thread);
1621 }
1622#endif
b0d623f7 1623 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1624
b0d623f7 1625 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
91447636
A
1626 /*
1627 * Someone else has requested upgrade.
b0d623f7
A
1628 * Since we've released the read lock, wake
1629 * him up if he's blocked waiting
91447636 1630 */
b0d623f7
A
1631 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1632 }
5ba3f43e
A
1633
1634 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1635 /* sched_flags checked without lock, but will be rechecked while clearing */
1636 lck_rw_clear_promotion(thread);
1637 }
1638
b0d623f7 1639 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
3e170ce0 1640 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
91447636 1641
b0d623f7
A
1642 return (FALSE);
1643}
91447636 1644
91447636 1645
b0d623f7
A
1646/*
 1647 * Routine: lck_rw_lock_shared_to_exclusive_success
 1648 * Function:
 1649 * the fast path code (lck_rw_lock_shared_to_exclusive) has already dropped our read
1650 * count and successfully acquired 'lck_rw_want_upgrade'
1651 * we just need to wait for the rest of the readers to drain
1652 * and then we can return as the exclusive holder of this lock
1653 */
5ba3f43e 1654static boolean_t
b0d623f7
A
1655lck_rw_lock_shared_to_exclusive_success(
1656 lck_rw_t *lck)
1657{
3e170ce0 1658 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
b0d623f7
A
1659 uint64_t deadline = 0;
1660 int slept = 0;
1661 int still_shared = 0;
1662 wait_result_t res;
1663 boolean_t istate = -1;
91447636 1664
b0d623f7
A
1665#if CONFIG_DTRACE
1666 uint64_t wait_interval = 0;
1667 int readers_at_sleep = 0;
1668 boolean_t dtrace_ls_initialized = FALSE;
1669 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1670#endif
91447636 1671
2d21ac55 1672 while (lck->lck_rw_shared_count != 0) {
b0d623f7 1673
2d21ac55 1674#if CONFIG_DTRACE
b0d623f7
A
1675 if (dtrace_ls_initialized == FALSE) {
1676 dtrace_ls_initialized = TRUE;
1677 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1678 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1679 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1680 if (dtrace_ls_enabled) {
1681 /*
1682 * Either sleeping or spinning is happening,
1683 * start a timing of our delay interval now.
1684 */
1685 readers_at_sleep = lck->lck_rw_shared_count;
1686 wait_interval = mach_absolute_time();
1687 }
2d21ac55
A
1688 }
1689#endif
b0d623f7
A
1690 if (istate == -1)
1691 istate = ml_get_interrupts_enabled();
1692
1693 deadline = lck_rw_deadline_for_spin(lck);
1694
1695 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
3e170ce0 1696 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1697
1698 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1699 lck_rw_lock_pause(istate);
1700
1701 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
3e170ce0 1702 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1703
1704 if ( !still_shared)
1705 break;
1706 /*
1707 * if we get here, the deadline has expired w/o
1708 * the rw_shared_count having drained to 0
1709 * check to see if we're allowed to do a thread_block
1710 */
1711 if (lck->lck_rw_can_sleep) {
1712
91447636 1713 istate = lck_interlock_lock(lck);
b0d623f7
A
1714
1715 if (lck->lck_rw_shared_count != 0) {
1716 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
3e170ce0 1717 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
b0d623f7
A
1718
1719 lck->lck_w_waiting = TRUE;
91447636 1720
813fb2f6 1721 thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
b0d623f7 1722 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
91447636 1723 lck_interlock_unlock(lck, istate);
b0d623f7
A
1724
1725 if (res == THREAD_WAITING) {
1726 res = thread_block(THREAD_CONTINUE_NULL);
1727 slept++;
1728 }
1729 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
3e170ce0 1730 trace_lck, res, slept, 0, 0);
b0d623f7
A
1731 } else {
1732 lck_interlock_unlock(lck, istate);
1733 break;
91447636
A
1734 }
1735 }
91447636 1736 }
2d21ac55
A
1737#if CONFIG_DTRACE
1738 /*
1739 * We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1740 */
b0d623f7 1741 if (dtrace_ls_enabled == TRUE) {
2d21ac55
A
1742 if (slept == 0) {
1743 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1744 } else {
1745 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1746 mach_absolute_time() - wait_interval, 1,
1747 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1748 }
1749 }
2d21ac55
A
1750 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1751#endif
1752 return (TRUE);
91447636
A
1753}
1754
5ba3f43e
A
1755/*
1756 * Routine: lck_rw_lock_exclusive_to_shared
1757 */
1758
1759void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1760{
1761 uint32_t data, prev;
1762
1763 for ( ; ; ) {
1764 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1765 if (data & LCK_RW_INTERLOCK) {
1766 atomic_exchange_abort();
1767 lck_rw_interlock_spin(lock); /* wait for interlock to clear */
1768 continue;
1769 }
1770 data += LCK_RW_SHARED_READER;
1771 if (data & LCK_RW_WANT_UPGRADE)
1772 data &= ~(LCK_RW_WANT_UPGRADE);
1773 else
1774 data &= ~(LCK_RW_WANT_EXCL);
1775 if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1776 data &= ~(LCK_RW_W_WAITING);
1777 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1778 break;
1779 cpu_pause();
1780 }
1781 return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1782}
1783
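/*
 * A minimal usage sketch of lck_rw_lock_exclusive_to_shared(), assuming
 * the standard lck_rw_* interfaces declared in kern/locks.h
 * (insert_entry() and scan_entries() are illustrative placeholders).
 * Unlike the upgrade, the downgrade never drops the lock; it converts
 * the exclusive hold into a shared one so other readers can make
 * progress while the caller finishes its read-only work:
 *
 *	lck_rw_lock_exclusive(lock);
 *	insert_entry(list, entry);
 *	lck_rw_lock_exclusive_to_shared(lock);
 *	scan_entries(list);
 *	lck_rw_done(lock);
 */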
b0d623f7 1784
91447636 1785/*
5ba3f43e 1786 * Routine: lck_rw_lock_exclusive_to_shared_gen
b0d623f7
A
1787 * Function:
 1788 * the fast path has already dropped
 1789 * our exclusive state and bumped lck_rw_shared_count;
1790 * all we need to do here is determine if anyone
1791 * needs to be awakened.
91447636 1792 */
5ba3f43e 1793static void
b0d623f7
A
1794lck_rw_lock_exclusive_to_shared_gen(
1795 lck_rw_t *lck,
5ba3f43e 1796 uint32_t prior_lock_state)
91447636 1797{
3e170ce0
A
1798 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1799 lck_rw_t *fake_lck;
91447636 1800
b0d623f7 1801 fake_lck = (lck_rw_t *)&prior_lock_state;
91447636 1802
b0d623f7 1803 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
3e170ce0 1804 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
91447636 1805
b0d623f7
A
1806 /*
1807 * don't wake up anyone waiting to take the lock exclusively
1808 * since we hold a read count... when the read count drops to 0,
1809 * the writers will be woken.
1810 *
1811 * wake up any waiting readers if we don't have any writers waiting,
1812 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1813 */
1814 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
2d21ac55 1815 thread_wakeup(RW_LOCK_READER_EVENT(lck));
91447636
A
1816
1817 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
3e170ce0 1818 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
91447636 1819
2d21ac55
A
1820#if CONFIG_DTRACE
1821 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1822#endif
91447636
A
1823}
1824
1825
1826/*
1827 * Routine: lck_rw_try_lock
1828 */
1829boolean_t
1830lck_rw_try_lock(
1831 lck_rw_t *lck,
1832 lck_rw_type_t lck_rw_type)
1833{
1834 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1835 return(lck_rw_try_lock_shared(lck));
1836 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1837 return(lck_rw_try_lock_exclusive(lck));
1838 else
1839 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1840 return(FALSE);
1841}
1842
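/*
 * A minimal usage sketch of the type-dispatched try-lock, using the
 * LCK_RW_TYPE_* constants from kern/locks.h; the shared and exclusive
 * variants below can also be called directly.  A TRUE return means the
 * lock is held and must eventually be released with lck_rw_done():
 *
 *	if (lck_rw_try_lock(lock, LCK_RW_TYPE_SHARED)) {
 *		... read-side critical section ...
 *		lck_rw_done(lock);
 *	}
 */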
5ba3f43e
A
1843/*
1844 * Routine: lck_rw_try_lock_shared
1845 */
1846
1847boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1848{
1849 uint32_t data, prev;
1850
1851 for ( ; ; ) {
1852 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1853 if (data & LCK_RW_INTERLOCK) {
1854 atomic_exchange_abort();
1855 lck_rw_interlock_spin(lock);
1856 continue;
1857 }
1858 if (data & (LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1859 atomic_exchange_abort();
1860 return FALSE; /* lock is busy */
1861 }
1862 data += LCK_RW_SHARED_READER; /* Increment reader refcount */
1863 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1864 break;
1865 cpu_pause();
1866 }
1867 current_thread()->rwlock_count++;
1868 /* There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg */
1869#if CONFIG_DTRACE
1870 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1871#endif /* CONFIG_DTRACE */
1872 return TRUE;
1873}
1874
1875
1876/*
1877 * Routine: lck_rw_try_lock_exclusive
1878 */
1879
1880boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1881{
1882 uint32_t data, prev;
1883
1884 for ( ; ; ) {
1885 data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1886 if (data & LCK_RW_INTERLOCK) {
1887 atomic_exchange_abort();
1888 lck_rw_interlock_spin(lock);
1889 continue;
1890 }
1891 if (data & (LCK_RW_SHARED_MASK | LCK_RW_WANT_EXCL | LCK_RW_WANT_UPGRADE)) {
1892 atomic_exchange_abort();
1893 return FALSE; /* can't get it */
1894 }
1895 data |= LCK_RW_WANT_EXCL;
1896 if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1897 break;
1898 cpu_pause();
1899 }
1900
1901 current_thread()->rwlock_count++;
1902#if CONFIG_DTRACE
1903 LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1904#endif /* CONFIG_DTRACE */
1905 return TRUE;
1906}
1907
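/*
 * A minimal usage sketch of a non-blocking exclusive attempt with a
 * blocking fallback, assuming the standard lck_rw_* interfaces from
 * kern/locks.h (lck_rw_lock_exclusive() may block; the try variant
 * never does):
 *
 *	if (!lck_rw_try_lock_exclusive(lock))
 *		lck_rw_lock_exclusive(lock);
 *	... write-side critical section ...
 *	lck_rw_done(lock);
 */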
91447636 1908
2d21ac55
A
1909void
1910lck_rw_assert(
1911 lck_rw_t *lck,
1912 unsigned int type)
1913{
1914 switch (type) {
1915 case LCK_RW_ASSERT_SHARED:
1916 if (lck->lck_rw_shared_count != 0) {
1917 return;
1918 }
1919 break;
1920 case LCK_RW_ASSERT_EXCLUSIVE:
1921 if ((lck->lck_rw_want_write ||
1922 lck->lck_rw_want_upgrade) &&
1923 lck->lck_rw_shared_count == 0) {
1924 return;
1925 }
1926 break;
1927 case LCK_RW_ASSERT_HELD:
1928 if (lck->lck_rw_want_write ||
1929 lck->lck_rw_want_upgrade ||
1930 lck->lck_rw_shared_count != 0) {
1931 return;
1932 }
1933 break;
39236c6e
A
1934 case LCK_RW_ASSERT_NOTHELD:
1935 if (!(lck->lck_rw_want_write ||
1936 lck->lck_rw_want_upgrade ||
1937 lck->lck_rw_shared_count != 0)) {
1938 return;
1939 }
1940 break;
2d21ac55
A
1941 default:
1942 break;
1943 }
1944
39236c6e
A
1945 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1946}
1947
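/*
 * A minimal usage sketch of lck_rw_assert(), using the LCK_RW_ASSERT_*
 * checks handled above: a routine that requires its caller to already
 * hold the lock exclusively can enforce that contract at entry
 * (update_table_locked() and its table_t argument are illustrative
 * placeholders):
 *
 *	static void
 *	update_table_locked(table_t *tbl)
 *	{
 *		lck_rw_assert(&tbl->tbl_lock, LCK_RW_ASSERT_EXCLUSIVE);
 *		... mutate the table; the caller holds tbl_lock ...
 *	}
 */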
1948/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1949void
1950lck_rw_clear_promotions_x86(thread_t thread)
1951{
1952#if MACH_LDEBUG
1953 /* It's fatal to leave a RW lock locked and return to userspace */
1954 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1955#else
1956 /* Paper over the issue */
1957 thread->rwlock_count = 0;
1958 lck_rw_clear_promotion(thread);
1959#endif
2d21ac55
A
1960}
1961
5ba3f43e
A
1962boolean_t
1963lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
1964{
1965 lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
1966
1967 if (lck->lck_rw_want_write || lck->lck_rw_want_upgrade || force_yield) {
1968 lck_rw_unlock_shared(lck);
1969 mutex_pause(2);
1970 lck_rw_lock_shared(lck);
1971 return TRUE;
1972 }
1973
1974 return FALSE;
1975}
39236c6e 1976
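/*
 * A minimal usage sketch of lck_rw_lock_yield_shared() in a long
 * read-side scan (first_entry()/next_entry() are illustrative
 * placeholders).  With force_yield FALSE the reader only yields when a
 * writer or upgrader is queued; a TRUE return means the lock was
 * dropped and re-acquired, so any state derived from the protected
 * data must be revalidated:
 *
 *	lck_rw_lock_shared(lck);
 *	for (e = first_entry(list); e != NULL; e = next_entry(e)) {
 *		if (lck_rw_lock_yield_shared(lck, FALSE)) {
 *			... lock was dropped and re-taken: revalidate ...
 *		}
 *	}
 *	lck_rw_done(lck);
 */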
3e170ce0
A
1977/*
1978 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1979 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1980 */
1981boolean_t
1982kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1983 if (not_in_kdp) {
1984 panic("panic: rw lock exclusive check done outside of kernel debugger");
1985 }
1986 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1987}
1988
1989
6d2010ae
A
1990#ifdef MUTEX_ZONE
1991extern zone_t lck_mtx_zone;
1992#endif
91447636
A
1993/*
1994 * Routine: lck_mtx_alloc_init
1995 */
1996lck_mtx_t *
1997lck_mtx_alloc_init(
1998 lck_grp_t *grp,
1999 lck_attr_t *attr)
2000{
2001 lck_mtx_t *lck;
6d2010ae
A
2002#ifdef MUTEX_ZONE
2003 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
2004 lck_mtx_init(lck, grp, attr);
2005#else
91447636
A
2006 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
2007 lck_mtx_init(lck, grp, attr);
6d2010ae 2008#endif
91447636
A
2009 return(lck);
2010}
2011
2012/*
2013 * Routine: lck_mtx_free
2014 */
2015void
2016lck_mtx_free(
2017 lck_mtx_t *lck,
2018 lck_grp_t *grp)
2019{
2020 lck_mtx_destroy(lck, grp);
6d2010ae
A
2021#ifdef MUTEX_ZONE
2022 zfree(lck_mtx_zone, lck);
2023#else
91447636 2024 kfree(lck, sizeof(lck_mtx_t));
6d2010ae 2025#endif
91447636
A
2026}
2027
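/*
 * A minimal lifecycle sketch for a heap-allocated mutex, assuming the
 * standard lck_grp_*, lck_attr_* and lck_mtx_* interfaces from
 * kern/locks.h (the group name is illustrative).  lck_mtx_free() both
 * destroys and frees the mutex, so no separate lck_mtx_destroy() call
 * is needed:
 *
 *	lck_grp_t *grp = lck_grp_alloc_init("my-subsystem", LCK_GRP_ATTR_NULL);
 *	lck_mtx_t *mtx = lck_mtx_alloc_init(grp, LCK_ATTR_NULL);
 *
 *	lck_mtx_lock(mtx);
 *	... critical section ...
 *	lck_mtx_unlock(mtx);
 *
 *	lck_mtx_free(mtx, grp);
 *	lck_grp_free(grp);
 */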
2028/*
2029 * Routine: lck_mtx_ext_init
2030 */
2031static void
2032lck_mtx_ext_init(
2033 lck_mtx_ext_t *lck,
2034 lck_grp_t *grp,
2035 lck_attr_t *attr)
2036{
2d21ac55 2037 bzero((void *)lck, sizeof(lck_mtx_ext_t));
91447636
A
2038
2039 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636
A
2040 lck->lck_mtx_deb.type = MUTEX_TAG;
2041 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
2042 }
2043
2044 lck->lck_mtx_grp = grp;
2d21ac55
A
2045
2046 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
6d2010ae 2047 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
b0d623f7 2048
6d2010ae 2049 lck->lck_mtx.lck_mtx_is_ext = 1;
39037602 2050 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2051}
2052
2053/*
2054 * Routine: lck_mtx_init
2055 */
2056void
2057lck_mtx_init(
2058 lck_mtx_t *lck,
2059 lck_grp_t *grp,
2060 lck_attr_t *attr)
2061{
2062 lck_mtx_ext_t *lck_ext;
2d21ac55
A
2063 lck_attr_t *lck_attr;
2064
2065 if (attr != LCK_ATTR_NULL)
2066 lck_attr = attr;
2067 else
2068 lck_attr = &LockDefaultLckAttr;
91447636 2069
2d21ac55 2070 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
91447636 2071 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
2d21ac55 2072 lck_mtx_ext_init(lck_ext, grp, lck_attr);
91447636
A
2073 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2074 lck->lck_mtx_ptr = lck_ext;
2075 }
2076 } else {
b0d623f7 2077 lck->lck_mtx_owner = 0;
6d2010ae 2078 lck->lck_mtx_state = 0;
91447636 2079 }
39037602 2080 lck->lck_mtx_pad32 = 0xFFFFFFFF;
91447636
A
2081 lck_grp_reference(grp);
2082 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2083}
2084
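/*
 * A minimal sketch of a mutex embedded in a larger structure
 * (struct my_obj is an illustrative placeholder; grp is a previously
 * allocated lck_grp_t).  lck_mtx_destroy() is the counterpart when the
 * containing object is torn down:
 *
 *	struct my_obj {
 *		lck_mtx_t	mo_lock;
 *		int		mo_refcnt;
 *	};
 *
 *	lck_mtx_init(&obj->mo_lock, grp, LCK_ATTR_NULL);
 *	...
 *	lck_mtx_destroy(&obj->mo_lock, grp);
 */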
2d21ac55
A
2085/*
2086 * Routine: lck_mtx_init_ext
2087 */
2088void
2089lck_mtx_init_ext(
2090 lck_mtx_t *lck,
2091 lck_mtx_ext_t *lck_ext,
2092 lck_grp_t *grp,
2093 lck_attr_t *attr)
2094{
2095 lck_attr_t *lck_attr;
2096
2097 if (attr != LCK_ATTR_NULL)
2098 lck_attr = attr;
2099 else
2100 lck_attr = &LockDefaultLckAttr;
2101
2102 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2103 lck_mtx_ext_init(lck_ext, grp, lck_attr);
2104 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2105 lck->lck_mtx_ptr = lck_ext;
2106 } else {
b0d623f7 2107 lck->lck_mtx_owner = 0;
6d2010ae 2108 lck->lck_mtx_state = 0;
2d21ac55 2109 }
39037602 2110 lck->lck_mtx_pad32 = 0xFFFFFFFF;
6d2010ae 2111
2d21ac55
A
2112 lck_grp_reference(grp);
2113 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2114}
2115
91447636
A
2116/*
2117 * Routine: lck_mtx_destroy
2118 */
2119void
2120lck_mtx_destroy(
2121 lck_mtx_t *lck,
2122 lck_grp_t *grp)
2123{
2124 boolean_t lck_is_indirect;
2125
2126 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2127 return;
39236c6e
A
2128#if MACH_LDEBUG
2129 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2130#endif
91447636 2131 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
b0d623f7
A
2132
2133 lck_mtx_lock_mark_destroyed(lck);
2134
91447636
A
2135 if (lck_is_indirect)
2136 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2137 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2138 lck_grp_deallocate(grp);
2139 return;
2140}
2141
b0d623f7
A
2142
2143#define LCK_MTX_LCK_WAIT_CODE 0x20
2144#define LCK_MTX_LCK_WAKEUP_CODE 0x21
2145#define LCK_MTX_LCK_SPIN_CODE 0x22
2146#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2147#define LCK_MTX_LCK_DEMOTE_CODE 0x24
2148
2149
2150/*
2151 * Routine: lck_mtx_unlock_wakeup_x86
2152 *
6d2010ae
A
2153 * Invoked on unlock when there is
2154 * contention (i.e. the assembly routine sees that
 2155 * mutex->lck_mtx_waiters != 0 or
 2156 * mutex->lck_mtx_promoted != 0)...
b0d623f7 2157 *
6d2010ae 2158 * neither the mutex nor the interlock is held
b0d623f7
A
2159 */
2160void
2161lck_mtx_unlock_wakeup_x86 (
2162 lck_mtx_t *mutex,
6d2010ae 2163 int prior_lock_state)
b0d623f7 2164{
3e170ce0
A
2165 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2166 lck_mtx_t fake_lck;
6d2010ae
A
2167
2168 /*
 2169 * prior_lock_state is a snapshot of the 2nd word of the
2170 * lock in question... we'll fake up a lock with the bits
2171 * copied into place and carefully not access anything
 2172 * beyond what's defined in the second word of a lck_mtx_t
2173 */
2174 fake_lck.lck_mtx_state = prior_lock_state;
2175
2176 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
3e170ce0 2177 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
b0d623f7 2178
6d2010ae 2179 if (__probable(fake_lck.lck_mtx_waiters)) {
6d2010ae 2180 if (fake_lck.lck_mtx_waiters > 1)
3e170ce0 2181 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
6d2010ae 2182 else
3e170ce0 2183 thread_wakeup_one(LCK_MTX_EVENT(mutex));
6d2010ae 2184 }
b0d623f7 2185
6d2010ae 2186 if (__improbable(fake_lck.lck_mtx_promoted)) {
b0d623f7
A
2187 thread_t thread = current_thread();
2188
2189
6d2010ae
A
2190 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
2191 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
b0d623f7
A
2192
2193 if (thread->promotions > 0) {
2194 spl_t s = splsched();
2195
2196 thread_lock(thread);
2197
6d2010ae 2198 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
b0d623f7 2199
6d2010ae 2200 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
b0d623f7 2201
fe8ab488
A
2202 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
2203 /* Thread still has a RW lock promotion */
2204 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
6d2010ae 2205 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
3e170ce0 2206 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
b0d623f7
A
2207
2208 set_sched_pri(thread, DEPRESSPRI);
2209 }
2210 else {
3e170ce0 2211 if (thread->base_pri < thread->sched_pri) {
6d2010ae 2212 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
3e170ce0 2213 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
b0d623f7 2214
3e170ce0 2215 thread_recompute_sched_pri(thread, FALSE);
b0d623f7
A
2216 }
2217 }
2218 }
2219 thread_unlock(thread);
2220 splx(s);
2221 }
2222 }
6d2010ae 2223 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
3e170ce0 2224 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7
A
2225}
2226
2227
2228/*
2229 * Routine: lck_mtx_lock_acquire_x86
2230 *
2231 * Invoked on acquiring the mutex when there is
6d2010ae
A
2232 * contention (i.e. the assembly routine sees that
 2233 * mutex->lck_mtx_waiters != 0 or
2234 * thread->was_promoted_on_wakeup != 0)...
2235 *
2236 * mutex is owned... interlock is held... preemption is disabled
b0d623f7
A
2237 */
2238void
2239lck_mtx_lock_acquire_x86(
2240 lck_mtx_t *mutex)
2241{
3e170ce0
A
2242 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2243 thread_t thread;
2244 integer_t priority;
2245 spl_t s;
b0d623f7 2246
6d2010ae 2247 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
3e170ce0 2248 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7 2249
6d2010ae
A
2250 if (mutex->lck_mtx_waiters)
2251 priority = mutex->lck_mtx_pri;
2252 else
2253 priority = 0;
b0d623f7 2254
6d2010ae 2255 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
b0d623f7 2256
6d2010ae 2257 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
b0d623f7 2258
6d2010ae 2259 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
3e170ce0 2260 thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
b0d623f7 2261
6d2010ae
A
2262 s = splsched();
2263 thread_lock(thread);
b0d623f7 2264
39236c6e 2265 if (thread->sched_pri < priority) {
fe8ab488
A
2266 /* Do not promote past promotion ceiling */
2267 assert(priority <= MAXPRI_PROMOTE);
6d2010ae 2268 set_sched_pri(thread, priority);
39236c6e 2269 }
6d2010ae
A
2270 if (mutex->lck_mtx_promoted == 0) {
2271 mutex->lck_mtx_promoted = 1;
2272
b0d623f7 2273 thread->promotions++;
6d2010ae 2274 thread->sched_flags |= TH_SFLAG_PROMOTED;
b0d623f7 2275 }
6d2010ae
A
2276 thread->was_promoted_on_wakeup = 0;
2277
2278 thread_unlock(thread);
2279 splx(s);
b0d623f7 2280 }
6d2010ae 2281 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
3e170ce0 2282 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
b0d623f7
A
2283}
2284
2285
3e170ce0
A
2286static int
2287lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
2288{
2289 int retval;
2290
2291 *istate = ml_set_interrupts_enabled(FALSE);
2292 retval = lck_mtx_ilk_try_lock(mutex);
2293
2294 if (retval == 0)
2295 ml_set_interrupts_enabled(*istate);
2296
2297 return retval;
2298}
2299
2300static void
2301lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
2302{
2303 lck_mtx_ilk_unlock(mutex);
2304 ml_set_interrupts_enabled(istate);
2305}
2306
b0d623f7 2307
91447636 2308/*
b0d623f7 2309 * Routine: lck_mtx_lock_spinwait_x86
0c530ab8
A
2310 *
2311 * Invoked trying to acquire a mutex when there is contention but
2312 * the holder is running on another processor. We spin for up to a maximum
2313 * time waiting for the lock to be released.
2314 *
2315 * Called with the interlock unlocked.
6d2010ae
A
2316 * returns 0 if mutex acquired
2317 * returns 1 if we spun
2318 * returns 2 if we didn't spin due to the holder not running
0c530ab8 2319 */
b0d623f7
A
2320int
2321lck_mtx_lock_spinwait_x86(
2322 lck_mtx_t *mutex)
0c530ab8 2323{
3e170ce0 2324 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
b0d623f7 2325 thread_t holder;
3e170ce0
A
2326 uint64_t overall_deadline;
2327 uint64_t check_owner_deadline;
2328 uint64_t cur_time;
b0d623f7
A
2329 int retval = 1;
2330 int loopcount = 0;
0c530ab8 2331
6d2010ae 2332 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
3e170ce0 2333 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
0c530ab8 2334
3e170ce0
A
2335 cur_time = mach_absolute_time();
2336 overall_deadline = cur_time + MutexSpin;
2337 check_owner_deadline = cur_time;
b0d623f7 2338
0c530ab8
A
2339 /*
2340 * Spin while:
2341 * - mutex is locked, and
b0d623f7 2342 * - it's locked as a spin lock, and
0c530ab8 2343 * - owner is running on another processor, and
2d21ac55 2344 * - owner (processor) is not idling, and
0c530ab8
A
2345 * - we haven't spun for long enough.
2346 */
b0d623f7 2347 do {
6d2010ae 2348 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
b0d623f7
A
2349 retval = 0;
2350 break;
2d21ac55 2351 }
3e170ce0 2352 cur_time = mach_absolute_time();
b0d623f7 2353
3e170ce0
A
2354 if (cur_time >= overall_deadline)
2355 break;
2356
2357 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
2358 boolean_t istate;
2359
2360 if (lck_mtx_interlock_try_lock(mutex, &istate)) {
2361
2362 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
2363
2364 if ( !(holder->machine.specFlags & OnProc) ||
2365 (holder->state & TH_IDLE)) {
2366
2367 lck_mtx_interlock_unlock(mutex, istate);
2368
2369 if (loopcount == 0)
2370 retval = 2;
2371 break;
2372 }
2373 }
2374 lck_mtx_interlock_unlock(mutex, istate);
2375
2376 check_owner_deadline = cur_time + (MutexSpin / 4);
b0d623f7
A
2377 }
2378 }
2379 cpu_pause();
2380
2381 loopcount++;
2382
3e170ce0 2383 } while (TRUE);
b0d623f7 2384
2d21ac55
A
2385#if CONFIG_DTRACE
2386 /*
3e170ce0 2387 * We've already kept a count via overall_deadline of how long we spun.
2d21ac55
A
2388 * If dtrace is active, then we compute backwards to decide how
2389 * long we spun.
2390 *
2391 * Note that we record a different probe id depending on whether
2392 * this is a direct or indirect mutex. This allows us to
2393 * penalize only lock groups that have debug/stats enabled
2394 * with dtrace processing if desired.
2395 */
6d2010ae 2396 if (__probable(mutex->lck_mtx_is_ext == 0)) {
b0d623f7 2397 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3e170ce0 2398 mach_absolute_time() - (overall_deadline - MutexSpin));
2d21ac55 2399 } else {
b0d623f7 2400 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3e170ce0 2401 mach_absolute_time() - (overall_deadline - MutexSpin));
2d21ac55
A
2402 }
2403 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2404#endif
b0d623f7 2405
6d2010ae 2406 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
3e170ce0 2407 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
b0d623f7
A
2408
2409 return retval;
0c530ab8
A
2410}
2411
b0d623f7
A
2412
2413
0c530ab8 2414/*
b0d623f7
A
2415 * Routine: lck_mtx_lock_wait_x86
2416 *
2417 * Invoked in order to wait on contention.
2418 *
2419 * Called with the interlock locked and
6d2010ae
A
2420 * preemption disabled...
2421 * returns it unlocked and with preemption enabled
0c530ab8
A
2422 */
2423void
b0d623f7
A
2424lck_mtx_lock_wait_x86 (
2425 lck_mtx_t *mutex)
0c530ab8 2426{
3e170ce0 2427 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
b0d623f7
A
2428 thread_t self = current_thread();
2429 thread_t holder;
2430 integer_t priority;
b0d623f7
A
2431 spl_t s;
2432#if CONFIG_DTRACE
2433 uint64_t sleep_start = 0;
2434
2435 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2436 sleep_start = mach_absolute_time();
2437 }
2438#endif
6d2010ae 2439 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
3e170ce0 2440 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7
A
2441
2442 priority = self->sched_pri;
2443
3e170ce0
A
2444 if (priority < self->base_pri)
2445 priority = self->base_pri;
b0d623f7
A
2446 if (priority < BASEPRI_DEFAULT)
2447 priority = BASEPRI_DEFAULT;
2448
fe8ab488
A
2449 /* Do not promote past promotion ceiling */
2450 priority = MIN(priority, MAXPRI_PROMOTE);
39236c6e 2451
6d2010ae 2452 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
b0d623f7 2453 mutex->lck_mtx_pri = priority;
6d2010ae 2454 mutex->lck_mtx_waiters++;
b0d623f7 2455
6d2010ae
A
2456 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2457 holder->sched_pri < mutex->lck_mtx_pri ) {
b0d623f7
A
2458 s = splsched();
2459 thread_lock(holder);
2460
fe8ab488
A
2461 /* holder priority may have been bumped by another thread
2462 * before thread_lock was taken
2463 */
6d2010ae 2464 if (holder->sched_pri < mutex->lck_mtx_pri) {
b0d623f7
A
2465 KERNEL_DEBUG_CONSTANT(
2466 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
3e170ce0 2467 holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
fe8ab488
A
2468 /* Assert that we're not altering the priority of a
2469 * thread above the MAXPRI_PROMOTE band
2470 */
2471 assert(holder->sched_pri < MAXPRI_PROMOTE);
b0d623f7
A
2472 set_sched_pri(holder, priority);
2473
2474 if (mutex->lck_mtx_promoted == 0) {
2475 holder->promotions++;
6d2010ae
A
2476 holder->sched_flags |= TH_SFLAG_PROMOTED;
2477
b0d623f7
A
2478 mutex->lck_mtx_promoted = 1;
2479 }
2480 }
2481 thread_unlock(holder);
2482 splx(s);
2483 }
813fb2f6 2484 thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3e170ce0 2485 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
b0d623f7
A
2486
2487 lck_mtx_ilk_unlock(mutex);
2488
2489 thread_block(THREAD_CONTINUE_NULL);
2490
6d2010ae 2491 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
3e170ce0 2492 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
b0d623f7
A
2493
2494#if CONFIG_DTRACE
2495 /*
 2496 * Record the DTrace lockstat probe for blocking; block time is
 2497 * measured from when this routine was entered.
2498 */
2499 if (sleep_start) {
6d2010ae 2500 if (mutex->lck_mtx_is_ext == 0) {
b0d623f7
A
2501 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2502 mach_absolute_time() - sleep_start);
2503 } else {
2504 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2505 mach_absolute_time() - sleep_start);
2506 }
2507 }
2508#endif
0c530ab8 2509}
3e170ce0
A
2510
2511/*
2512 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2513 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2514 * Returns: TRUE if lock is acquired.
2515 */
2516boolean_t
2517kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2518{
2519 if (not_in_kdp) {
2520 panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2521 }
2522
39037602 2523 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
3e170ce0
A
2524 return TRUE;
2525 }
2526
2527 return FALSE;
2528}
2529
813fb2f6
A
2530void
2531kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2532{
2533 lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
2534 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2535 thread_t holder = (thread_t)mutex->lck_mtx_owner;
2536 waitinfo->owner = thread_tid(holder);
2537}
2538
2539void
2540kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
2541{
2542 lck_rw_t *rwlck = NULL;
2543 switch(waitinfo->wait_type) {
2544 case kThreadWaitKernelRWLockRead:
2545 rwlck = READ_EVENT_TO_RWLOCK(event);
2546 break;
2547 case kThreadWaitKernelRWLockWrite:
2548 case kThreadWaitKernelRWLockUpgrade:
2549 rwlck = WRITE_EVENT_TO_RWLOCK(event);
2550 break;
2551 default:
2552 panic("%s was called with an invalid blocking type", __FUNCTION__);
2553 break;
2554 }
2555 waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
2556 waitinfo->owner = 0;
2557}