git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2012 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/*
	29	* @OSF_COPYRIGHT@
	30	*/
	31	/*
	32	* Mach Operating System
	33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
	34	* All Rights Reserved.
	35	*
	36	* Permission to use, copy, modify and distribute this software and its
	37	* documentation is hereby granted, provided that both the copyright
	38	* notice and this permission notice appear in all copies of the
	39	* software, derivative works or modified versions, and any portions
	40	* thereof, and that both notices appear in supporting documentation.
	41	*
	42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
	43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
	44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
	45	*
	46	* Carnegie Mellon requests users of this software to return to
	47	*
	48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
	49	* School of Computer Science
	50	* Carnegie Mellon University
	51	* Pittsburgh PA 15213-3890
	52	*
	53	* any improvements or extensions that they make and grant Carnegie Mellon
	54	* the rights to redistribute these changes.
	55	*/
	56	/*
	57	* File: kern/lock.c
	58	* Author: Avadis Tevanian, Jr., Michael Wayne Young
	59	* Date: 1985
	60	*
	61	* Locking primitives implementation
	62	*/
	63
	64	#include <mach_ldebug.h>
	65
	66	#include <kern/locks.h>
	67	#include <kern/kalloc.h>
	68	#include <kern/misc_protos.h>
	69	#include <kern/thread.h>
	70	#include <kern/processor.h>
	71	#include <kern/cpu_data.h>
	72	#include <kern/cpu_number.h>
	73	#include <kern/sched_prim.h>
	74	#include <kern/xpr.h>
	75	#include <kern/debug.h>
	76	#include <string.h>
	77
	78	#include <i386/machine_routines.h> /* machine_timeout_suspended() */
	79	#include <machine/machine_cpu.h>
	80	#include <i386/mp.h>
	81
	82	#include <sys/kdebug.h>
	83	#include <mach/branch_predicates.h>
	84
	85	/*
	86	* We need only enough declarations from the BSD-side to be able to
	87	* test if our probe is active, and to call __dtrace_probe(). Setting
	88	* NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
	89	*/
	90	#if CONFIG_DTRACE
	91	#define NEED_DTRACE_DEFS
	92	#include <../bsd/sys/lockstat.h>
	93	#endif
	94
	95	#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
	96	#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
	97	#define LCK_RW_LCK_SHARED_CODE 0x102
	98	#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
	99	#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
	100	#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
	101
	102	#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
	103	#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
	104	#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
	105	#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
	106	#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
	107	#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
	108	#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
	109	#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
	110
	111
	112	#define ANY_LOCK_DEBUG (USLOCK_DEBUG \|\| LOCK_DEBUG \|\| MUTEX_DEBUG)
	113
	114	unsigned int LcksOpts=0;
	115
	116	/* Forwards */
	117
	118	#if USLOCK_DEBUG
	119	/*
	120	* Perform simple lock checks.
	121	*/
	122	int uslock_check = 1;
	123	int max_lock_loops = 100000000;
	124	decl_simple_lock_data(extern , printf_lock)
	125	decl_simple_lock_data(extern , panic_lock)
	126	#endif /* USLOCK_DEBUG */
	127
	128	extern unsigned int not_in_kdp;
	129
	130	/*
	131	* We often want to know the addresses of the callers
	132	* of the various lock routines. However, this information
	133	* is only used for debugging and statistics.
		*
		* DECL_PC() declares a local pc_t and OBTAIN_PC() fills it in with
		* GET_RETURN_PC(); both expand to nothing unless ANY_LOCK_DEBUG is set
		* (apart from the lint-only variant of OBTAIN_PC below).
	134	*/
	135	typedef void *pc_t;
		/* Sentinels meaning "nothing recorded"; both expand to the same address. */
	136	#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
	137	#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
	138	#if ANY_LOCK_DEBUG
	139	#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
	140	#define DECL_PC(pc) pc_t pc;
	141	#else /* ANY_LOCK_DEBUG */
	142	#define DECL_PC(pc)
	143	#ifdef lint
	144	/*
	145	* Eliminate lint complaints about unused local pc variables.
	146	*/
	147	#define OBTAIN_PC(pc) ++pc
	148	#else /* lint */
	149	#define OBTAIN_PC(pc)
	150	#endif /* lint */
	151	#endif /* ANY_LOCK_DEBUG */
	152
	153
	154	/*
	155	* Portable lock package implementation of usimple_locks.
	156	*/
	157
	158	#if USLOCK_DEBUG
	159	#define USLDBG(stmt) stmt
	160	void usld_lock_init(usimple_lock_t, unsigned short);
	161	void usld_lock_pre(usimple_lock_t, pc_t);
	162	void usld_lock_post(usimple_lock_t, pc_t);
	163	void usld_unlock(usimple_lock_t, pc_t);
	164	void usld_lock_try_pre(usimple_lock_t, pc_t);
	165	void usld_lock_try_post(usimple_lock_t, pc_t);
	166	int usld_lock_common_checks(usimple_lock_t, char *);
	167	#else /* USLOCK_DEBUG */
	168	#define USLDBG(stmt)
	169	#endif /* USLOCK_DEBUG */
	170
	171
	172	extern int lck_rw_grab_want(lck_rw_t *lck);
	173	extern int lck_rw_grab_shared(lck_rw_t *lck);
	174	extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
	175
	176
	177	/*
	178	* Forward definitions
	179	*/
	180
	181	void lck_rw_lock_shared_gen(
	182	lck_rw_t *lck);
	183
	184	void lck_rw_lock_exclusive_gen(
	185	lck_rw_t *lck);
	186
	187	boolean_t lck_rw_lock_shared_to_exclusive_success(
	188	lck_rw_t *lck);
	189
	190	boolean_t lck_rw_lock_shared_to_exclusive_failure(
	191	lck_rw_t *lck,
	192	int prior_lock_state);
	193
	194	void lck_rw_lock_exclusive_to_shared_gen(
	195	lck_rw_t *lck,
	196	int prior_lock_state);
	197
	198	lck_rw_type_t lck_rw_done_gen(
	199	lck_rw_t *lck,
	200	int prior_lock_state);
	201
	202	void lck_rw_clear_promotions_x86(thread_t thread);
	203
	204	/*
	205	* Routine: lck_spin_alloc_init
	206	*/
	207	lck_spin_t *
	208	lck_spin_alloc_init(
	209	lck_grp_t *grp,
	210	lck_attr_t *attr)
	211	{
	212	lck_spin_t *lck;
	213
	214	if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
	215	lck_spin_init(lck, grp, attr);
	216
	217	return(lck);
	218	}
	219
	220	/*
	221	* Routine: lck_spin_free
	222	*/
	223	void
	224	lck_spin_free(
	225	lck_spin_t *lck,
	226	lck_grp_t *grp)
	227	{
	228	lck_spin_destroy(lck, grp);
	229	kfree(lck, sizeof(lck_spin_t));
	230	}
	231
	232	/*
	233	* Routine: lck_spin_init
	234	*/
	235	void
	236	lck_spin_init(
	237	lck_spin_t *lck,
	238	lck_grp_t *grp,
	239	__unused lck_attr_t *attr)
	240	{
	241	usimple_lock_init((usimple_lock_t) lck, 0);
	242	lck_grp_reference(grp);
	243	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
	244	}
	245
	246	/*
	247	* Routine: lck_spin_destroy
	248	*/
	249	void
	250	lck_spin_destroy(
	251	lck_spin_t *lck,
	252	lck_grp_t *grp)
	253	{
	254	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
	255	return;
	256	lck->interlock = LCK_SPIN_TAG_DESTROYED;
	257	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
	258	lck_grp_deallocate(grp);
	259	return;
	260	}
	261
	262	/*
	263	* Routine: lck_spin_lock
	264	*/
	265	void
	266	lck_spin_lock(
	267	lck_spin_t *lck)
	268	{
	269	usimple_lock((usimple_lock_t) lck);
	270	}
	271
	272	/*
	273	* Routine: lck_spin_unlock
	274	*/
	275	void
	276	lck_spin_unlock(
	277	lck_spin_t *lck)
	278	{
	279	usimple_unlock((usimple_lock_t) lck);
	280	}
	281
	282
	283	/*
	284	* Routine: lck_spin_try_lock
	285	*/
	286	boolean_t
	287	lck_spin_try_lock(
	288	lck_spin_t *lck)
	289	{
	290	return((boolean_t)usimple_lock_try((usimple_lock_t) lck));
	291	}
	292
	293	/*
	294	* Routine: kdp_lck_spin_is_acquired
	295	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
	296	* Returns: TRUE if lock is acquired.
	297	*/
	298	boolean_t
	299	kdp_lck_spin_is_acquired(lck_spin_t *lck) {
	300	if (not_in_kdp) {
	301	panic("panic: spinlock acquired check done outside of kernel debugger");
	302	}
	303	return (lck->interlock != 0)? TRUE : FALSE;
	304	}
	305
	306	/*
	307	* Initialize a usimple_lock.
	308	*
	309	* No change in preemption state.
	310	*/
	311	void
	312	usimple_lock_init(
	313	usimple_lock_t l,
	314	__unused unsigned short tag)
	315	{
	316	#ifndef MACHINE_SIMPLE_LOCK
	317	USLDBG(usld_lock_init(l, tag));
	318	hw_lock_init(&l->interlock);
	319	#else
	320	simple_lock_init((simple_lock_t)l,tag);
	321	#endif
	322	}
	323
	324	volatile uint32_t spinlock_owner_cpu = ~0;
	325	volatile usimple_lock_t spinlock_timed_out;
	326
	327	uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
	328	uint64_t deadline;
	329	uint32_t i;
	330
	331	for (i = 0; i < real_ncpus; i++) {
	332	if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
	333	spinlock_owner_cpu = i;
	334	if ((uint32_t) cpu_number() == i)
	335	break;
	336	cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
	337	cpu_NMI_interrupt(i);
	338	deadline = mach_absolute_time() + (LockTimeOut * 2);
	339	while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
	340	cpu_pause();
	341	break;
	342	}
	343	}
	344
	345	return spinlock_owner_cpu;
	346	}
	347
	348	/*
	349	* Acquire a usimple_lock.
	350	*
	351	* Returns with preemption disabled. Note
	352	* that the hw_lock routines are responsible for
	353	* maintaining preemption state.
	354	*/
	355	void
	356	usimple_lock(
	357	usimple_lock_t l)
	358	{
	359	#ifndef MACHINE_SIMPLE_LOCK
	360	DECL_PC(pc);
	361
	362	OBTAIN_PC(pc);
	363	USLDBG(usld_lock_pre(l, pc));
	364
	365	if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
	366	boolean_t uslock_acquired = FALSE;
	367	while (machine_timeout_suspended()) {
	368	enable_preemption();
	369	if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
	370	break;
	371	}
	372
	373	if (uslock_acquired == FALSE) {
	374	uint32_t lock_cpu;
	375	uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
	376	spinlock_timed_out = l;
	377	lock_cpu = spinlock_timeout_NMI(lowner);
	378	panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
	379	}
	380	}
	381	USLDBG(usld_lock_post(l, pc));
	382	#else
	383	simple_lock((simple_lock_t)l);
	384	#endif
	385	}
	386
	387
	388	/*
	389	* Release a usimple_lock.
	390	*
	391	* Returns with preemption enabled. Note
	392	* that the hw_lock routines are responsible for
	393	* maintaining preemption state.
	394	*/
	395	void
	396	usimple_unlock(
	397	usimple_lock_t l)
	398	{
	399	#ifndef MACHINE_SIMPLE_LOCK
	400	DECL_PC(pc);
	401
	402	OBTAIN_PC(pc);
	403	USLDBG(usld_unlock(l, pc));
	404	hw_lock_unlock(&l->interlock);
	405	#else
	406	simple_unlock_rwmb((simple_lock_t)l);
	407	#endif
	408	}
	409
	410
	411	/*
	412	* Conditionally acquire a usimple_lock.
	413	*
	414	* On success, returns with preemption disabled.
	415	* On failure, returns with preemption in the same state
	416	* as when first invoked. Note that the hw_lock routines
	417	* are responsible for maintaining preemption state.
	418	*
	419	* XXX No stats are gathered on a miss; I preserved this
	420	* behavior from the original assembly-language code, but
	421	* doesn't it make sense to log misses? XXX
	422	*/
	423	unsigned int
	424	usimple_lock_try(
	425	usimple_lock_t l)
	426	{
	427	#ifndef MACHINE_SIMPLE_LOCK
	428	unsigned int success;
	429	DECL_PC(pc);
	430
	431	OBTAIN_PC(pc);
	432	USLDBG(usld_lock_try_pre(l, pc));
	433	if ((success = hw_lock_try(&l->interlock))) {
	434	USLDBG(usld_lock_try_post(l, pc));
	435	}
	436	return success;
	437	#else
	438	return(simple_lock_try((simple_lock_t)l));
	439	#endif
	440	}
	441
	442	#if USLOCK_DEBUG
	443	/*
	444	* States of a usimple_lock. The default when initializing
	445	* a usimple_lock is setting it up for debug checking.
	446	*/
	447	#define USLOCK_CHECKED 0x0001 /* lock is being checked */
	448	#define USLOCK_TAKEN 0x0002 /* lock has been taken */
	449	#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
	450	#define USLOCK_INITIALIZED (USLOCK_INIT\|USLOCK_CHECKED)
	451	#define USLOCK_CHECKING(l) (uslock_check && \
	452	((l)->debug.state & USLOCK_CHECKED))
	453
	454	/*
	455	* Trace activities of a particularly interesting lock.
	456	*/
	457	void usl_trace(usimple_lock_t, int, pc_t, const char *);
	458
	459
	460	/*
	461	* Initialize the debugging information contained
	462	* in a usimple_lock.
	463	*/
	464	void
	465	usld_lock_init(
	466	usimple_lock_t l,
	467	__unused unsigned short tag)
	468	{
	469	if (l == USIMPLE_LOCK_NULL)
	470	panic("lock initialization: null lock pointer");
	471	l->lock_type = USLOCK_TAG;
	472	l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
	473	l->debug.lock_cpu = l->debug.unlock_cpu = 0;
	474	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
	475	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
	476	l->debug.duration[0] = l->debug.duration[1] = 0;
	477	l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
	478	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
	479	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
	480	}
	481
	482
	483	/*
	484	* These checks apply to all usimple_locks, not just
	485	* those with USLOCK_CHECKED turned on.
	486	*/
	487	int
	488	usld_lock_common_checks(
	489	usimple_lock_t l,
	490	char *caller)
	491	{
	492	if (l == USIMPLE_LOCK_NULL)
	493	panic("%s: null lock pointer", caller);
	494	if (l->lock_type != USLOCK_TAG)
	495	panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
	496	if (!(l->debug.state & USLOCK_INIT))
	497	panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
	498	return USLOCK_CHECKING(l);
	499	}
	500
	501
	502	/*
	503	* Debug checks on a usimple_lock just before attempting
	504	* to acquire it.
	505	*/
	506	/* ARGSUSED */
	507	void
	508	usld_lock_pre(
	509	usimple_lock_t l,
	510	pc_t pc)
	511	{
	512	char caller[] = "usimple_lock";
	513
	514
	515	if (!usld_lock_common_checks(l, caller))
	516	return;
	517
	518	/*
	519	* Note that we have a weird case where we are getting a lock when we are]
	520	* in the process of putting the system to sleep. We are running with no
	521	* current threads, therefore we can't tell if we are trying to retake a lock
	522	* we have or someone on the other processor has it. Therefore we just
	523	* ignore this test if the locking thread is 0.
	524	*/
	525
	526	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
	527	l->debug.lock_thread == (void *) current_thread()) {
	528	printf("%s: lock %p already locked (at %p) by",
	529	caller, l, l->debug.lock_pc);
	530	printf(" current thread %p (new attempt at pc %p)\n",
	531	l->debug.lock_thread, pc);
	532	panic("%s", caller);
	533	}
	534	mp_disable_preemption();
	535	usl_trace(l, cpu_number(), pc, caller);
	536	mp_enable_preemption();
	537	}
	538
	539
	540	/*
	541	* Debug checks on a usimple_lock just after acquiring it.
	542	*
	543	* Pre-emption has been disabled at this point,
	544	* so we are safe in using cpu_number.
	545	*/
	546	void
	547	usld_lock_post(
	548	usimple_lock_t l,
	549	pc_t pc)
	550	{
	551	register int mycpu;
	552	char caller[] = "successful usimple_lock";
	553
	554
	555	if (!usld_lock_common_checks(l, caller))
	556	return;
	557
	558	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
	559	panic("%s: lock %p became uninitialized",
	560	caller, l);
	561	if ((l->debug.state & USLOCK_TAKEN))
	562	panic("%s: lock 0x%p became TAKEN by someone else",
	563	caller, l);
	564
	565	mycpu = cpu_number();
	566	l->debug.lock_thread = (void *)current_thread();
	567	l->debug.state \|= USLOCK_TAKEN;
	568	l->debug.lock_pc = pc;
	569	l->debug.lock_cpu = mycpu;
	570
	571	usl_trace(l, mycpu, pc, caller);
	572	}
	573
	574
	575	/*
	576	* Debug checks on a usimple_lock just before
	577	* releasing it. Note that the caller has not
	578	* yet released the hardware lock.
	579	*
	580	* Preemption is still disabled, so there's
	581	* no problem using cpu_number.
	582	*/
	583	void
	584	usld_unlock(
	585	usimple_lock_t l,
	586	pc_t pc)
	587	{
	588	register int mycpu;
	589	char caller[] = "usimple_unlock";
	590
	591
	592	if (!usld_lock_common_checks(l, caller))
	593	return;
	594
	595	mycpu = cpu_number();
	596
	597	if (!(l->debug.state & USLOCK_TAKEN))
	598	panic("%s: lock 0x%p hasn't been taken",
	599	caller, l);
	600	if (l->debug.lock_thread != (void *) current_thread())
	601	panic("%s: unlocking lock 0x%p, owned by thread %p",
	602	caller, l, l->debug.lock_thread);
	603	if (l->debug.lock_cpu != mycpu) {
	604	printf("%s: unlocking lock 0x%p on cpu 0x%x",
	605	caller, l, mycpu);
	606	printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
	607	panic("%s", caller);
	608	}
	609	usl_trace(l, mycpu, pc, caller);
	610
	611	l->debug.unlock_thread = l->debug.lock_thread;
	612	l->debug.lock_thread = INVALID_PC;
	613	l->debug.state &= ~USLOCK_TAKEN;
	614	l->debug.unlock_pc = pc;
	615	l->debug.unlock_cpu = mycpu;
	616	}
	617
	618
	619	/*
	620	* Debug checks on a usimple_lock just before
	621	* attempting to acquire it.
	622	*
	623	* Preemption isn't guaranteed to be disabled.
	624	*/
	625	void
	626	usld_lock_try_pre(
	627	usimple_lock_t l,
	628	pc_t pc)
	629	{
	630	char caller[] = "usimple_lock_try";
	631
	632	if (!usld_lock_common_checks(l, caller))
	633	return;
	634	mp_disable_preemption();
	635	usl_trace(l, cpu_number(), pc, caller);
	636	mp_enable_preemption();
	637	}
	638
	639
	640	/*
	641	* Debug checks on a usimple_lock just after
	642	* successfully attempting to acquire it.
	643	*
	644	* Preemption has been disabled by the
	645	* lock acquisition attempt, so it's safe
	646	* to use cpu_number.
	647	*/
	648	void
	649	usld_lock_try_post(
	650	usimple_lock_t l,
	651	pc_t pc)
	652	{
	653	register int mycpu;
	654	char caller[] = "successful usimple_lock_try";
	655
	656	if (!usld_lock_common_checks(l, caller))
	657	return;
	658
	659	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
	660	panic("%s: lock 0x%p became uninitialized",
	661	caller, l);
	662	if ((l->debug.state & USLOCK_TAKEN))
	663	panic("%s: lock 0x%p became TAKEN by someone else",
	664	caller, l);
	665
	666	mycpu = cpu_number();
	667	l->debug.lock_thread = (void *) current_thread();
	668	l->debug.state \|= USLOCK_TAKEN;
	669	l->debug.lock_pc = pc;
	670	l->debug.lock_cpu = mycpu;
	671
	672	usl_trace(l, mycpu, pc, caller);
	673	}
	674
	675
	676	/*
	677	* For very special cases, set traced_lock to point to a
	678	* specific lock of interest. The result is a series of
	679	* XPRs showing lock operations on that lock. The lock_seq
	680	* value is used to show the order of those operations.
	681	*/
	682	usimple_lock_t traced_lock;
	683	unsigned int lock_seq;
	684
	685	void
	686	usl_trace(
	687	usimple_lock_t l,
	688	int mycpu,
	689	pc_t pc,
	690	const char * op_name)
	691	{
	692	if (traced_lock == l) {
	693	XPR(XPR_SLOCK,
	694	"seq %d, cpu %d, %s @ %x\n",
	695	(uintptr_t) lock_seq, (uintptr_t) mycpu,
	696	(uintptr_t) op_name, (uintptr_t) pc, 0);
	697	lock_seq++;
	698	}
	699	}
	700
	701
	702	#endif /* USLOCK_DEBUG */
	703
	704	/*
	705	* Routine: lck_rw_alloc_init
	706	*/
	707	lck_rw_t *
	708	lck_rw_alloc_init(
	709	lck_grp_t *grp,
	710	lck_attr_t *attr) {
	711	lck_rw_t *lck;
	712
	713	if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
	714	bzero(lck, sizeof(lck_rw_t));
	715	lck_rw_init(lck, grp, attr);
	716	}
	717
	718	return(lck);
	719	}
	720
	721	/*
	722	* Routine: lck_rw_free
	723	*/
	724	void
	725	lck_rw_free(
	726	lck_rw_t *lck,
	727	lck_grp_t *grp) {
	728	lck_rw_destroy(lck, grp);
	729	kfree(lck, sizeof(lck_rw_t));
	730	}
	731
	732	/*
	733	* Routine: lck_rw_init
	734	*/
	735	void
	736	lck_rw_init(
	737	lck_rw_t *lck,
	738	lck_grp_t *grp,
	739	lck_attr_t *attr)
	740	{
	741	lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
	742	attr : &LockDefaultLckAttr;
	743
	744	hw_lock_byte_init(&lck->lck_rw_interlock);
	745	lck->lck_rw_want_write = FALSE;
	746	lck->lck_rw_want_upgrade = FALSE;
	747	lck->lck_rw_shared_count = 0;
	748	lck->lck_rw_can_sleep = TRUE;
	749	lck->lck_r_waiting = lck->lck_w_waiting = 0;
	750	lck->lck_rw_tag = 0;
	751	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
	752	LCK_ATTR_RW_SHARED_PRIORITY) == 0);
	753
	754	lck_grp_reference(grp);
	755	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
	756	}
	757
	758	/*
	759	* Routine: lck_rw_destroy
	760	*/
	761	void
	762	lck_rw_destroy(
	763	lck_rw_t *lck,
	764	lck_grp_t *grp)
	765	{
	766	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
	767	return;
	768	#if MACH_LDEBUG
	769	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
	770	#endif
	771	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
	772	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
	773	lck_grp_deallocate(grp);
	774	return;
	775	}
	776
	777	/*
	778	* Sleep locks. These use the same data structure and algorithm
	779	* as the spin locks, but the process sleeps while it is waiting
	780	* for the lock. These work on uniprocessor systems.
	781	*/
	782
	783	#define DECREMENTER_TIMEOUT 1000000
	784
	785	#define RW_LOCK_READER_EVENT(x) \
	786	((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
	787
	788	#define RW_LOCK_WRITER_EVENT(x) \
	789	((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
	790
	791	/*
	792	* We disable interrupts while holding the RW interlock to prevent an
	793	* interrupt from exacerbating hold time.
	794	* Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
	795	*/
	796	static boolean_t
	797	lck_interlock_lock(lck_rw_t *lck)
	798	{
	799	boolean_t istate;
	800
	801	istate = ml_set_interrupts_enabled(FALSE);
	802	hw_lock_byte_lock(&lck->lck_rw_interlock);
	803
	804	return istate;
	805	}
	806
	807	static void
	808	lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
	809	{
	810	hw_lock_byte_unlock(&lck->lck_rw_interlock);
	811	ml_set_interrupts_enabled(istate);
	812	}
	813
	814	/*
	815	* This inline is used when busy-waiting for an rw lock.
	816	* If interrupts were disabled when the lock primitive was called,
	817	* we poll the IPI handler for pending tlb flushes.
	818	* XXX This is a hack to avoid deadlocking on the pmap_system_lock.
	819	*/
	820	static inline void
	821	lck_rw_lock_pause(boolean_t interrupts_enabled)
	822	{
	823	if (!interrupts_enabled)
	824	handle_pending_TLB_flushes();
	825	cpu_pause();
	826	}
	827
	828
	829	/*
	830	* compute the deadline to spin against when
	831	* waiting for a change of state on a lck_rw_t
	832	*/
	833	static inline uint64_t
	834	lck_rw_deadline_for_spin(lck_rw_t *lck)
	835	{
	836	if (lck->lck_rw_can_sleep) {
	837	if (lck->lck_r_waiting \|\| lck->lck_w_waiting \|\| lck->lck_rw_shared_count > machine_info.max_cpus) {
	838	/*
	839	* there are already threads waiting on this lock... this
	840	* implies that they have spun beyond their deadlines waiting for
	841	* the desired state to show up so we will not bother spinning at this time...
	842	* or
	843	* the current number of threads sharing this lock exceeds our capacity to run them
	844	* concurrently and since all states we're going to spin for require the rw_shared_count
	845	* to be at 0, we'll not bother spinning since the latency for this to happen is
	846	* unpredictable...
	847	*/
	848	return (mach_absolute_time());
	849	}
	850	return (mach_absolute_time() + MutexSpin);
	851	} else
	852	return (mach_absolute_time() + (100000LL * 1000000000LL));
	853	}
	854
	855
	856	/*
	857	* Routine: lck_rw_lock_exclusive_gen
		*
		* Acquire lck for exclusive (write) access. The work is done in
		* two phases:
		* 1. obtain the lck_rw_want_write bit, either through
		* lck_rw_grab_want() or by setting it under the interlock;
		* 2. wait until lck_rw_held_read_or_upgrade() reports that no
		* readers or upgrader remain.
		* In each phase we first spin until the deadline computed by
		* lck_rw_deadline_for_spin(); if that expires and the lock is
		* allowed to sleep (lck_rw_can_sleep) we block uninterruptibly on
		* RW_LOCK_WRITER_EVENT, otherwise we go around and spin again.
		*
		* With CONFIG_DTRACE, the time spent spinning or blocked is
		* reported through the lockstat probes.
	858	*/
	859	void
	860	lck_rw_lock_exclusive_gen(
	861	lck_rw_t *lck)
	862	{
	863	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	864	uint64_t deadline = 0;
	865	int slept = 0;
	866	int gotlock = 0;
	867	int lockheld = 0;
	868	wait_result_t res = 0;
		/* -1 marks "interrupt state not sampled yet"; it is filled in lazily below */
	869	boolean_t istate = -1;
	870
	871	#if CONFIG_DTRACE
	872	boolean_t dtrace_ls_initialized = FALSE;
	873	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
	874	uint64_t wait_interval = 0;
	875	int readers_at_sleep = 0;
	876	#endif
	877
	878	/*
	879	* Try to acquire the lck_rw_want_write bit.
	880	*/
	881	while ( !lck_rw_grab_want(lck)) {
	882
	883	#if CONFIG_DTRACE
		/* sample the lockstat probe state only once per call */
	884	if (dtrace_ls_initialized == FALSE) {
	885	dtrace_ls_initialized = TRUE;
	886	dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
	887	dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
	888	dtrace_ls_enabled = dtrace_rwl_excl_spin \|\| dtrace_rwl_excl_block;
	889	if (dtrace_ls_enabled) {
	890	/*
	891	* Either sleeping or spinning is happening,
	892	* start a timing of our delay interval now.
	893	*/
	894	readers_at_sleep = lck->lck_rw_shared_count;
	895	wait_interval = mach_absolute_time();
	896	}
	897	}
	898	#endif
	899	if (istate == -1)
	900	istate = ml_get_interrupts_enabled();
	901
	902	deadline = lck_rw_deadline_for_spin(lck);
	903
	904	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) \| DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
	905
		/* spin until we grab want_write or the spin deadline passes */
	906	while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
	907	lck_rw_lock_pause(istate);
	908
	909	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) \| DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
	910
	911	if (gotlock)
	912	break;
	913	/*
	914	* if we get here, the deadline has expired w/o us
	915	* being able to grab the lock exclusively
	916	* check to see if we're allowed to do a thread_block
	917	*/
	918	if (lck->lck_rw_can_sleep) {
	919
	920	istate = lck_interlock_lock(lck);
	921
		/* re-check under the interlock: another writer still owns want_write */
	922	if (lck->lck_rw_want_write) {
	923
	924	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) \| DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
	925
	926	lck->lck_w_waiting = TRUE;
	927
		/* queue the wait before dropping the interlock, then block */
	928	res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
	929	lck_interlock_unlock(lck, istate);
	930
	931	if (res == THREAD_WAITING) {
	932	res = thread_block(THREAD_CONTINUE_NULL);
	933	slept++;
	934	}
	935	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) \| DBG_FUNC_END, trace_lck, res, slept, 0, 0);
	936	} else {
		/* want_write was clear under the interlock: claim it directly */
	937	lck->lck_rw_want_write = TRUE;
	938	lck_interlock_unlock(lck, istate);
	939	break;
	940	}
	941	}
	942	}
	943	/*
	944	* Wait for readers (and upgrades) to finish...
	945	* the test for these conditions must be done simultaneously with
	946	* a check of the interlock not being held since
	947	* the rw_shared_count will drop to 0 first and then want_upgrade
	948	* will be set to 1 in the shared_to_exclusive scenario... those
	949	* adjustments are done behind the interlock and represent an
	950	* atomic change in state and must be considered as such
	951	* however, once we see the read count at 0, the want_upgrade not set
	952	* and the interlock not held, we are safe to proceed
	953	*/
	954	while (lck_rw_held_read_or_upgrade(lck)) {
	955
	956	#if CONFIG_DTRACE
	957	/*
	958	* Either sleeping or spinning is happening, start
	959	* a timing of our delay interval now. The probe state is
	960	* sampled only once per call; if no probe is enabled at that
	961	* point, dtrace_ls_enabled stays FALSE and no event is recorded.
	962	*/
	963	if (dtrace_ls_initialized == FALSE) {
	964	dtrace_ls_initialized = TRUE;
	965	dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
	966	dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
	967	dtrace_ls_enabled = dtrace_rwl_excl_spin \|\| dtrace_rwl_excl_block;
	968	if (dtrace_ls_enabled) {
	969	/*
	970	* Either sleeping or spinning is happening,
	971	* start a timing of our delay interval now.
	972	*/
	973	readers_at_sleep = lck->lck_rw_shared_count;
	974	wait_interval = mach_absolute_time();
	975	}
	976	}
	977	#endif
	978	if (istate == -1)
	979	istate = ml_get_interrupts_enabled();
	980
	981	deadline = lck_rw_deadline_for_spin(lck);
	982
	983	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) \| DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
	984
		/* spin until readers/upgrader are gone or the spin deadline passes */
	985	while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
	986	lck_rw_lock_pause(istate);
	987
	988	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) \| DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
	989
	990	if ( !lockheld)
	991	break;
	992	/*
	993	* if we get here, the deadline has expired w/o us
	994	* being able to grab the lock exclusively
	995	* check to see if we're allowed to do a thread_block
	996	*/
	997	if (lck->lck_rw_can_sleep) {
	998
	999	istate = lck_interlock_lock(lck);
	1000
	1001	if (lck->lck_rw_shared_count != 0 \|\| lck->lck_rw_want_upgrade) {
	1002	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) \| DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
	1003
	1004	lck->lck_w_waiting = TRUE;
	1005
		/* queue the wait before dropping the interlock, then block */
	1006	res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
	1007	lck_interlock_unlock(lck, istate);
	1008
	1009	if (res == THREAD_WAITING) {
	1010	res = thread_block(THREAD_CONTINUE_NULL);
	1011	slept++;
	1012	}
	1013	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) \| DBG_FUNC_END, trace_lck, res, slept, 0, 0);
	1014	} else {
	1015	lck_interlock_unlock(lck, istate);
	1016	/*
	1017	* must own the lock now, since we checked for
	1018	* readers or upgrade owner behind the interlock
	1019	* no need for a call to 'lck_rw_held_read_or_upgrade'
	1020	*/
	1021	break;
	1022	}
	1023	}
	1024	}
	1025
	1026	#if CONFIG_DTRACE
	1027	/*
	1028	* Decide what latencies we suffered that are Dtrace events.
	1029	* If we have set wait_interval, then we either spun or slept.
	1030	* At least we get out from under the interlock before we record
	1031	* which is the best we can do here to minimize the impact
	1032	* of the tracing.
	1033	* If dtrace_ls_enabled is FALSE, then no probe was enabled when we
	1034	* started sleeping/spinning so we don't record this event.
	1035	*/
	1036	if (dtrace_ls_enabled == TRUE) {
	1037	if (slept == 0) {
	1038	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
	1039	mach_absolute_time() - wait_interval, 1);
	1040	} else {
	1041	/*
	1042	* For the blocking case, we also record if when we blocked
	1043	* it was held for read or write, and how many readers.
	1044	* Notice that above we recorded this before we dropped
	1045	* the interlock so the count is accurate.
		* NOTE(review): in this function readers_at_sleep is sampled at
		* the top of the wait loops, before lck_interlock_lock() is
		* called -- confirm the accuracy claim above.
	1046	*/
	1047	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
	1048	mach_absolute_time() - wait_interval, 1,
	1049	(readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	1050	}
	1051	}
	1052	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
	1053	#endif
	1054	}
	1055
	1056
	1057	/*
	1058	* Routine: lck_rw_done_gen
	1059	*
	1060	* called from the assembly language wrapper...
	1061	* prior_lock_state is the value in the 1st
	1062	* word of the lock at the time of a successful
	1063	* atomic compare and exchange with the new value...
	1064	* it represents the state of the lock before we
	1065	* decremented the rw_shared_count or cleared either
	1066	* rw_want_upgrade or rw_want_write and
	1067	* the lck_x_waiting bits... since the wrapper
	1068	* routine has already changed the state atomically,
	1069	* we just need to decide if we should
	1070	* wake up anyone and what value to return... we do
	1071	* this by examining the state of the lock before
	1072	* we changed it
	1073	*/
	1074	lck_rw_type_t
	1075	lck_rw_done_gen(
	1076	lck_rw_t *lck,
	1077	int prior_lock_state)
	1078	{
	1079	lck_rw_t *fake_lck;
	1080	lck_rw_type_t lock_type;
	1081	thread_t thread;
	1082	uint32_t rwlock_count;
	1083
	1084	/*
	1085	* prior_lock state is a snapshot of the 1st word of the
	1086	* lock in question... we'll fake up a pointer to it
	1087	* and carefully not access anything beyond whats defined
	1088	* in the first word of a lck_rw_t
	1089	*/
	1090	fake_lck = (lck_rw_t *)&prior_lock_state;
	1091
	1092	if (fake_lck->lck_rw_shared_count <= 1) {
	1093	if (fake_lck->lck_w_waiting)
	1094	thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
	1095
	1096	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
	1097	thread_wakeup(RW_LOCK_READER_EVENT(lck));
	1098	}
	1099	if (fake_lck->lck_rw_shared_count)
	1100	lock_type = LCK_RW_TYPE_SHARED;
	1101	else
	1102	lock_type = LCK_RW_TYPE_EXCLUSIVE;
	1103
	1104	/* Check if dropping the lock means that we need to unpromote */
	1105	thread = current_thread();
	1106	rwlock_count = thread->rwlock_count--;
	1107	#if MACH_LDEBUG
	1108	if (rwlock_count == 0) {
	1109	panic("rw lock count underflow for thread %p", thread);
	1110	}
	1111	#endif
	1112	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
	1113	/* sched_flags checked without lock, but will be rechecked while clearing */
	1114	lck_rw_clear_promotion(thread);
	1115	}
	1116
	1117	#if CONFIG_DTRACE
	1118	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
	1119	#endif
	1120
	1121	return(lock_type);
	1122	}
	1123
	1124
	1125	/*
	1126	* Routine: lck_rw_unlock
	1127	*/
	1128	void
	1129	lck_rw_unlock(
	1130	lck_rw_t *lck,
	1131	lck_rw_type_t lck_rw_type)
	1132	{
	1133	if (lck_rw_type == LCK_RW_TYPE_SHARED)
	1134	lck_rw_unlock_shared(lck);
	1135	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
	1136	lck_rw_unlock_exclusive(lck);
	1137	else
	1138	panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
	1139	}
	1140
	1141
	1142	/*
	1143	* Routine: lck_rw_unlock_shared
	1144	*/
	1145	void
	1146	lck_rw_unlock_shared(
	1147	lck_rw_t *lck)
	1148	{
	1149	lck_rw_type_t ret;
	1150
	1151	ret = lck_rw_done(lck);
	1152
	1153	if (ret != LCK_RW_TYPE_SHARED)
	1154	panic("lck_rw_unlock(): lock held in mode: %d\n", ret);
	1155	}
	1156
	1157
	1158	/*
	1159	* Routine: lck_rw_unlock_exclusive
	1160	*/
	1161	void
	1162	lck_rw_unlock_exclusive(
	1163	lck_rw_t *lck)
	1164	{
	1165	lck_rw_type_t ret;
	1166
	1167	ret = lck_rw_done(lck);
	1168
	1169	if (ret != LCK_RW_TYPE_EXCLUSIVE)
	1170	panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
	1171	}
	1172
	1173
	1174	/*
	1175	* Routine: lck_rw_lock
	1176	*/
	1177	void
	1178	lck_rw_lock(
	1179	lck_rw_t *lck,
	1180	lck_rw_type_t lck_rw_type)
	1181	{
	1182	if (lck_rw_type == LCK_RW_TYPE_SHARED)
	1183	lck_rw_lock_shared(lck);
	1184	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
	1185	lck_rw_lock_exclusive(lck);
	1186	else
	1187	panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
	1188	}
	1189
	1190
	1191	/*
	1192	* Routine: lck_rw_lock_shared_gen
	1193	* Function:
	1194	* assembly fast path code has determined that this lock
	1195	* is held exclusively... this is where we spin/block
	1196	* until we can acquire the lock in the shared mode
	1197	*/
	1198	void
	1199	lck_rw_lock_shared_gen(
	1200	lck_rw_t *lck)
	1201	{
	1202	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	1203	uint64_t deadline = 0;
	1204	int gotlock = 0;
	1205	int slept = 0;
	1206	wait_result_t res = 0;
	1207	boolean_t istate = -1;
	1208
	1209	#if CONFIG_DTRACE
	1210	uint64_t wait_interval = 0;
	1211	int readers_at_sleep = 0;
	1212	boolean_t dtrace_ls_initialized = FALSE;
	1213	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
	1214	#endif
	1215
	1216	while ( !lck_rw_grab_shared(lck)) {
	1217
	1218	#if CONFIG_DTRACE
	1219	if (dtrace_ls_initialized == FALSE) {
	1220	dtrace_ls_initialized = TRUE;
	1221	dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
	1222	dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
	1223	dtrace_ls_enabled = dtrace_rwl_shared_spin \|\| dtrace_rwl_shared_block;
	1224	if (dtrace_ls_enabled) {
	1225	/*
	1226	* Either sleeping or spinning is happening,
	1227	* start a timing of our delay interval now.
	1228	*/
	1229	readers_at_sleep = lck->lck_rw_shared_count;
	1230	wait_interval = mach_absolute_time();
	1231	}
	1232	}
	1233	#endif
	1234	if (istate == -1)
	1235	istate = ml_get_interrupts_enabled();
	1236
	1237	deadline = lck_rw_deadline_for_spin(lck);
	1238
	1239	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) \| DBG_FUNC_START,
	1240	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
	1241
	1242	while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
	1243	lck_rw_lock_pause(istate);
	1244
	1245	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) \| DBG_FUNC_END,
	1246	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
	1247
	1248	if (gotlock)
	1249	break;
	1250	/*
	1251	* if we get here, the deadline has expired w/o us
	1252	* being able to grab the lock for read
	1253	* check to see if we're allowed to do a thread_block
	1254	*/
	1255	if (lck->lck_rw_can_sleep) {
	1256
	1257	istate = lck_interlock_lock(lck);
	1258
	1259	if ((lck->lck_rw_want_write \|\| lck->lck_rw_want_upgrade) &&
	1260	((lck->lck_rw_shared_count == 0) \|\| lck->lck_rw_priv_excl)) {
	1261
	1262	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) \| DBG_FUNC_START,
	1263	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
	1264
	1265	lck->lck_r_waiting = TRUE;
	1266
	1267	res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
	1268	lck_interlock_unlock(lck, istate);
	1269
	1270	if (res == THREAD_WAITING) {
	1271	res = thread_block(THREAD_CONTINUE_NULL);
	1272	slept++;
	1273	}
	1274	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) \| DBG_FUNC_END,
	1275	trace_lck, res, slept, 0, 0);
	1276	} else {
	1277	lck->lck_rw_shared_count++;
	1278	lck_interlock_unlock(lck, istate);
	1279	break;
	1280	}
	1281	}
	1282	}
	1283
	1284	#if CONFIG_DTRACE
	1285	if (dtrace_ls_enabled == TRUE) {
	1286	if (slept == 0) {
	1287	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
	1288	} else {
	1289	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
	1290	mach_absolute_time() - wait_interval, 0,
	1291	(readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	1292	}
	1293	}
	1294	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
	1295	#endif
	1296	}
	1297
	1298
	1299	/*
	1300	* Routine: lck_rw_lock_shared_to_exclusive_failure
	1301	* Function:
	1302	* assembly fast path code has already dropped our read
	1303	* count and determined that someone else owns 'lck_rw_want_upgrade'
	1304	* if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
	1305	* all we need to do here is determine if a wakeup is needed
	1306	*/
	1307	boolean_t
	1308	lck_rw_lock_shared_to_exclusive_failure(
	1309	lck_rw_t *lck,
	1310	int prior_lock_state)
	1311	{
	1312	lck_rw_t *fake_lck;
	1313	thread_t thread = current_thread();
	1314	uint32_t rwlock_count;
	1315
	1316	/* Check if dropping the lock means that we need to unpromote */
	1317	rwlock_count = thread->rwlock_count--;
	1318	#if MACH_LDEBUG
	1319	if (rwlock_count == 0) {
	1320	panic("rw lock count underflow for thread %p", thread);
	1321	}
	1322	#endif
	1323	if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
	1324	/* sched_flags checked without lock, but will be rechecked while clearing */
	1325	lck_rw_clear_promotion(thread);
	1326	}
	1327
	1328	/*
	1329	* prior_lock state is a snapshot of the 1st word of the
	1330	* lock in question... we'll fake up a pointer to it
	1331	* and carefully not access anything beyond whats defined
	1332	* in the first word of a lck_rw_t
	1333	*/
	1334	fake_lck = (lck_rw_t *)&prior_lock_state;
	1335
	1336	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
	1337	/*
	1338	* Someone else has requested upgrade.
	1339	* Since we've released the read lock, wake
	1340	* him up if he's blocked waiting
	1341	*/
	1342	thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
	1343	}
	1344	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) \| DBG_FUNC_NONE,
	1345	VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
	1346
	1347	return (FALSE);
	1348	}
	1349
	1350
	1351	/*
	1352	* Routine: lck_rw_lock_shared_to_exclusive_failure
	1353	* Function:
	1354	* assembly fast path code has already dropped our read
	1355	* count and successfully acquired 'lck_rw_want_upgrade'
	1356	* we just need to wait for the rest of the readers to drain
	1357	* and then we can return as the exclusive holder of this lock
	1358	*/
	1359	boolean_t
	1360	lck_rw_lock_shared_to_exclusive_success(
	1361	lck_rw_t *lck)
	1362	{
	1363	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	1364	uint64_t deadline = 0;
	1365	int slept = 0;
	1366	int still_shared = 0;
	1367	wait_result_t res;
	1368	boolean_t istate = -1;
	1369
	1370	#if CONFIG_DTRACE
	1371	uint64_t wait_interval = 0;
	1372	int readers_at_sleep = 0;
	1373	boolean_t dtrace_ls_initialized = FALSE;
	1374	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
	1375	#endif
	1376
	1377	while (lck->lck_rw_shared_count != 0) {
	1378
	1379	#if CONFIG_DTRACE
	1380	if (dtrace_ls_initialized == FALSE) {
	1381	dtrace_ls_initialized = TRUE;
	1382	dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
	1383	dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
	1384	dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin \|\| dtrace_rwl_shared_to_excl_block;
	1385	if (dtrace_ls_enabled) {
	1386	/*
	1387	* Either sleeping or spinning is happening,
	1388	* start a timing of our delay interval now.
	1389	*/
	1390	readers_at_sleep = lck->lck_rw_shared_count;
	1391	wait_interval = mach_absolute_time();
	1392	}
	1393	}
	1394	#endif
	1395	if (istate == -1)
	1396	istate = ml_get_interrupts_enabled();
	1397
	1398	deadline = lck_rw_deadline_for_spin(lck);
	1399
	1400	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) \| DBG_FUNC_START,
	1401	trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
	1402
	1403	while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
	1404	lck_rw_lock_pause(istate);
	1405
	1406	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) \| DBG_FUNC_END,
	1407	trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
	1408
	1409	if ( !still_shared)
	1410	break;
	1411	/*
	1412	* if we get here, the deadline has expired w/o
	1413	* the rw_shared_count having drained to 0
	1414	* check to see if we're allowed to do a thread_block
	1415	*/
	1416	if (lck->lck_rw_can_sleep) {
	1417
	1418	istate = lck_interlock_lock(lck);
	1419
	1420	if (lck->lck_rw_shared_count != 0) {
	1421	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) \| DBG_FUNC_START,
	1422	trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
	1423
	1424	lck->lck_w_waiting = TRUE;
	1425
	1426	res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
	1427	lck_interlock_unlock(lck, istate);
	1428
	1429	if (res == THREAD_WAITING) {
	1430	res = thread_block(THREAD_CONTINUE_NULL);
	1431	slept++;
	1432	}
	1433	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) \| DBG_FUNC_END,
	1434	trace_lck, res, slept, 0, 0);
	1435	} else {
	1436	lck_interlock_unlock(lck, istate);
	1437	break;
	1438	}
	1439	}
	1440	}
	1441	#if CONFIG_DTRACE
	1442	/*
	1443	* We infer whether we took the sleep/spin path above by checking readers_at_sleep.
	1444	*/
	1445	if (dtrace_ls_enabled == TRUE) {
	1446	if (slept == 0) {
	1447	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
	1448	} else {
	1449	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
	1450	mach_absolute_time() - wait_interval, 1,
	1451	(readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
	1452	}
	1453	}
	1454	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
	1455	#endif
	1456	return (TRUE);
	1457	}
	1458
	1459
	1460	/*
	1461	* Routine: lck_rw_lock_exclusive_to_shared
	1462	* Function:
	1463	* assembly fast path has already dropped
	1464	* our exclusive state and bumped lck_rw_shared_count
	1465	* all we need to do here is determine if anyone
	1466	* needs to be awakened.
	1467	*/
	1468	void
	1469	lck_rw_lock_exclusive_to_shared_gen(
	1470	lck_rw_t *lck,
	1471	int prior_lock_state)
	1472	{
	1473	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
	1474	lck_rw_t *fake_lck;
	1475
	1476	/*
	1477	* prior_lock state is a snapshot of the 1st word of the
	1478	* lock in question... we'll fake up a pointer to it
	1479	* and carefully not access anything beyond whats defined
	1480	* in the first word of a lck_rw_t
	1481	*/
	1482	fake_lck = (lck_rw_t *)&prior_lock_state;
	1483
	1484	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) \| DBG_FUNC_START,
	1485	trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
	1486
	1487	/*
	1488	* don't wake up anyone waiting to take the lock exclusively
	1489	* since we hold a read count... when the read count drops to 0,
	1490	* the writers will be woken.
	1491	*
	1492	* wake up any waiting readers if we don't have any writers waiting,
	1493	* or the lock is NOT marked as rw_priv_excl (writers have privilege)
	1494	*/
	1495	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
	1496	thread_wakeup(RW_LOCK_READER_EVENT(lck));
	1497
	1498	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) \| DBG_FUNC_END,
	1499	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
	1500
	1501	#if CONFIG_DTRACE
	1502	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
	1503	#endif
	1504	}
	1505
	1506
	1507	/*
	1508	* Routine: lck_rw_try_lock
	1509	*/
	1510	boolean_t
	1511	lck_rw_try_lock(
	1512	lck_rw_t *lck,
	1513	lck_rw_type_t lck_rw_type)
	1514	{
	1515	if (lck_rw_type == LCK_RW_TYPE_SHARED)
	1516	return(lck_rw_try_lock_shared(lck));
	1517	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
	1518	return(lck_rw_try_lock_exclusive(lck));
	1519	else
	1520	panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
	1521	return(FALSE);
	1522	}
	1523
	1524
	1525	void
	1526	lck_rw_assert(
	1527	lck_rw_t *lck,
	1528	unsigned int type)
	1529	{
	1530	switch (type) {
	1531	case LCK_RW_ASSERT_SHARED:
	1532	if (lck->lck_rw_shared_count != 0) {
	1533	return;
	1534	}
	1535	break;
	1536	case LCK_RW_ASSERT_EXCLUSIVE:
	1537	if ((lck->lck_rw_want_write \|\|
	1538	lck->lck_rw_want_upgrade) &&
	1539	lck->lck_rw_shared_count == 0) {
	1540	return;
	1541	}
	1542	break;
	1543	case LCK_RW_ASSERT_HELD:
	1544	if (lck->lck_rw_want_write \|\|
	1545	lck->lck_rw_want_upgrade \|\|
	1546	lck->lck_rw_shared_count != 0) {
	1547	return;
	1548	}
	1549	break;
	1550	case LCK_RW_ASSERT_NOTHELD:
	1551	if (!(lck->lck_rw_want_write \|\|
	1552	lck->lck_rw_want_upgrade \|\|
	1553	lck->lck_rw_shared_count != 0)) {
	1554	return;
	1555	}
	1556	break;
	1557	default:
	1558	break;
	1559	}
	1560
	1561	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, (uint32_t )lck);
	1562	}
	1563
	1564	/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
	1565	void
	1566	lck_rw_clear_promotions_x86(thread_t thread)
	1567	{
	1568	#if MACH_LDEBUG
	1569	/* It's fatal to leave a RW lock locked and return to userspace */
	1570	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
	1571	#else
	1572	/* Paper over the issue */
	1573	thread->rwlock_count = 0;
	1574	lck_rw_clear_promotion(thread);
	1575	#endif
	1576	}
	1577
	1578
	1579	/*
	1580	* Routine: kdp_lck_rw_lock_is_acquired_exclusive
	1581	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
	1582	*/
	1583	boolean_t
	1584	kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
	1585	if (not_in_kdp) {
	1586	panic("panic: rw lock exclusive check done outside of kernel debugger");
	1587	}
	1588	return ((lck->lck_rw_want_upgrade \|\| lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
	1589	}
	1590
	1591
	1592	#ifdef MUTEX_ZONE
	1593	extern zone_t lck_mtx_zone;
	1594	#endif
	1595	/*
	1596	* Routine: lck_mtx_alloc_init
	1597	*/
	1598	lck_mtx_t *
	1599	lck_mtx_alloc_init(
	1600	lck_grp_t *grp,
	1601	lck_attr_t *attr)
	1602	{
	1603	lck_mtx_t *lck;
	1604	#ifdef MUTEX_ZONE
	1605	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
	1606	lck_mtx_init(lck, grp, attr);
	1607	#else
	1608	if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
	1609	lck_mtx_init(lck, grp, attr);
	1610	#endif
	1611	return(lck);
	1612	}
	1613
	1614	/*
	1615	* Routine: lck_mtx_free
	1616	*/
	1617	void
	1618	lck_mtx_free(
	1619	lck_mtx_t *lck,
	1620	lck_grp_t *grp)
	1621	{
	1622	lck_mtx_destroy(lck, grp);
	1623	#ifdef MUTEX_ZONE
	1624	zfree(lck_mtx_zone, lck);
	1625	#else
	1626	kfree(lck, sizeof(lck_mtx_t));
	1627	#endif
	1628	}
	1629
	1630	/*
	1631	* Routine: lck_mtx_ext_init
	1632	*/
	1633	static void
	1634	lck_mtx_ext_init(
	1635	lck_mtx_ext_t *lck,
	1636	lck_grp_t *grp,
	1637	lck_attr_t *attr)
	1638	{
	1639	bzero((void *)lck, sizeof(lck_mtx_ext_t));
	1640
	1641	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
	1642	lck->lck_mtx_deb.type = MUTEX_TAG;
	1643	lck->lck_mtx_attr \|= LCK_MTX_ATTR_DEBUG;
	1644	}
	1645
	1646	lck->lck_mtx_grp = grp;
	1647
	1648	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
	1649	lck->lck_mtx_attr \|= LCK_MTX_ATTR_STAT;
	1650
	1651	lck->lck_mtx.lck_mtx_is_ext = 1;
	1652	lck->lck_mtx.lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
	1653	}
	1654
	1655	/*
	1656	* Routine: lck_mtx_init
	1657	*/
	1658	void
	1659	lck_mtx_init(
	1660	lck_mtx_t *lck,
	1661	lck_grp_t *grp,
	1662	lck_attr_t *attr)
	1663	{
	1664	lck_mtx_ext_t *lck_ext;
	1665	lck_attr_t *lck_attr;
	1666
	1667	if (attr != LCK_ATTR_NULL)
	1668	lck_attr = attr;
	1669	else
	1670	lck_attr = &LockDefaultLckAttr;
	1671
	1672	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
	1673	if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
	1674	lck_mtx_ext_init(lck_ext, grp, lck_attr);
	1675	lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
	1676	lck->lck_mtx_ptr = lck_ext;
	1677	}
	1678	} else {
	1679	lck->lck_mtx_owner = 0;
	1680	lck->lck_mtx_state = 0;
	1681	}
	1682	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
	1683	lck_grp_reference(grp);
	1684	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
	1685	}
	1686
	1687	/*
	1688	* Routine: lck_mtx_init_ext
	1689	*/
	1690	void
	1691	lck_mtx_init_ext(
	1692	lck_mtx_t *lck,
	1693	lck_mtx_ext_t *lck_ext,
	1694	lck_grp_t *grp,
	1695	lck_attr_t *attr)
	1696	{
	1697	lck_attr_t *lck_attr;
	1698
	1699	if (attr != LCK_ATTR_NULL)
	1700	lck_attr = attr;
	1701	else
	1702	lck_attr = &LockDefaultLckAttr;
	1703
	1704	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
	1705	lck_mtx_ext_init(lck_ext, grp, lck_attr);
	1706	lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
	1707	lck->lck_mtx_ptr = lck_ext;
	1708	} else {
	1709	lck->lck_mtx_owner = 0;
	1710	lck->lck_mtx_state = 0;
	1711	}
	1712	lck->lck_mtx_sw.lck_mtxd.lck_mtxd_pad32 = 0xFFFFFFFF;
	1713
	1714	lck_grp_reference(grp);
	1715	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
	1716	}
	1717
	1718	/*
	1719	* Routine: lck_mtx_destroy
	1720	*/
	1721	void
	1722	lck_mtx_destroy(
	1723	lck_mtx_t *lck,
	1724	lck_grp_t *grp)
	1725	{
	1726	boolean_t lck_is_indirect;
	1727
	1728	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
	1729	return;
	1730	#if MACH_LDEBUG
	1731	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
	1732	#endif
	1733	lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
	1734
	1735	lck_mtx_lock_mark_destroyed(lck);
	1736
	1737	if (lck_is_indirect)
	1738	kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
	1739	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
	1740	lck_grp_deallocate(grp);
	1741	return;
	1742	}
	1743
	1744
	1745	#define LCK_MTX_LCK_WAIT_CODE 0x20
	1746	#define LCK_MTX_LCK_WAKEUP_CODE 0x21
	1747	#define LCK_MTX_LCK_SPIN_CODE 0x22
	1748	#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
	1749	#define LCK_MTX_LCK_DEMOTE_CODE 0x24
	1750
	1751
	1752	/*
	1753	* Routine: lck_mtx_unlock_wakeup_x86
	1754	*
	1755	* Invoked on unlock when there is
	1756	* contention (i.e. the assembly routine sees that
	1757	* that mutex->lck_mtx_waiters != 0 or
	1758	* that mutex->lck_mtx_promoted != 0...
	1759	*
	1760	* neither the mutex or interlock is held
	1761	*/
	1762	void
	1763	lck_mtx_unlock_wakeup_x86 (
	1764	lck_mtx_t *mutex,
	1765	int prior_lock_state)
	1766	{
	1767	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	1768	lck_mtx_t fake_lck;
	1769
	1770	/*
	1771	* prior_lock state is a snapshot of the 2nd word of the
	1772	* lock in question... we'll fake up a lock with the bits
	1773	* copied into place and carefully not access anything
	1774	* beyond whats defined in the second word of a lck_mtx_t
	1775	*/
	1776	fake_lck.lck_mtx_state = prior_lock_state;
	1777
	1778	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) \| DBG_FUNC_START,
	1779	trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
	1780
	1781	if (__probable(fake_lck.lck_mtx_waiters)) {
	1782	if (fake_lck.lck_mtx_waiters > 1)
	1783	thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
	1784	else
	1785	thread_wakeup_one(LCK_MTX_EVENT(mutex));
	1786	}
	1787
	1788	if (__improbable(fake_lck.lck_mtx_promoted)) {
	1789	thread_t thread = current_thread();
	1790
	1791
	1792	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) \| DBG_FUNC_NONE,
	1793	thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
	1794
	1795	if (thread->promotions > 0) {
	1796	spl_t s = splsched();
	1797
	1798	thread_lock(thread);
	1799
	1800	if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
	1801
	1802	thread->sched_flags &= ~TH_SFLAG_PROMOTED;
	1803
	1804	if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
	1805	/* Thread still has a RW lock promotion */
	1806	} else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
	1807	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) \| DBG_FUNC_NONE,
	1808	thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
	1809
	1810	set_sched_pri(thread, DEPRESSPRI);
	1811	}
	1812	else {
	1813	if (thread->base_pri < thread->sched_pri) {
	1814	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) \| DBG_FUNC_NONE,
	1815	thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
	1816
	1817	thread_recompute_sched_pri(thread, FALSE);
	1818	}
	1819	}
	1820	}
	1821	thread_unlock(thread);
	1822	splx(s);
	1823	}
	1824	}
	1825	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) \| DBG_FUNC_END,
	1826	trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
	1827	}
	1828
	1829
	1830	/*
	1831	* Routine: lck_mtx_lock_acquire_x86
	1832	*
	1833	* Invoked on acquiring the mutex when there is
	1834	* contention (i.e. the assembly routine sees that
	1835	* that mutex->lck_mtx_waiters != 0 or
	1836	* thread->was_promoted_on_wakeup != 0)...
	1837	*
	1838	* mutex is owned... interlock is held... preemption is disabled
	1839	*/
	1840	void
	1841	lck_mtx_lock_acquire_x86(
	1842	lck_mtx_t *mutex)
	1843	{
	1844	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	1845	thread_t thread;
	1846	integer_t priority;
	1847	spl_t s;
	1848
	1849	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) \| DBG_FUNC_START,
	1850	trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
	1851
	1852	if (mutex->lck_mtx_waiters)
	1853	priority = mutex->lck_mtx_pri;
	1854	else
	1855	priority = 0;
	1856
	1857	thread = (thread_t)mutex->lck_mtx_owner; /* faster then current_thread() */
	1858
	1859	if (thread->sched_pri < priority \|\| thread->was_promoted_on_wakeup) {
	1860
	1861	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) \| DBG_FUNC_NONE,
	1862	thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
	1863
	1864	s = splsched();
	1865	thread_lock(thread);
	1866
	1867	if (thread->sched_pri < priority) {
	1868	/* Do not promote past promotion ceiling */
	1869	assert(priority <= MAXPRI_PROMOTE);
	1870	set_sched_pri(thread, priority);
	1871	}
	1872	if (mutex->lck_mtx_promoted == 0) {
	1873	mutex->lck_mtx_promoted = 1;
	1874
	1875	thread->promotions++;
	1876	thread->sched_flags \|= TH_SFLAG_PROMOTED;
	1877	}
	1878	thread->was_promoted_on_wakeup = 0;
	1879
	1880	thread_unlock(thread);
	1881	splx(s);
	1882	}
	1883	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) \| DBG_FUNC_END,
	1884	trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
	1885	}
	1886
	1887
	1888	static int
	1889	lck_mtx_interlock_try_lock(lck_mtx_t mutex, boolean_t istate)
	1890	{
	1891	int retval;
	1892
	1893	*istate = ml_set_interrupts_enabled(FALSE);
	1894	retval = lck_mtx_ilk_try_lock(mutex);
	1895
	1896	if (retval == 0)
	1897	ml_set_interrupts_enabled(*istate);
	1898
	1899	return retval;
	1900	}
	1901
	1902	static void
	1903	lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
	1904	{
	1905	lck_mtx_ilk_unlock(mutex);
	1906	ml_set_interrupts_enabled(istate);
	1907	}
	1908
	1909
	1910	/*
	1911	* Routine: lck_mtx_lock_spinwait_x86
	1912	*
	1913	* Invoked trying to acquire a mutex when there is contention but
	1914	* the holder is running on another processor. We spin for up to a maximum
	1915	* time waiting for the lock to be released.
	1916	*
	1917	* Called with the interlock unlocked.
	1918	* returns 0 if mutex acquired
	1919	* returns 1 if we spun
	1920	* returns 2 if we didn't spin due to the holder not running
	1921	*/
	1922	int
	1923	lck_mtx_lock_spinwait_x86(
	1924	lck_mtx_t *mutex)
	1925	{
	1926	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	1927	thread_t holder;
	1928	uint64_t overall_deadline;
	1929	uint64_t check_owner_deadline;
	1930	uint64_t cur_time;
	1931	int retval = 1;
	1932	int loopcount = 0;
	1933
	1934	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) \| DBG_FUNC_START,
	1935	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
	1936
	1937	cur_time = mach_absolute_time();
	1938	overall_deadline = cur_time + MutexSpin;
	1939	check_owner_deadline = cur_time;
	1940
	1941	/*
	1942	* Spin while:
	1943	* - mutex is locked, and
	1944	* - its locked as a spin lock, and
	1945	* - owner is running on another processor, and
	1946	* - owner (processor) is not idling, and
	1947	* - we haven't spun for long enough.
	1948	*/
	1949	do {
	1950	if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
	1951	retval = 0;
	1952	break;
	1953	}
	1954	cur_time = mach_absolute_time();
	1955
	1956	if (cur_time >= overall_deadline)
	1957	break;
	1958
	1959	if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
	1960	boolean_t istate;
	1961
	1962	if (lck_mtx_interlock_try_lock(mutex, &istate)) {
	1963
	1964	if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
	1965
	1966	if ( !(holder->machine.specFlags & OnProc) \|\|
	1967	(holder->state & TH_IDLE)) {
	1968
	1969	lck_mtx_interlock_unlock(mutex, istate);
	1970
	1971	if (loopcount == 0)
	1972	retval = 2;
	1973	break;
	1974	}
	1975	}
	1976	lck_mtx_interlock_unlock(mutex, istate);
	1977
	1978	check_owner_deadline = cur_time + (MutexSpin / 4);
	1979	}
	1980	}
	1981	cpu_pause();
	1982
	1983	loopcount++;
	1984
	1985	} while (TRUE);
	1986
	1987	#if CONFIG_DTRACE
	1988	/*
	1989	* We've already kept a count via overall_deadline of how long we spun.
	1990	* If dtrace is active, then we compute backwards to decide how
	1991	* long we spun.
	1992	*
	1993	* Note that we record a different probe id depending on whether
	1994	* this is a direct or indirect mutex. This allows us to
	1995	* penalize only lock groups that have debug/stats enabled
	1996	* with dtrace processing if desired.
	1997	*/
	1998	if (__probable(mutex->lck_mtx_is_ext == 0)) {
	1999	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
	2000	mach_absolute_time() - (overall_deadline - MutexSpin));
	2001	} else {
	2002	LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
	2003	mach_absolute_time() - (overall_deadline - MutexSpin));
	2004	}
	2005	/* The lockstat acquire event is recorded by the assembly code beneath us. */
	2006	#endif
	2007
	2008	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) \| DBG_FUNC_END,
	2009	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
	2010
	2011	return retval;
	2012	}
	2013
	2014
	2015
	2016	/*
	2017	* Routine: lck_mtx_lock_wait_x86
	2018	*
	2019	* Invoked in order to wait on contention.
	2020	*
	2021	* Called with the interlock locked and
	2022	* preemption disabled...
	2023	* returns it unlocked and with preemption enabled
	2024	*/
	2025	void
	2026	lck_mtx_lock_wait_x86 (
	2027	lck_mtx_t *mutex)
	2028	{
	2029	__kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
	2030	thread_t self = current_thread();
	2031	thread_t holder;
	2032	integer_t priority;
	2033	spl_t s;
	2034	#if CONFIG_DTRACE
	2035	uint64_t sleep_start = 0;
	2036
	2037	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] \|\| lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
	2038	sleep_start = mach_absolute_time();
	2039	}
	2040	#endif
	2041	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) \| DBG_FUNC_START,
	2042	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
	2043
	2044	priority = self->sched_pri;
	2045
	2046	if (priority < self->base_pri)
	2047	priority = self->base_pri;
	2048	if (priority < BASEPRI_DEFAULT)
	2049	priority = BASEPRI_DEFAULT;
	2050
	2051	/* Do not promote past promotion ceiling */
	2052	priority = MIN(priority, MAXPRI_PROMOTE);
	2053
	2054	if (mutex->lck_mtx_waiters == 0 \|\| priority > mutex->lck_mtx_pri)
	2055	mutex->lck_mtx_pri = priority;
	2056	mutex->lck_mtx_waiters++;
	2057
	2058	if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
	2059	holder->sched_pri < mutex->lck_mtx_pri ) {
	2060	s = splsched();
	2061	thread_lock(holder);
	2062
	2063	/* holder priority may have been bumped by another thread
	2064	* before thread_lock was taken
	2065	*/
	2066	if (holder->sched_pri < mutex->lck_mtx_pri) {
	2067	KERNEL_DEBUG_CONSTANT(
	2068	MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) \| DBG_FUNC_NONE,
	2069	holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
	2070	/* Assert that we're not altering the priority of a
	2071	* thread above the MAXPRI_PROMOTE band
	2072	*/
	2073	assert(holder->sched_pri < MAXPRI_PROMOTE);
	2074	set_sched_pri(holder, priority);
	2075
	2076	if (mutex->lck_mtx_promoted == 0) {
	2077	holder->promotions++;
	2078	holder->sched_flags \|= TH_SFLAG_PROMOTED;
	2079
	2080	mutex->lck_mtx_promoted = 1;
	2081	}
	2082	}
	2083	thread_unlock(holder);
	2084	splx(s);
	2085	}
	2086	assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
	2087
	2088	lck_mtx_ilk_unlock(mutex);
	2089
	2090	thread_block(THREAD_CONTINUE_NULL);
	2091
	2092	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) \| DBG_FUNC_END,
	2093	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
	2094
	2095	#if CONFIG_DTRACE
	2096	/*
	2097	* Record the Dtrace lockstat probe for blocking, block time
	2098	* measured from when we were entered.
	2099	*/
	2100	if (sleep_start) {
	2101	if (mutex->lck_mtx_is_ext == 0) {
	2102	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
	2103	mach_absolute_time() - sleep_start);
	2104	} else {
	2105	LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
	2106	mach_absolute_time() - sleep_start);
	2107	}
	2108	}
	2109	#endif
	2110	}
	2111
	2112	/*
	2113	* Routine: kdp_lck_mtx_lock_spin_is_acquired
	2114	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
	2115	* Returns: TRUE if lock is acquired.
	2116	*/
	2117	boolean_t
	2118	kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
	2119	{
	2120	if (not_in_kdp) {
	2121	panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
	2122	}
	2123
	2124	if (lck->lck_mtx_sw.lck_mtxd.lck_mtxd_ilocked \|\| lck->lck_mtx_sw.lck_mtxd.lck_mtxd_mlocked) {
	2125	return TRUE;
	2126	}
	2127
	2128	return FALSE;
	2129	}
	2130