1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 * File: kern/lock.c
58 * Author: Avadis Tevanian, Jr., Michael Wayne Young
59 * Date: 1985
60 *
61 * Locking primitives implementation
62 */
63
64#include <mach_ldebug.h>
65
66#include <kern/locks.h>
67#include <kern/kalloc.h>
68#include <kern/misc_protos.h>
69#include <kern/thread.h>
70#include <kern/processor.h>
71#include <kern/cpu_data.h>
72#include <kern/cpu_number.h>
73#include <kern/sched_prim.h>
74#include <kern/xpr.h>
75#include <kern/debug.h>
76#include <string.h>
77
78#include <i386/machine_routines.h> /* machine_timeout_suspended() */
79#include <machine/machine_cpu.h>
80#include <i386/mp.h>
81
82#include <sys/kdebug.h>
83#include <mach/branch_predicates.h>
84
85/*
86 * We need only enough declarations from the BSD-side to be able to
87 * test if our probe is active, and to call __dtrace_probe(). Setting
88 * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
89 */
90#if CONFIG_DTRACE
91#define NEED_DTRACE_DEFS
92#include <../bsd/sys/lockstat.h>
93#endif
94
95#define LCK_RW_LCK_EXCLUSIVE_CODE 0x100
96#define LCK_RW_LCK_EXCLUSIVE1_CODE 0x101
97#define LCK_RW_LCK_SHARED_CODE 0x102
98#define LCK_RW_LCK_SH_TO_EX_CODE 0x103
99#define LCK_RW_LCK_SH_TO_EX1_CODE 0x104
100#define LCK_RW_LCK_EX_TO_SH_CODE 0x105
101
102#define LCK_RW_LCK_EX_WRITER_SPIN_CODE 0x106
103#define LCK_RW_LCK_EX_WRITER_WAIT_CODE 0x107
104#define LCK_RW_LCK_EX_READER_SPIN_CODE 0x108
105#define LCK_RW_LCK_EX_READER_WAIT_CODE 0x109
106#define LCK_RW_LCK_SHARED_SPIN_CODE 0x110
107#define LCK_RW_LCK_SHARED_WAIT_CODE 0x111
108#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE 0x112
109#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE 0x113
110
111
112#define ANY_LOCK_DEBUG (USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)
113
114unsigned int LcksOpts=0;
115
116/* Forwards */
117
118#if USLOCK_DEBUG
119/*
120 * Perform simple lock checks.
121 */
122int uslock_check = 1;
123int max_lock_loops = 100000000;
124decl_simple_lock_data(extern , printf_lock)
125decl_simple_lock_data(extern , panic_lock)
126#endif /* USLOCK_DEBUG */
127
128extern unsigned int not_in_kdp;
129
130/*
131 * We often want to know the addresses of the callers
132 * of the various lock routines. However, this information
133 * is only used for debugging and statistics.
134 */
135typedef void *pc_t;
136#define INVALID_PC ((void *) VM_MAX_KERNEL_ADDRESS)
137#define INVALID_THREAD ((void *) VM_MAX_KERNEL_ADDRESS)
138#if ANY_LOCK_DEBUG
139#define OBTAIN_PC(pc) ((pc) = GET_RETURN_PC())
140#define DECL_PC(pc) pc_t pc;
141#else /* ANY_LOCK_DEBUG */
142#define DECL_PC(pc)
143#ifdef lint
144/*
145 * Eliminate lint complaints about unused local pc variables.
146 */
147#define OBTAIN_PC(pc) ++pc
148#else /* lint */
149#define OBTAIN_PC(pc)
150#endif /* lint */
151#endif /* ANY_LOCK_DEBUG */
152
153
154/*
155 * Portable lock package implementation of usimple_locks.
156 */
157
158#if USLOCK_DEBUG
159#define USLDBG(stmt) stmt
160void usld_lock_init(usimple_lock_t, unsigned short);
161void usld_lock_pre(usimple_lock_t, pc_t);
162void usld_lock_post(usimple_lock_t, pc_t);
163void usld_unlock(usimple_lock_t, pc_t);
164void usld_lock_try_pre(usimple_lock_t, pc_t);
165void usld_lock_try_post(usimple_lock_t, pc_t);
166int usld_lock_common_checks(usimple_lock_t, char *);
167#else /* USLOCK_DEBUG */
168#define USLDBG(stmt)
169#endif /* USLOCK_DEBUG */
170
171
172extern int lck_rw_grab_want(lck_rw_t *lck);
173extern int lck_rw_grab_shared(lck_rw_t *lck);
174extern int lck_rw_held_read_or_upgrade(lck_rw_t *lck);
175
176
177/*
178 * Forward definitions
179 */
180
181void lck_rw_lock_shared_gen(
182 lck_rw_t *lck);
183
184void lck_rw_lock_exclusive_gen(
185 lck_rw_t *lck);
186
187boolean_t lck_rw_lock_shared_to_exclusive_success(
188 lck_rw_t *lck);
189
190boolean_t lck_rw_lock_shared_to_exclusive_failure(
191 lck_rw_t *lck,
192 int prior_lock_state);
193
194void lck_rw_lock_exclusive_to_shared_gen(
195 lck_rw_t *lck,
196 int prior_lock_state);
197
198lck_rw_type_t lck_rw_done_gen(
199 lck_rw_t *lck,
200 int prior_lock_state);
201
202void lck_rw_clear_promotions_x86(thread_t thread);
203
204/*
205 * Routine: lck_spin_alloc_init
206 */
207lck_spin_t *
208lck_spin_alloc_init(
209 lck_grp_t *grp,
210 lck_attr_t *attr)
211{
212 lck_spin_t *lck;
213
214 if ((lck = (lck_spin_t *)kalloc(sizeof(lck_spin_t))) != 0)
215 lck_spin_init(lck, grp, attr);
216
217 return(lck);
218}
219
220/*
221 * Routine: lck_spin_free
222 */
223void
224lck_spin_free(
225 lck_spin_t *lck,
226 lck_grp_t *grp)
227{
228 lck_spin_destroy(lck, grp);
229 kfree(lck, sizeof(lck_spin_t));
230}
231
232/*
233 * Routine: lck_spin_init
234 */
235void
236lck_spin_init(
237 lck_spin_t *lck,
238 lck_grp_t *grp,
239 __unused lck_attr_t *attr)
240{
241 usimple_lock_init((usimple_lock_t) lck, 0);
242 lck_grp_reference(grp);
243 lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
244}
245
246/*
247 * Routine: lck_spin_destroy
248 */
249void
250lck_spin_destroy(
251 lck_spin_t *lck,
252 lck_grp_t *grp)
253{
254 if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
255 return;
256 lck->interlock = LCK_SPIN_TAG_DESTROYED;
257 lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
258 lck_grp_deallocate(grp);
259 return;
260}
261
262/*
263 * Routine: lck_spin_lock
264 */
265void
266lck_spin_lock(
267 lck_spin_t *lck)
268{
269 usimple_lock((usimple_lock_t) lck);
270}
271
272/*
273 * Routine: lck_spin_unlock
274 */
275void
276lck_spin_unlock(
277 lck_spin_t *lck)
278{
279 usimple_unlock((usimple_lock_t) lck);
280}
281
282
283/*
284 * Routine: lck_spin_try_lock
285 */
286boolean_t
287lck_spin_try_lock(
288 lck_spin_t *lck)
289{
290 boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
291#if DEVELOPMENT || DEBUG
292 if (lrval) {
293 pltrace(FALSE);
294 }
295#endif
296 return(lrval);
297}
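
/*
 * Illustrative sketch (not part of the original file): typical use of the
 * lck_spin_* interface implemented above.  The group name "example" and the
 * variable names are hypothetical; error handling is elided.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_grp;
static lck_spin_t	*example_slock;

static void
example_spin_usage(void)
{
	example_grp   = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_slock = lck_spin_alloc_init(example_grp, LCK_ATTR_NULL);

	lck_spin_lock(example_slock);		/* returns with preemption disabled */
	/* ... short, non-blocking critical section ... */
	lck_spin_unlock(example_slock);

	if (lck_spin_try_lock(example_slock)) {	/* non-blocking attempt */
		lck_spin_unlock(example_slock);
	}

	lck_spin_free(example_slock, example_grp);
	lck_grp_free(example_grp);
}
#endif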
298
299/*
300 * Routine: lck_spin_assert
301 */
302void
303lck_spin_assert(lck_spin_t *lock, unsigned int type)
304{
305 thread_t thread, holder;
306 uintptr_t state;
307
308 if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
309 panic("lck_spin_assert(): invalid arg (%u)", type);
310 }
311
312 state = lock->interlock;
313 holder = (thread_t)state;
314 thread = current_thread();
315 if (type == LCK_ASSERT_OWNED) {
316 if (__improbable(holder == THREAD_NULL)) {
317 panic("Lock not owned %p = %lx", lock, state);
318 }
319 if (__improbable(holder != thread)) {
320 panic("Lock not owned by current thread %p = %lx", lock, state);
321 }
322 } else if (type == LCK_ASSERT_NOTOWNED) {
323 if (__improbable(holder != THREAD_NULL)) {
324 if (holder == thread) {
325 panic("Lock owned by current thread %p = %lx", lock, state);
326 } else {
327 panic("Lock %p owned by thread %p", lock, holder);
328 }
329 }
330 }
331}
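
/*
 * Illustrative sketch (not part of the original file): a routine that requires
 * its caller to hold a given spin lock can enforce that contract with
 * lck_spin_assert().  The names below are hypothetical.
 */
#if 0	/* example only -- not compiled */
static void
example_requires_slock_held(lck_spin_t *slock)
{
	lck_spin_assert(slock, LCK_ASSERT_OWNED);
	/* ... manipulate state protected by slock ... */
}
#endif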
332
333/*
334 * Routine: kdp_lck_spin_is_acquired
335 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
336 * Returns: TRUE if lock is acquired.
337 */
338boolean_t
339kdp_lck_spin_is_acquired(lck_spin_t *lck) {
340 if (not_in_kdp) {
341 panic("spinlock acquired check done outside of kernel debugger");
342 }
343 return (lck->interlock != 0)? TRUE : FALSE;
344}
345
346/*
347 * Initialize a usimple_lock.
348 *
349 * No change in preemption state.
350 */
351void
352usimple_lock_init(
353 usimple_lock_t l,
354 __unused unsigned short tag)
355{
356#ifndef MACHINE_SIMPLE_LOCK
357 USLDBG(usld_lock_init(l, tag));
358 hw_lock_init(&l->interlock);
359#else
360 simple_lock_init((simple_lock_t)l,tag);
361#endif
362}
363
364volatile uint32_t spinlock_owner_cpu = ~0;
365volatile usimple_lock_t spinlock_timed_out;
366
367uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
368 uint64_t deadline;
369 uint32_t i;
370
371 for (i = 0; i < real_ncpus; i++) {
372 if ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr) {
373 spinlock_owner_cpu = i;
374 if ((uint32_t) cpu_number() == i)
375 break;
376 cpu_datap(i)->cpu_NMI_acknowledged = FALSE;
377 cpu_NMI_interrupt(i);
378 deadline = mach_absolute_time() + (LockTimeOut * 2);
379 while (mach_absolute_time() < deadline && cpu_datap(i)->cpu_NMI_acknowledged == FALSE)
380 cpu_pause();
381 break;
382 }
383 }
384
385 return spinlock_owner_cpu;
386}
387
388/*
389 * Acquire a usimple_lock.
390 *
391 * Returns with preemption disabled. Note
392 * that the hw_lock routines are responsible for
393 * maintaining preemption state.
394 */
395void
396usimple_lock(
397 usimple_lock_t l)
398{
399#ifndef MACHINE_SIMPLE_LOCK
400 DECL_PC(pc);
401
402 OBTAIN_PC(pc);
403 USLDBG(usld_lock_pre(l, pc));
404
405 if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == 0)) {
406 boolean_t uslock_acquired = FALSE;
407 while (machine_timeout_suspended()) {
408 enable_preemption();
409 if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
410 break;
411 }
412
413 if (uslock_acquired == FALSE) {
414 uint32_t lock_cpu;
415 uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
416 spinlock_timed_out = l;
417 lock_cpu = spinlock_timeout_NMI(lowner);
418 panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx", l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data);
419 }
420 }
421#if DEVELOPMENT || DEBUG
422 pltrace(FALSE);
423#endif
424
425 USLDBG(usld_lock_post(l, pc));
426#else
427 simple_lock((simple_lock_t)l);
428#endif
429}
430
431
432/*
433 * Release a usimple_lock.
434 *
435 * Returns with preemption enabled. Note
436 * that the hw_lock routines are responsible for
437 * maintaining preemption state.
438 */
439void
440usimple_unlock(
441 usimple_lock_t l)
442{
443#ifndef MACHINE_SIMPLE_LOCK
444 DECL_PC(pc);
445
446 OBTAIN_PC(pc);
447 USLDBG(usld_unlock(l, pc));
448#if DEVELOPMENT || DEBUG
449 pltrace(TRUE);
450#endif
451 hw_lock_unlock(&l->interlock);
452#else
453 simple_unlock_rwmb((simple_lock_t)l);
454#endif
455}
456
457
458/*
459 * Conditionally acquire a usimple_lock.
460 *
461 * On success, returns with preemption disabled.
462 * On failure, returns with preemption in the same state
463 * as when first invoked. Note that the hw_lock routines
464 * are responsible for maintaining preemption state.
465 *
466 * XXX No stats are gathered on a miss; I preserved this
467 * behavior from the original assembly-language code, but
468 * doesn't it make sense to log misses? XXX
469 */
470unsigned int
471usimple_lock_try(
472 usimple_lock_t l)
473{
474#ifndef MACHINE_SIMPLE_LOCK
475 unsigned int success;
476 DECL_PC(pc);
477
478 OBTAIN_PC(pc);
479 USLDBG(usld_lock_try_pre(l, pc));
480 if ((success = hw_lock_try(&l->interlock))) {
481#if DEVELOPMENT || DEBUG
482 pltrace(FALSE);
483#endif
484 USLDBG(usld_lock_try_post(l, pc));
485 }
486 return success;
487#else
488 return(simple_lock_try((simple_lock_t)l));
489#endif
490}
491
492/*
493 * Acquire a usimple_lock while polling for pending TLB flushes
494 * and spinning on a lock.
495 *
496 */
497void
498usimple_lock_try_lock_loop(usimple_lock_t l)
499{
500 boolean_t istate = ml_get_interrupts_enabled();
501 while (!simple_lock_try((l))) {
502 if (!istate)
503 handle_pending_TLB_flushes();
504 cpu_pause();
505 }
506}
507
508#if USLOCK_DEBUG
509/*
510 * States of a usimple_lock. The default when initializing
511 * a usimple_lock is setting it up for debug checking.
512 */
513#define USLOCK_CHECKED 0x0001 /* lock is being checked */
514#define USLOCK_TAKEN 0x0002 /* lock has been taken */
515#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
516#define USLOCK_INITIALIZED (USLOCK_INIT|USLOCK_CHECKED)
517#define USLOCK_CHECKING(l) (uslock_check && \
518 ((l)->debug.state & USLOCK_CHECKED))
519
520/*
521 * Trace activities of a particularly interesting lock.
522 */
523void usl_trace(usimple_lock_t, int, pc_t, const char *);
524
525
526/*
527 * Initialize the debugging information contained
528 * in a usimple_lock.
529 */
530void
531usld_lock_init(
532 usimple_lock_t l,
533 __unused unsigned short tag)
534{
535 if (l == USIMPLE_LOCK_NULL)
536 panic("lock initialization: null lock pointer");
537 l->lock_type = USLOCK_TAG;
538 l->debug.state = uslock_check ? USLOCK_INITIALIZED : 0;
539 l->debug.lock_cpu = l->debug.unlock_cpu = 0;
540 l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
541 l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
542 l->debug.duration[0] = l->debug.duration[1] = 0;
543 l->debug.unlock_cpu = l->debug.unlock_cpu = 0;
544 l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
545 l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
546}
547
548
549/*
550 * These checks apply to all usimple_locks, not just
551 * those with USLOCK_CHECKED turned on.
552 */
553int
554usld_lock_common_checks(
555 usimple_lock_t l,
556 char *caller)
557{
558 if (l == USIMPLE_LOCK_NULL)
559 panic("%s: null lock pointer", caller);
560 if (l->lock_type != USLOCK_TAG)
561 panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
562 if (!(l->debug.state & USLOCK_INIT))
563 panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
564 return USLOCK_CHECKING(l);
565}
566
567
568/*
569 * Debug checks on a usimple_lock just before attempting
570 * to acquire it.
571 */
572/* ARGSUSED */
573void
574usld_lock_pre(
575 usimple_lock_t l,
576 pc_t pc)
577{
578 char caller[] = "usimple_lock";
579
580
581 if (!usld_lock_common_checks(l, caller))
582 return;
583
584/*
585 * Note that we have a weird case where we are getting a lock when we are
586 * in the process of putting the system to sleep. We are running with no
587 * current threads, therefore we can't tell if we are trying to retake a lock
588 * we have or someone on the other processor has it. Therefore we just
589 * ignore this test if the locking thread is 0.
590 */
591
592 if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
593 l->debug.lock_thread == (void *) current_thread()) {
594 printf("%s: lock %p already locked (at %p) by",
595 caller, l, l->debug.lock_pc);
596 printf(" current thread %p (new attempt at pc %p)\n",
597 l->debug.lock_thread, pc);
598 panic("%s", caller);
599 }
600 mp_disable_preemption();
601 usl_trace(l, cpu_number(), pc, caller);
602 mp_enable_preemption();
603}
604
605
606/*
607 * Debug checks on a usimple_lock just after acquiring it.
608 *
609 * Pre-emption has been disabled at this point,
610 * so we are safe in using cpu_number.
611 */
612void
613usld_lock_post(
614 usimple_lock_t l,
615 pc_t pc)
616{
617 int mycpu;
618 char caller[] = "successful usimple_lock";
619
620
621 if (!usld_lock_common_checks(l, caller))
622 return;
623
624 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
625 panic("%s: lock %p became uninitialized",
626 caller, l);
627 if ((l->debug.state & USLOCK_TAKEN))
628 panic("%s: lock 0x%p became TAKEN by someone else",
629 caller, l);
630
631 mycpu = cpu_number();
632 l->debug.lock_thread = (void *)current_thread();
633 l->debug.state |= USLOCK_TAKEN;
634 l->debug.lock_pc = pc;
635 l->debug.lock_cpu = mycpu;
636
637 usl_trace(l, mycpu, pc, caller);
638}
639
640
641/*
642 * Debug checks on a usimple_lock just before
643 * releasing it. Note that the caller has not
644 * yet released the hardware lock.
645 *
646 * Preemption is still disabled, so there's
647 * no problem using cpu_number.
648 */
649void
650usld_unlock(
651 usimple_lock_t l,
652 pc_t pc)
653{
654 int mycpu;
655 char caller[] = "usimple_unlock";
656
657
658 if (!usld_lock_common_checks(l, caller))
659 return;
660
661 mycpu = cpu_number();
662
663 if (!(l->debug.state & USLOCK_TAKEN))
664 panic("%s: lock 0x%p hasn't been taken",
665 caller, l);
666 if (l->debug.lock_thread != (void *) current_thread())
667 panic("%s: unlocking lock 0x%p, owned by thread %p",
668 caller, l, l->debug.lock_thread);
669 if (l->debug.lock_cpu != mycpu) {
670 printf("%s: unlocking lock 0x%p on cpu 0x%x",
671 caller, l, mycpu);
672 printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
673 panic("%s", caller);
674 }
675 usl_trace(l, mycpu, pc, caller);
676
677 l->debug.unlock_thread = l->debug.lock_thread;
678 l->debug.lock_thread = INVALID_THREAD;
679 l->debug.state &= ~USLOCK_TAKEN;
680 l->debug.unlock_pc = pc;
681 l->debug.unlock_cpu = mycpu;
682}
683
684
685/*
686 * Debug checks on a usimple_lock just before
687 * attempting to acquire it.
688 *
689 * Preemption isn't guaranteed to be disabled.
690 */
691void
692usld_lock_try_pre(
693 usimple_lock_t l,
694 pc_t pc)
695{
696 char caller[] = "usimple_lock_try";
697
698 if (!usld_lock_common_checks(l, caller))
699 return;
700 mp_disable_preemption();
701 usl_trace(l, cpu_number(), pc, caller);
702 mp_enable_preemption();
703}
704
705
706/*
707 * Debug checks on a usimple_lock just after
708 * successfully attempting to acquire it.
709 *
710 * Preemption has been disabled by the
711 * lock acquisition attempt, so it's safe
712 * to use cpu_number.
713 */
714void
715usld_lock_try_post(
716 usimple_lock_t l,
717 pc_t pc)
718{
719 int mycpu;
720 char caller[] = "successful usimple_lock_try";
721
722 if (!usld_lock_common_checks(l, caller))
723 return;
724
725 if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
726 panic("%s: lock 0x%p became uninitialized",
727 caller, l);
728 if ((l->debug.state & USLOCK_TAKEN))
729 panic("%s: lock 0x%p became TAKEN by someone else",
730 caller, l);
731
732 mycpu = cpu_number();
733 l->debug.lock_thread = (void *) current_thread();
734 l->debug.state |= USLOCK_TAKEN;
735 l->debug.lock_pc = pc;
736 l->debug.lock_cpu = mycpu;
737
738 usl_trace(l, mycpu, pc, caller);
739}
740
741
742/*
743 * For very special cases, set traced_lock to point to a
744 * specific lock of interest. The result is a series of
745 * XPRs showing lock operations on that lock. The lock_seq
746 * value is used to show the order of those operations.
747 */
748usimple_lock_t traced_lock;
749unsigned int lock_seq;
750
751void
752usl_trace(
753 usimple_lock_t l,
754 int mycpu,
755 pc_t pc,
756 const char * op_name)
757{
758 if (traced_lock == l) {
759 XPR(XPR_SLOCK,
760 "seq %d, cpu %d, %s @ %x\n",
761 (uintptr_t) lock_seq, (uintptr_t) mycpu,
762 (uintptr_t) op_name, (uintptr_t) pc, 0);
763 lock_seq++;
764 }
765}
766
767
768#endif /* USLOCK_DEBUG */
769
770/*
771 * Routine: lck_rw_alloc_init
772 */
773lck_rw_t *
774lck_rw_alloc_init(
775 lck_grp_t *grp,
776 lck_attr_t *attr) {
777 lck_rw_t *lck;
778
779 if ((lck = (lck_rw_t *)kalloc(sizeof(lck_rw_t))) != 0) {
780 bzero(lck, sizeof(lck_rw_t));
781 lck_rw_init(lck, grp, attr);
782 }
783
784 return(lck);
785}
786
787/*
788 * Routine: lck_rw_free
789 */
790void
791lck_rw_free(
792 lck_rw_t *lck,
793 lck_grp_t *grp) {
794 lck_rw_destroy(lck, grp);
795 kfree(lck, sizeof(lck_rw_t));
796}
797
798/*
799 * Routine: lck_rw_init
800 */
801void
802lck_rw_init(
803 lck_rw_t *lck,
804 lck_grp_t *grp,
805 lck_attr_t *attr)
806{
807 lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
808 attr : &LockDefaultLckAttr;
809
810 hw_lock_byte_init(&lck->lck_rw_interlock);
811 lck->lck_rw_want_write = FALSE;
812 lck->lck_rw_want_upgrade = FALSE;
813 lck->lck_rw_shared_count = 0;
814 lck->lck_rw_can_sleep = TRUE;
815 lck->lck_r_waiting = lck->lck_w_waiting = 0;
816 lck->lck_rw_tag = 0;
817 lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
818 LCK_ATTR_RW_SHARED_PRIORITY) == 0);
819
820 lck_grp_reference(grp);
821 lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
822}
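
/*
 * Illustrative sketch (not part of the original file): lck_rw_priv_excl
 * defaults to TRUE (waiting writers block new readers).  Initializing the lock
 * with an attribute carrying LCK_ATTR_RW_SHARED_PRIORITY clears it.  The
 * helper lck_attr_rw_shared_priority() and the names below are assumptions
 * made for the example.
 */
#if 0	/* example only -- not compiled */
	lck_attr_t	*attr = lck_attr_alloc_init();

	lck_attr_rw_shared_priority(attr);	/* readers are not held up behind waiting writers */
	lck_rw_init(example_rwlock, example_grp, attr);
	lck_attr_free(attr);
#endif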
823
824/*
825 * Routine: lck_rw_destroy
826 */
827void
828lck_rw_destroy(
829 lck_rw_t *lck,
830 lck_grp_t *grp)
831{
832 if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
833 return;
834#if MACH_LDEBUG
835 lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
836#endif
837 lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
838 lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
839 lck_grp_deallocate(grp);
840 return;
841}
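
/*
 * Illustrative sketch (not part of the original file): typical reader/writer
 * use of the lck_rw_* interface managed above.  Names are hypothetical; note
 * that lck_rw_done() works out from the lock state whether a shared or an
 * exclusive hold is being dropped.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_rw_grp;
static lck_rw_t		*example_rwlock;

static void
example_rw_usage(void)
{
	example_rw_grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_rwlock = lck_rw_alloc_init(example_rw_grp, LCK_ATTR_NULL);

	lck_rw_lock_shared(example_rwlock);	/* many readers may hold this at once */
	/* ... read shared state ... */
	lck_rw_done(example_rwlock);

	lck_rw_lock_exclusive(example_rwlock);	/* single writer */
	/* ... modify shared state ... */
	lck_rw_done(example_rwlock);

	lck_rw_free(example_rwlock, example_rw_grp);
	lck_grp_free(example_rw_grp);
}
#endif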
842
843/*
844 * Sleep locks. These use the same data structure and algorithm
845 * as the spin locks, but the process sleeps while it is waiting
846 * for the lock. These work on uniprocessor systems.
847 */
848
849#define DECREMENTER_TIMEOUT 1000000
850
851#define RW_LOCK_READER_EVENT(x) \
852 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_tag))))
853
854#define RW_LOCK_WRITER_EVENT(x) \
855 ((event_t) (((unsigned char*) (x)) + (offsetof(lck_rw_t, lck_rw_pad8))))
856
857/*
858 * We disable interrupts while holding the RW interlock to prevent an
859 * interrupt from exacerbating hold time.
860 * Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
861 */
862static boolean_t
863lck_interlock_lock(lck_rw_t *lck)
864{
865 boolean_t istate;
866
867 istate = ml_set_interrupts_enabled(FALSE);
868 hw_lock_byte_lock(&lck->lck_rw_interlock);
869
870 return istate;
871}
872
873static void
874lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
875{
876 hw_lock_byte_unlock(&lck->lck_rw_interlock);
877 ml_set_interrupts_enabled(istate);
878}
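
/*
 * Illustrative sketch (not part of the original file): the canonical pattern
 * for the two helpers above -- the interrupt state returned by
 * lck_interlock_lock() must be handed back to lck_interlock_unlock() so the
 * caller's interrupt level is restored.
 */
#if 0	/* example only -- not compiled */
	boolean_t istate;

	istate = lck_interlock_lock(lck);	/* interrupts off, interlock held */
	/* ... examine or adjust the lck_rw_t state fields atomically ... */
	lck_interlock_unlock(lck, istate);	/* interlock dropped, interrupts restored */
#endif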
879
880/*
881 * This inline is used when busy-waiting for an rw lock.
882 * If interrupts were disabled when the lock primitive was called,
883 * we poll the IPI handler for pending tlb flushes.
884 * XXX This is a hack to avoid deadlocking on the pmap_system_lock.
885 */
886static inline void
887lck_rw_lock_pause(boolean_t interrupts_enabled)
888{
889 if (!interrupts_enabled)
890 handle_pending_TLB_flushes();
891 cpu_pause();
892}
893
894
895/*
896 * compute the deadline to spin against when
897 * waiting for a change of state on a lck_rw_t
898 */
899static inline uint64_t
900lck_rw_deadline_for_spin(lck_rw_t *lck)
901{
902 if (lck->lck_rw_can_sleep) {
903 if (lck->lck_r_waiting || lck->lck_w_waiting || lck->lck_rw_shared_count > machine_info.max_cpus) {
904 /*
905 * there are already threads waiting on this lock... this
906 * implies that they have spun beyond their deadlines waiting for
907 * the desired state to show up so we will not bother spinning at this time...
908 * or
909 * the current number of threads sharing this lock exceeds our capacity to run them
910 * concurrently and since all states we're going to spin for require the rw_shared_count
911 * to be at 0, we'll not bother spinning since the latency for this to happen is
912 * unpredictable...
913 */
914 return (mach_absolute_time());
915 }
916 return (mach_absolute_time() + MutexSpin);
917 } else
918 return (mach_absolute_time() + (100000LL * 1000000000LL));
919}
920
921
922/*
923 * Routine: lck_rw_lock_exclusive
924 */
925void
926lck_rw_lock_exclusive_gen(
927 lck_rw_t *lck)
928{
929 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
930 uint64_t deadline = 0;
931 int slept = 0;
932 int gotlock = 0;
933 int lockheld = 0;
934 wait_result_t res = 0;
935 boolean_t istate = -1;
936
937#if CONFIG_DTRACE
938 boolean_t dtrace_ls_initialized = FALSE;
939 boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled = FALSE;
940 uint64_t wait_interval = 0;
941 int readers_at_sleep = 0;
942#endif
943
944 /*
945 * Try to acquire the lck_rw_want_write bit.
946 */
947 while ( !lck_rw_grab_want(lck)) {
948
949#if CONFIG_DTRACE
950 if (dtrace_ls_initialized == FALSE) {
951 dtrace_ls_initialized = TRUE;
952 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
953 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
954 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
955 if (dtrace_ls_enabled) {
956 /*
957 * Either sleeping or spinning is happening,
958 * start a timing of our delay interval now.
959 */
960 readers_at_sleep = lck->lck_rw_shared_count;
961 wait_interval = mach_absolute_time();
962 }
963 }
964#endif
965 if (istate == -1)
966 istate = ml_get_interrupts_enabled();
967
968 deadline = lck_rw_deadline_for_spin(lck);
969
970 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
971
972 while (((gotlock = lck_rw_grab_want(lck)) == 0) && mach_absolute_time() < deadline)
973 lck_rw_lock_pause(istate);
974
975 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, gotlock, 0);
976
977 if (gotlock)
978 break;
979 /*
980 * if we get here, the deadline has expired w/o us
981 * being able to grab the lock exclusively
982 * check to see if we're allowed to do a thread_block
983 */
984 if (lck->lck_rw_can_sleep) {
985
986 istate = lck_interlock_lock(lck);
987
988 if (lck->lck_rw_want_write) {
989
990 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
991
992 lck->lck_w_waiting = TRUE;
993
994 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
995 lck_interlock_unlock(lck, istate);
996
997 if (res == THREAD_WAITING) {
998 res = thread_block(THREAD_CONTINUE_NULL);
999 slept++;
1000 }
1001 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1002 } else {
1003 lck->lck_rw_want_write = TRUE;
1004 lck_interlock_unlock(lck, istate);
1005 break;
1006 }
1007 }
1008 }
1009 /*
1010 * Wait for readers (and upgrades) to finish...
1011 * the test for these conditions must be done simultaneously with
1012 * a check of the interlock not being held since
1013 * the rw_shared_count will drop to 0 first and then want_upgrade
1014 * will be set to 1 in the shared_to_exclusive scenario... those
1015 * adjustments are done behind the interlock and represent an
1016 * atomic change in state and must be considered as such
1017 * however, once we see the read count at 0, the want_upgrade not set
1018 * and the interlock not held, we are safe to proceed
1019 */
1020 while (lck_rw_held_read_or_upgrade(lck)) {
1021
1022#if CONFIG_DTRACE
1023 /*
1024 * Either sleeping or spinning is happening, start
1025 * a timing of our delay interval now. If we set it
1026 * to -1 we don't have accurate data so we cannot later
1027 * decide to record a dtrace spin or sleep event.
1028 */
1029 if (dtrace_ls_initialized == FALSE) {
1030 dtrace_ls_initialized = TRUE;
1031 dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != 0);
1032 dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != 0);
1033 dtrace_ls_enabled = dtrace_rwl_excl_spin || dtrace_rwl_excl_block;
1034 if (dtrace_ls_enabled) {
1035 /*
1036 * Either sleeping or spinning is happening,
1037 * start a timing of our delay interval now.
1038 */
1039 readers_at_sleep = lck->lck_rw_shared_count;
1040 wait_interval = mach_absolute_time();
1041 }
1042 }
1043#endif
1044 if (istate == -1)
1045 istate = ml_get_interrupts_enabled();
1046
1047 deadline = lck_rw_deadline_for_spin(lck);
1048
1049 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1050
1051 while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1052 lck_rw_lock_pause(istate);
1053
1054 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) | DBG_FUNC_END, trace_lck, 0, 0, lockheld, 0);
1055
1056 if ( !lockheld)
1057 break;
1058 /*
1059 * if we get here, the deadline has expired w/o us
1060 * being able to grab the lock exclusively
1061 * check to see if we're allowed to do a thread_block
1062 */
1063 if (lck->lck_rw_can_sleep) {
1064
1065 istate = lck_interlock_lock(lck);
1066
1067 if (lck->lck_rw_shared_count != 0 || lck->lck_rw_want_upgrade) {
1068 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_START, trace_lck, 0, 0, 0, 0);
1069
1070 lck->lck_w_waiting = TRUE;
1071
1072 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1073 lck_interlock_unlock(lck, istate);
1074
1075 if (res == THREAD_WAITING) {
1076 res = thread_block(THREAD_CONTINUE_NULL);
1077 slept++;
1078 }
1079 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) | DBG_FUNC_END, trace_lck, res, slept, 0, 0);
1080 } else {
1081 lck_interlock_unlock(lck, istate);
1082 /*
1083 * must own the lock now, since we checked for
1084 * readers or upgrade owner behind the interlock
1085 * no need for a call to 'lck_rw_held_read_or_upgrade'
1086 */
1087 break;
1088 }
1089 }
1090 }
1091
1092#if CONFIG_DTRACE
1093 /*
1094 * Decide what latencies we suffered that are Dtrace events.
1095 * If we have set wait_interval, then we either spun or slept.
1096 * At least we get out from under the interlock before we record
1097 * which is the best we can do here to minimize the impact
1098 * of the tracing.
1099 * If dtrace was not enabled when we started sleeping/spinning,
1100 * dtrace_ls_enabled is FALSE and we don't record this event.
1101 */
1102 if (dtrace_ls_enabled == TRUE) {
1103 if (slept == 0) {
1104 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1105 mach_absolute_time() - wait_interval, 1);
1106 } else {
1107 /*
1108 * For the blocking case, we also record if when we blocked
1109 * it was held for read or write, and how many readers.
1110 * Notice that above we recorded this before we dropped
1111 * the interlock so the count is accurate.
1112 */
1113 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1114 mach_absolute_time() - wait_interval, 1,
1115 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1116 }
1117 }
1118 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, 1);
1119#endif
1120}
1121
1122
1123/*
1124 * Routine: lck_rw_done_gen
1125 *
1126 * called from the assembly language wrapper...
1127 * prior_lock_state is the value in the 1st
1128 * word of the lock at the time of a successful
1129 * atomic compare and exchange with the new value...
1130 * it represents the state of the lock before we
1131 * decremented the rw_shared_count or cleared either
1132 * rw_want_upgrade or rw_want_write and
1133 * the lck_x_waiting bits... since the wrapper
1134 * routine has already changed the state atomically,
1135 * we just need to decide if we should
1136 * wake up anyone and what value to return... we do
1137 * this by examining the state of the lock before
1138 * we changed it
1139 */
1140lck_rw_type_t
1141lck_rw_done_gen(
1142 lck_rw_t *lck,
1143 int prior_lock_state)
1144{
1145 lck_rw_t *fake_lck;
1146 lck_rw_type_t lock_type;
1147 thread_t thread;
1148 uint32_t rwlock_count;
1149
1150 /*
1151 * prior_lock state is a snapshot of the 1st word of the
1152 * lock in question... we'll fake up a pointer to it
1153 * and carefully not access anything beyond whats defined
1154 * in the first word of a lck_rw_t
1155 */
1156 fake_lck = (lck_rw_t *)&prior_lock_state;
1157
1158 if (fake_lck->lck_rw_shared_count <= 1) {
1159 if (fake_lck->lck_w_waiting)
1160 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1161
1162 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1163 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1164 }
1165 if (fake_lck->lck_rw_shared_count)
1166 lock_type = LCK_RW_TYPE_SHARED;
1167 else
1168 lock_type = LCK_RW_TYPE_EXCLUSIVE;
1169
1170 /* Check if dropping the lock means that we need to unpromote */
1171 thread = current_thread();
1172 rwlock_count = thread->rwlock_count--;
1173#if MACH_LDEBUG
1174 if (rwlock_count == 0) {
1175 panic("rw lock count underflow for thread %p", thread);
1176 }
1177#endif
1178 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1179 /* sched_flags checked without lock, but will be rechecked while clearing */
1180 lck_rw_clear_promotion(thread);
1181 }
1182
1183#if CONFIG_DTRACE
1184 LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
1185#endif
1186
1187 return(lock_type);
1188}
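
/*
 * Illustrative sketch (not part of the original file): the "fake_lck" overlay
 * used above (and in the other *_gen routines below) works because the first
 * 32-bit word of a lck_rw_t holds the bitfields of interest.  Casting the
 * address of the saved snapshot lets those bits be read symbolically instead
 * of with hand-coded masks; the variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	int		snapshot = prior_lock_state;	/* 1st word captured by the fastpath */
	lck_rw_t	*fake = (lck_rw_t *)&snapshot;

	if (fake->lck_w_waiting) {
		/* a writer was queued at the time of the atomic exchange */
	}
#endif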
1189
1190
1191/*
1192 * Routine: lck_rw_unlock
1193 */
1194void
1195lck_rw_unlock(
1196 lck_rw_t *lck,
1197 lck_rw_type_t lck_rw_type)
1198{
1199 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1200 lck_rw_unlock_shared(lck);
1201 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1202 lck_rw_unlock_exclusive(lck);
1203 else
1204 panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1205}
1206
1207
1208/*
1209 * Routine: lck_rw_unlock_shared
1210 */
1211void
1212lck_rw_unlock_shared(
1213 lck_rw_t *lck)
1214{
1215 lck_rw_type_t ret;
1216
1217 ret = lck_rw_done(lck);
1218
1219 if (ret != LCK_RW_TYPE_SHARED)
1220 panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1221}
1222
1223
1224/*
1225 * Routine: lck_rw_unlock_exclusive
1226 */
1227void
1228lck_rw_unlock_exclusive(
1229 lck_rw_t *lck)
1230{
1231 lck_rw_type_t ret;
1232
1233 ret = lck_rw_done(lck);
1234
1235 if (ret != LCK_RW_TYPE_EXCLUSIVE)
1236 panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1237}
1238
1239
1240/*
1241 * Routine: lck_rw_lock
1242 */
1243void
1244lck_rw_lock(
1245 lck_rw_t *lck,
1246 lck_rw_type_t lck_rw_type)
1247{
1248 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1249 lck_rw_lock_shared(lck);
1250 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1251 lck_rw_lock_exclusive(lck);
1252 else
1253 panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1254}
1255
1256
1257/*
1258 * Routine: lck_rw_lock_shared_gen
1259 * Function:
1260 * assembly fast path code has determined that this lock
1261 * is held exclusively... this is where we spin/block
1262 * until we can acquire the lock in the shared mode
1263 */
1264void
1265lck_rw_lock_shared_gen(
1266 lck_rw_t *lck)
1267{
1268 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1269 uint64_t deadline = 0;
1270 int gotlock = 0;
1271 int slept = 0;
1272 wait_result_t res = 0;
1273 boolean_t istate = -1;
1274
1275#if CONFIG_DTRACE
1276 uint64_t wait_interval = 0;
1277 int readers_at_sleep = 0;
1278 boolean_t dtrace_ls_initialized = FALSE;
1279 boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1280#endif
1281
1282 while ( !lck_rw_grab_shared(lck)) {
1283
1284#if CONFIG_DTRACE
1285 if (dtrace_ls_initialized == FALSE) {
1286 dtrace_ls_initialized = TRUE;
1287 dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != 0);
1288 dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != 0);
1289 dtrace_ls_enabled = dtrace_rwl_shared_spin || dtrace_rwl_shared_block;
1290 if (dtrace_ls_enabled) {
1291 /*
1292 * Either sleeping or spinning is happening,
1293 * start a timing of our delay interval now.
1294 */
1295 readers_at_sleep = lck->lck_rw_shared_count;
1296 wait_interval = mach_absolute_time();
1297 }
1298 }
1299#endif
1300 if (istate == -1)
1301 istate = ml_get_interrupts_enabled();
1302
1303 deadline = lck_rw_deadline_for_spin(lck);
1304
1305 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_START,
1306 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1307
1308 while (((gotlock = lck_rw_grab_shared(lck)) == 0) && mach_absolute_time() < deadline)
1309 lck_rw_lock_pause(istate);
1310
1311 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) | DBG_FUNC_END,
1312 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, 0);
1313
1314 if (gotlock)
1315 break;
1316 /*
1317 * if we get here, the deadline has expired w/o us
1318 * being able to grab the lock for read
1319 * check to see if we're allowed to do a thread_block
1320 */
1321 if (lck->lck_rw_can_sleep) {
1322
1323 istate = lck_interlock_lock(lck);
1324
1325 if ((lck->lck_rw_want_write || lck->lck_rw_want_upgrade) &&
1326 ((lck->lck_rw_shared_count == 0) || lck->lck_rw_priv_excl)) {
1327
1328 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_START,
1329 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, 0, 0);
1330
1331 lck->lck_r_waiting = TRUE;
1332
1333 res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
1334 lck_interlock_unlock(lck, istate);
1335
1336 if (res == THREAD_WAITING) {
1337 res = thread_block(THREAD_CONTINUE_NULL);
1338 slept++;
1339 }
1340 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) | DBG_FUNC_END,
1341 trace_lck, res, slept, 0, 0);
1342 } else {
1343 lck->lck_rw_shared_count++;
1344 lck_interlock_unlock(lck, istate);
1345 break;
1346 }
1347 }
1348 }
1349
1350#if CONFIG_DTRACE
1351 if (dtrace_ls_enabled == TRUE) {
1352 if (slept == 0) {
1353 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1354 } else {
1355 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1356 mach_absolute_time() - wait_interval, 0,
1357 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1358 }
1359 }
1360 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, 0);
1361#endif
1362}
1363
1364
1365/*
1366 * Routine: lck_rw_lock_shared_to_exclusive_failure
1367 * Function:
1368 * assembly fast path code has already dropped our read
1369 * count and determined that someone else owns 'lck_rw_want_upgrade'
1370 * if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1371 * all we need to do here is determine if a wakeup is needed
1372 */
1373boolean_t
1374lck_rw_lock_shared_to_exclusive_failure(
1375 lck_rw_t *lck,
1376 int prior_lock_state)
1377{
1378 lck_rw_t *fake_lck;
1379 thread_t thread = current_thread();
1380 uint32_t rwlock_count;
1381
1382 /* Check if dropping the lock means that we need to unpromote */
1383 rwlock_count = thread->rwlock_count--;
1384#if MACH_LDEBUG
1385 if (rwlock_count == 0) {
1386 panic("rw lock count underflow for thread %p", thread);
1387 }
1388#endif
1389 if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1390 /* sched_flags checked without lock, but will be rechecked while clearing */
1391 lck_rw_clear_promotion(thread);
1392 }
1393
1394 /*
1395 * prior_lock state is a snapshot of the 1st word of the
1396 * lock in question... we'll fake up a pointer to it
1397 * and carefully not access anything beyond whats defined
1398 * in the first word of a lck_rw_t
1399 */
1400 fake_lck = (lck_rw_t *)&prior_lock_state;
1401
1402 if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == 1) {
1403 /*
1404 * Someone else has requested upgrade.
1405 * Since we've released the read lock, wake
1406 * him up if he's blocked waiting
1407 */
1408 thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1409 }
1410 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
1411 VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, 0, 0);
1412
1413 return (FALSE);
1414}
1415
1416
1417/*
1418 * Routine: lck_rw_lock_shared_to_exclusive_success
1419 * Function:
1420 * assembly fast path code has already dropped our read
1421 * count and successfully acquired 'lck_rw_want_upgrade'
1422 * we just need to wait for the rest of the readers to drain
1423 * and then we can return as the exclusive holder of this lock
1424 */
1425boolean_t
1426lck_rw_lock_shared_to_exclusive_success(
1427 lck_rw_t *lck)
1428{
1429 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1430 uint64_t deadline = 0;
1431 int slept = 0;
1432 int still_shared = 0;
1433 wait_result_t res;
1434 boolean_t istate = -1;
1435
1436#if CONFIG_DTRACE
1437 uint64_t wait_interval = 0;
1438 int readers_at_sleep = 0;
1439 boolean_t dtrace_ls_initialized = FALSE;
1440 boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1441#endif
1442
1443 while (lck->lck_rw_shared_count != 0) {
1444
1445#if CONFIG_DTRACE
1446 if (dtrace_ls_initialized == FALSE) {
1447 dtrace_ls_initialized = TRUE;
1448 dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != 0);
1449 dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != 0);
1450 dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin || dtrace_rwl_shared_to_excl_block;
1451 if (dtrace_ls_enabled) {
1452 /*
1453 * Either sleeping or spinning is happening,
1454 * start a timing of our delay interval now.
1455 */
1456 readers_at_sleep = lck->lck_rw_shared_count;
1457 wait_interval = mach_absolute_time();
1458 }
1459 }
1460#endif
1461 if (istate == -1)
1462 istate = ml_get_interrupts_enabled();
1463
1464 deadline = lck_rw_deadline_for_spin(lck);
1465
1466 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_START,
1467 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1468
1469 while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1470 lck_rw_lock_pause(istate);
1471
1472 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) | DBG_FUNC_END,
1473 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1474
1475 if ( !still_shared)
1476 break;
1477 /*
1478 * if we get here, the deadline has expired w/o
1479 * the rw_shared_count having drained to 0
1480 * check to see if we're allowed to do a thread_block
1481 */
1482 if (lck->lck_rw_can_sleep) {
1483
1484 istate = lck_interlock_lock(lck);
1485
1486 if (lck->lck_rw_shared_count != 0) {
1487 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_START,
1488 trace_lck, lck->lck_rw_shared_count, 0, 0, 0);
1489
1490 lck->lck_w_waiting = TRUE;
1491
1492 res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
1493 lck_interlock_unlock(lck, istate);
1494
1495 if (res == THREAD_WAITING) {
1496 res = thread_block(THREAD_CONTINUE_NULL);
1497 slept++;
1498 }
1499 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) | DBG_FUNC_END,
1500 trace_lck, res, slept, 0, 0);
1501 } else {
1502 lck_interlock_unlock(lck, istate);
1503 break;
1504 }
1505 }
1506 }
1507#if CONFIG_DTRACE
1508 /*
1509 * Decide whether we spun or slept above and record the matching dtrace event.
1510 */
1511 if (dtrace_ls_enabled == TRUE) {
1512 if (slept == 0) {
1513 LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, 0);
1514 } else {
1515 LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1516 mach_absolute_time() - wait_interval, 1,
1517 (readers_at_sleep == 0 ? 1 : 0), readers_at_sleep);
1518 }
1519 }
1520 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, 1);
1521#endif
1522 return (TRUE);
1523}
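
/*
 * Illustrative sketch (not part of the original file): the caller-visible
 * contract of the upgrade path handled by the two routines above.  When the
 * upgrade fails the shared hold has already been dropped, so the caller must
 * start over.  Names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	lck_rw_lock_shared(example_rwlock);

	if (!lck_rw_lock_shared_to_exclusive(example_rwlock)) {
		/* upgrade failed: the read hold is gone, reacquire exclusively */
		lck_rw_lock_exclusive(example_rwlock);
	}
	/* ... exclusive access either way at this point ... */
	lck_rw_done(example_rwlock);
#endif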
1524
1525
1526/*
1527 * Routine: lck_rw_lock_exclusive_to_shared
1528 * Function:
1529 * assembly fast path has already dropped
1530 * our exclusive state and bumped lck_rw_shared_count
1531 * all we need to do here is determine if anyone
1532 * needs to be awakened.
1533 */
1534void
1535lck_rw_lock_exclusive_to_shared_gen(
1536 lck_rw_t *lck,
1537 int prior_lock_state)
1538{
1539 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
1540 lck_rw_t *fake_lck;
1541
1542 /*
1543 * prior_lock state is a snapshot of the 1st word of the
1544 * lock in question... we'll fake up a pointer to it
1545 * and carefully not access anything beyond whats defined
1546 * in the first word of a lck_rw_t
1547 */
1548 fake_lck = (lck_rw_t *)&prior_lock_state;
1549
1550 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_START,
1551 trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, 0, 0);
1552
1553 /*
1554 * don't wake up anyone waiting to take the lock exclusively
1555 * since we hold a read count... when the read count drops to 0,
1556 * the writers will be woken.
1557 *
1558 * wake up any waiting readers if we don't have any writers waiting,
1559 * or the lock is NOT marked as rw_priv_excl (writers have privilege)
1560 */
1561 if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1562 thread_wakeup(RW_LOCK_READER_EVENT(lck));
1563
1564 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) | DBG_FUNC_END,
1565 trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, 0);
1566
1567#if CONFIG_DTRACE
1568 LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, 0);
1569#endif
1570}
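
/*
 * Illustrative sketch (not part of the original file): exclusive-to-shared
 * downgrade as seen by a caller of the lck_rw_* interface.  Names are
 * hypothetical.
 */
#if 0	/* example only -- not compiled */
	lck_rw_lock_exclusive(example_rwlock);
	/* ... publish an update ... */
	lck_rw_lock_exclusive_to_shared(example_rwlock);	/* other readers may now enter */
	/* ... continue with read-only access ... */
	lck_rw_done(example_rwlock);
#endif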
1571
1572
1573/*
1574 * Routine: lck_rw_try_lock
1575 */
1576boolean_t
1577lck_rw_try_lock(
1578 lck_rw_t *lck,
1579 lck_rw_type_t lck_rw_type)
1580{
1581 if (lck_rw_type == LCK_RW_TYPE_SHARED)
1582 return(lck_rw_try_lock_shared(lck));
1583 else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1584 return(lck_rw_try_lock_exclusive(lck));
1585 else
1586 panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1587 return(FALSE);
1588}
1589
1590
1591void
1592lck_rw_assert(
1593 lck_rw_t *lck,
1594 unsigned int type)
1595{
1596 switch (type) {
1597 case LCK_RW_ASSERT_SHARED:
1598 if (lck->lck_rw_shared_count != 0) {
1599 return;
1600 }
1601 break;
1602 case LCK_RW_ASSERT_EXCLUSIVE:
1603 if ((lck->lck_rw_want_write ||
1604 lck->lck_rw_want_upgrade) &&
1605 lck->lck_rw_shared_count == 0) {
1606 return;
1607 }
1608 break;
1609 case LCK_RW_ASSERT_HELD:
1610 if (lck->lck_rw_want_write ||
1611 lck->lck_rw_want_upgrade ||
1612 lck->lck_rw_shared_count != 0) {
1613 return;
1614 }
1615 break;
1616 case LCK_RW_ASSERT_NOTHELD:
1617 if (!(lck->lck_rw_want_write ||
1618 lck->lck_rw_want_upgrade ||
1619 lck->lck_rw_shared_count != 0)) {
1620 return;
1621 }
1622 break;
1623 default:
1624 break;
1625 }
1626
1627 panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, *(uint32_t *)lck);
1628}
1629
1630/* On return to userspace, this routine is called if the rwlock_count is somehow imbalanced */
1631void
1632lck_rw_clear_promotions_x86(thread_t thread)
1633{
1634#if MACH_LDEBUG
1635 /* It's fatal to leave a RW lock locked and return to userspace */
1636 panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1637#else
1638 /* Paper over the issue */
1639 thread->rwlock_count = 0;
1640 lck_rw_clear_promotion(thread);
1641#endif
1642}
1643
1644
1645/*
1646 * Routine: kdp_lck_rw_lock_is_acquired_exclusive
1647 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1648 */
1649boolean_t
1650kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1651 if (not_in_kdp) {
1652 panic("rw lock exclusive check done outside of kernel debugger");
1653 }
1654 return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
1655}
1656
1657
1658#ifdef MUTEX_ZONE
1659extern zone_t lck_mtx_zone;
1660#endif
1661/*
1662 * Routine: lck_mtx_alloc_init
1663 */
1664lck_mtx_t *
1665lck_mtx_alloc_init(
1666 lck_grp_t *grp,
1667 lck_attr_t *attr)
1668{
1669 lck_mtx_t *lck;
1670#ifdef MUTEX_ZONE
1671 if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != 0)
1672 lck_mtx_init(lck, grp, attr);
1673#else
1674 if ((lck = (lck_mtx_t *)kalloc(sizeof(lck_mtx_t))) != 0)
1675 lck_mtx_init(lck, grp, attr);
1676#endif
1677 return(lck);
1678}
1679
1680/*
1681 * Routine: lck_mtx_free
1682 */
1683void
1684lck_mtx_free(
1685 lck_mtx_t *lck,
1686 lck_grp_t *grp)
1687{
1688 lck_mtx_destroy(lck, grp);
1689#ifdef MUTEX_ZONE
1690 zfree(lck_mtx_zone, lck);
1691#else
1692 kfree(lck, sizeof(lck_mtx_t));
1693#endif
1694}
1695
1696/*
1697 * Routine: lck_mtx_ext_init
1698 */
1699static void
1700lck_mtx_ext_init(
1701 lck_mtx_ext_t *lck,
1702 lck_grp_t *grp,
1703 lck_attr_t *attr)
1704{
1705 bzero((void *)lck, sizeof(lck_mtx_ext_t));
1706
1707 if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1708 lck->lck_mtx_deb.type = MUTEX_TAG;
1709 lck->lck_mtx_attr |= LCK_MTX_ATTR_DEBUG;
1710 }
1711
1712 lck->lck_mtx_grp = grp;
1713
1714 if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
1715 lck->lck_mtx_attr |= LCK_MTX_ATTR_STAT;
1716
1717 lck->lck_mtx.lck_mtx_is_ext = 1;
1718 lck->lck_mtx.lck_mtx_pad32 = 0xFFFFFFFF;
1719}
1720
1721/*
1722 * Routine: lck_mtx_init
1723 */
1724void
1725lck_mtx_init(
1726 lck_mtx_t *lck,
1727 lck_grp_t *grp,
1728 lck_attr_t *attr)
1729{
1730 lck_mtx_ext_t *lck_ext;
1731 lck_attr_t *lck_attr;
1732
1733 if (attr != LCK_ATTR_NULL)
1734 lck_attr = attr;
1735 else
1736 lck_attr = &LockDefaultLckAttr;
1737
1738 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1739 if ((lck_ext = (lck_mtx_ext_t *)kalloc(sizeof(lck_mtx_ext_t))) != 0) {
1740 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1741 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1742 lck->lck_mtx_ptr = lck_ext;
1743 }
1744 } else {
1745 lck->lck_mtx_owner = 0;
1746 lck->lck_mtx_state = 0;
1747 }
1748 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1749 lck_grp_reference(grp);
1750 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1751}
1752
1753/*
1754 * Routine: lck_mtx_init_ext
1755 */
1756void
1757lck_mtx_init_ext(
1758 lck_mtx_t *lck,
1759 lck_mtx_ext_t *lck_ext,
1760 lck_grp_t *grp,
1761 lck_attr_t *attr)
1762{
1763 lck_attr_t *lck_attr;
1764
1765 if (attr != LCK_ATTR_NULL)
1766 lck_attr = attr;
1767 else
1768 lck_attr = &LockDefaultLckAttr;
1769
1770 if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
1771 lck_mtx_ext_init(lck_ext, grp, lck_attr);
1772 lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
1773 lck->lck_mtx_ptr = lck_ext;
1774 } else {
1775 lck->lck_mtx_owner = 0;
1776 lck->lck_mtx_state = 0;
1777 }
1778 lck->lck_mtx_pad32 = 0xFFFFFFFF;
1779
1780 lck_grp_reference(grp);
1781 lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
1782}
1783
1784/*
1785 * Routine: lck_mtx_destroy
1786 */
1787void
1788lck_mtx_destroy(
1789 lck_mtx_t *lck,
1790 lck_grp_t *grp)
1791{
1792 boolean_t lck_is_indirect;
1793
1794 if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
1795 return;
1796#if MACH_LDEBUG
1797 lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
1798#endif
1799 lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
1800
1801 lck_mtx_lock_mark_destroyed(lck);
1802
1803 if (lck_is_indirect)
1804 kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
1805 lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
1806 lck_grp_deallocate(grp);
1807 return;
1808}
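
/*
 * Illustrative sketch (not part of the original file): typical blocking-mutex
 * use of the lck_mtx_* interface managed above.  Names are hypothetical.
 */
#if 0	/* example only -- not compiled */
static lck_grp_t	*example_mtx_grp;
static lck_mtx_t	*example_mtx;

static void
example_mtx_usage(void)
{
	example_mtx_grp = lck_grp_alloc_init("example", LCK_GRP_ATTR_NULL);
	example_mtx = lck_mtx_alloc_init(example_mtx_grp, LCK_ATTR_NULL);

	lck_mtx_lock(example_mtx);	/* may block; the holder may be priority-promoted */
	/* ... critical section that is itself allowed to block ... */
	lck_mtx_unlock(example_mtx);

	lck_mtx_free(example_mtx, example_mtx_grp);
	lck_grp_free(example_mtx_grp);
}
#endif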
1809
1810
1811#define LCK_MTX_LCK_WAIT_CODE 0x20
1812#define LCK_MTX_LCK_WAKEUP_CODE 0x21
1813#define LCK_MTX_LCK_SPIN_CODE 0x22
1814#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
1815#define LCK_MTX_LCK_DEMOTE_CODE 0x24
1816
1817
1818/*
1819 * Routine: lck_mtx_unlock_wakeup_x86
1820 *
1821 * Invoked on unlock when there is
1822 * contention (i.e. the assembly routine sees
1823 * that mutex->lck_mtx_waiters != 0 or
1824 * that mutex->lck_mtx_promoted != 0...
1825 *
1826 * neither the mutex or interlock is held
1827 */
1828void
1829lck_mtx_unlock_wakeup_x86 (
1830 lck_mtx_t *mutex,
1831 int prior_lock_state)
1832{
1833 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1834 lck_mtx_t fake_lck;
1835
1836 /*
1837 * prior_lock state is a snapshot of the 2nd word of the
1838 * lock in question... we'll fake up a lock with the bits
1839 * copied into place and carefully not access anything
1840 * beyond whats defined in the second word of a lck_mtx_t
1841 */
1842 fake_lck.lck_mtx_state = prior_lock_state;
1843
1844 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
1845 trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
1846
1847 if (__probable(fake_lck.lck_mtx_waiters)) {
1848 if (fake_lck.lck_mtx_waiters > 1)
1849 thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
1850 else
1851 thread_wakeup_one(LCK_MTX_EVENT(mutex));
1852 }
1853
1854 if (__improbable(fake_lck.lck_mtx_promoted)) {
1855 thread_t thread = current_thread();
1856
1857
1858 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
1859 thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
1860
1861 if (thread->promotions > 0) {
1862 spl_t s = splsched();
1863
1864 thread_lock(thread);
1865
1866 if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
1867
1868 thread->sched_flags &= ~TH_SFLAG_PROMOTED;
1869
1870 if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
1871 /* Thread still has a RW lock promotion */
1872 } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
1873 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1874 thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
1875
1876 set_sched_pri(thread, DEPRESSPRI);
1877 }
1878 else {
1879 if (thread->base_pri < thread->sched_pri) {
1880 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
1881 thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
1882
1883 thread_recompute_sched_pri(thread, FALSE);
1884 }
1885 }
1886 }
1887 thread_unlock(thread);
1888 splx(s);
1889 }
1890 }
1891 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
1892 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1893}
1894
1895
1896/*
1897 * Routine: lck_mtx_lock_acquire_x86
1898 *
1899 * Invoked on acquiring the mutex when there is
1900 * contention (i.e. the assembly routine sees
1901 * that mutex->lck_mtx_waiters != 0 or
1902 * thread->was_promoted_on_wakeup != 0)...
1903 *
1904 * mutex is owned... interlock is held... preemption is disabled
1905 */
1906void
1907lck_mtx_lock_acquire_x86(
1908 lck_mtx_t *mutex)
1909{
1910 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1911 thread_t thread;
1912 integer_t priority;
1913 spl_t s;
1914
1915 thread = (thread_t)mutex->lck_mtx_owner; /* faster than current_thread() */
1916
1917 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
1918 trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
1919
1920 if (mutex->lck_mtx_waiters)
1921 priority = mutex->lck_mtx_pri;
1922 else
1923 priority = 0;
1924
1925 if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
1926
1927 KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
1928 thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
1929
1930 s = splsched();
1931 thread_lock(thread);
1932
1933 if (thread->sched_pri < priority) {
1934 /* Do not promote past promotion ceiling */
1935 assert(priority <= MAXPRI_PROMOTE);
1936 set_sched_pri(thread, priority);
1937 }
1938 if (mutex->lck_mtx_promoted == 0) {
1939 mutex->lck_mtx_promoted = 1;
1940
1941 thread->promotions++;
1942 thread->sched_flags |= TH_SFLAG_PROMOTED;
1943 }
1944 thread->was_promoted_on_wakeup = 0;
1945
1946 thread_unlock(thread);
1947 splx(s);
1948 }
1949 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_END,
1950 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
1951}
1952
1953
1954static int
1955lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
1956{
1957 int retval;
1958
1959 *istate = ml_set_interrupts_enabled(FALSE);
1960 retval = lck_mtx_ilk_try_lock(mutex);
1961
1962 if (retval == 0)
1963 ml_set_interrupts_enabled(*istate);
1964
1965 return retval;
1966}
1967
1968static void
1969lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
1970{
1971 lck_mtx_ilk_unlock(mutex);
1972 ml_set_interrupts_enabled(istate);
1973}
1974
1975
1976/*
1977 * Routine: lck_mtx_lock_spinwait_x86
1978 *
1979 * Invoked trying to acquire a mutex when there is contention but
1980 * the holder is running on another processor. We spin for up to a maximum
1981 * time waiting for the lock to be released.
1982 *
1983 * Called with the interlock unlocked.
1984 * returns 0 if mutex acquired
1985 * returns 1 if we spun
1986 * returns 2 if we didn't spin due to the holder not running
1987 */
1988int
1989lck_mtx_lock_spinwait_x86(
1990 lck_mtx_t *mutex)
1991{
1992 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
1993 thread_t holder;
1994 uint64_t overall_deadline;
1995 uint64_t check_owner_deadline;
1996 uint64_t cur_time;
1997 int retval = 1;
1998 int loopcount = 0;
1999
2000 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
2001 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, 0, 0);
2002
2003 cur_time = mach_absolute_time();
2004 overall_deadline = cur_time + MutexSpin;
2005 check_owner_deadline = cur_time;
2006
2007 /*
2008 * Spin while:
2009 * - mutex is locked, and
2010 * - its locked as a spin lock, and
2011 * - owner is running on another processor, and
2012 * - owner (processor) is not idling, and
2013 * - we haven't spun for long enough.
2014 */
2015 do {
2016 if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
2017 retval = 0;
2018 break;
2019 }
2020 cur_time = mach_absolute_time();
2021
2022 if (cur_time >= overall_deadline)
2023 break;
2024
2025 if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
2026 boolean_t istate;
2027
2028 if (lck_mtx_interlock_try_lock(mutex, &istate)) {
2029
2030 if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
2031
2032 if ( !(holder->machine.specFlags & OnProc) ||
2033 (holder->state & TH_IDLE)) {
2034
2035 lck_mtx_interlock_unlock(mutex, istate);
2036
2037 if (loopcount == 0)
2038 retval = 2;
2039 break;
2040 }
2041 }
2042 lck_mtx_interlock_unlock(mutex, istate);
2043
2044 check_owner_deadline = cur_time + (MutexSpin / 4);
2045 }
2046 }
2047 cpu_pause();
2048
2049 loopcount++;
2050
2051 } while (TRUE);
2052
2053#if CONFIG_DTRACE
2054 /*
2055 * We've already kept a count via overall_deadline of how long we spun.
2056 * If dtrace is active, then we compute backwards to decide how
2057 * long we spun.
2058 *
2059 * Note that we record a different probe id depending on whether
2060 * this is a direct or indirect mutex. This allows us to
2061 * penalize only lock groups that have debug/stats enabled
2062 * with dtrace processing if desired.
2063 */
2064 if (__probable(mutex->lck_mtx_is_ext == 0)) {
2065 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
2066 mach_absolute_time() - (overall_deadline - MutexSpin));
2067 } else {
2068 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
2069 mach_absolute_time() - (overall_deadline - MutexSpin));
2070 }
2071 /* The lockstat acquire event is recorded by the assembly code beneath us. */
2072#endif
2073
2074 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_END,
2075 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, 0);
2076
2077 return retval;
2078}
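
/*
 * Illustrative sketch (not part of the original file): how a caller consumes
 * the return value documented above.  This is a sketch of the pattern, not
 * the actual assembly fastpath.
 */
#if 0	/* example only -- not compiled */
	int spin_result = lck_mtx_lock_spinwait_x86(mutex);

	if (spin_result != 0) {
		/*
		 * 1: we spun for the full interval without getting the mutex;
		 * 2: we gave up early because the holder is not running.
		 * Either way the fastpath now takes the interlock and calls
		 * lck_mtx_lock_wait_x86() to block until the mutex is free.
		 */
	}
#endif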
2079
2080
2081
2082/*
2083 * Routine: lck_mtx_lock_wait_x86
2084 *
2085 * Invoked in order to wait on contention.
2086 *
2087 * Called with the interlock locked and
2088 * preemption disabled...
2089 * returns it unlocked and with preemption enabled
2090 */
2091void
2092lck_mtx_lock_wait_x86 (
2093 lck_mtx_t *mutex)
2094{
2095 __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
2096 thread_t self = current_thread();
2097 thread_t holder;
2098 integer_t priority;
2099 spl_t s;
2100#if CONFIG_DTRACE
2101 uint64_t sleep_start = 0;
2102
2103 if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
2104 sleep_start = mach_absolute_time();
2105 }
2106#endif
2107 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
2108 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2109
2110 priority = self->sched_pri;
2111
2112 if (priority < self->base_pri)
2113 priority = self->base_pri;
2114 if (priority < BASEPRI_DEFAULT)
2115 priority = BASEPRI_DEFAULT;
2116
2117 /* Do not promote past promotion ceiling */
2118 priority = MIN(priority, MAXPRI_PROMOTE);
2119
2120 if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
2121 mutex->lck_mtx_pri = priority;
2122 mutex->lck_mtx_waiters++;
2123
2124 if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
2125 holder->sched_pri < mutex->lck_mtx_pri ) {
2126 s = splsched();
2127 thread_lock(holder);
2128
2129 /* holder priority may have been bumped by another thread
2130 * before thread_lock was taken
2131 */
2132 if (holder->sched_pri < mutex->lck_mtx_pri) {
2133 KERNEL_DEBUG_CONSTANT(
2134 MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
2135 holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
2136 /* Assert that we're not altering the priority of a
2137 * thread above the MAXPRI_PROMOTE band
2138 */
2139 assert(holder->sched_pri < MAXPRI_PROMOTE);
2140 set_sched_pri(holder, priority);
2141
2142 if (mutex->lck_mtx_promoted == 0) {
2143 holder->promotions++;
2144 holder->sched_flags |= TH_SFLAG_PROMOTED;
2145
2146 mutex->lck_mtx_promoted = 1;
2147 }
2148 }
2149 thread_unlock(holder);
2150 splx(s);
2151 }
2152 assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
2153
2154 lck_mtx_ilk_unlock(mutex);
2155
2156 thread_block(THREAD_CONTINUE_NULL);
2157
2158 KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
2159 trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
2160
2161#if CONFIG_DTRACE
2162 /*
2163 * Record the Dtrace lockstat probe for blocking, block time
2164 * measured from when we were entered.
2165 */
2166 if (sleep_start) {
2167 if (mutex->lck_mtx_is_ext == 0) {
2168 LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
2169 mach_absolute_time() - sleep_start);
2170 } else {
2171 LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
2172 mach_absolute_time() - sleep_start);
2173 }
2174 }
2175#endif
2176}
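
/*
 * Illustrative note (not part of the original file): the priority pushed onto
 * the owner by lck_mtx_lock_wait_x86() above is
 *
 *	MIN(MAX(self->sched_pri, self->base_pri, BASEPRI_DEFAULT), MAXPRI_PROMOTE)
 *
 * For example, a waiter currently running depressed at priority 11 with a
 * base_pri of 31 promotes a lower-priority holder to 31, and no waiter can
 * ever push a holder past MAXPRI_PROMOTE.
 */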
2177
2178/*
2179 * Routine: kdp_lck_mtx_lock_spin_is_acquired
2180 * NOT SAFE: To be used only by kernel debugger to avoid deadlock.
2181 * Returns: TRUE if lock is acquired.
2182 */
2183boolean_t
2184kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
2185{
2186 if (not_in_kdp) {
2187 panic("kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
2188 }
2189
2190 if (lck->lck_mtx_ilocked || lck->lck_mtx_mlocked) {
2191 return TRUE;
2192 }
2193
2194 return FALSE;
2195}
2196