]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/kern/waitq.c
xnu-6153.61.1.tar.gz
[apple/xnu.git] / osfmk / kern / waitq.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 * un-comment the following lines to debug the link/prepost tables
59 * NOTE: this expands each element by ~40 bytes
60 */
61//#define KEEP_WAITQ_LINK_STATS
62//#define KEEP_WAITQ_PREPOST_STATS
63
64#include <kern/ast.h>
65#include <kern/backtrace.h>
66#include <kern/kern_types.h>
67#include <kern/ltable.h>
68#include <kern/mach_param.h>
69#include <kern/queue.h>
70#include <kern/sched_prim.h>
71#include <kern/simple_lock.h>
72#include <kern/spl.h>
73#include <kern/waitq.h>
74#include <kern/zalloc.h>
75#include <kern/policy_internal.h>
76#include <kern/turnstile.h>
77
78#include <os/hash.h>
79#include <libkern/OSAtomic.h>
80#include <mach/sync_policy.h>
81#include <vm/vm_kern.h>
82
83#include <sys/kdebug.h>
84
85#if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)
86# if !CONFIG_LTABLE_STATS
87# error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"
88# endif
89# if !CONFIG_WAITQ_STATS
90# error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"
91# endif
92#endif
93
94#if CONFIG_WAITQ_DEBUG
95#define wqdbg(fmt, ...) \
96 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
97#else
98#define wqdbg(fmt, ...) do { } while (0)
99#endif
100
101#ifdef WAITQ_VERBOSE_DEBUG
102#define wqdbg_v(fmt, ...) \
103 printf("WQ[v:%s]: " fmt "\n", __func__, ## __VA_ARGS__)
104#else
105#define wqdbg_v(fmt, ...) do { } while (0)
106#endif
107
108#define wqinfo(fmt, ...) \
109 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
110
111#define wqerr(fmt, ...) \
112 printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
113
114/*
115 * file-static functions / data
116 */
117static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
118 uint64_t *reserved_preposts,
119 int priority, spl_t *spl);
120
121static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
122 event64_t event,
123 thread_t thread, spl_t *spl);
124
125#define WAITQ_SET_MAX (task_max * 3)
126static zone_t waitq_set_zone;
127
128
129#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
130#define ROUNDDOWN(x, y) (((x)/(y))*(y))
131
132
133#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
134static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
135#endif
136
137lck_grp_t waitq_lck_grp;
138
139#if __arm64__
140
141#define waitq_lock_to(wq, to) \
142 (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to, &waitq_lck_grp))
143
144#define waitq_lock_unlock(wq) \
145 (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
146
147#define waitq_lock_init(wq) \
148 (wq->waitq_interlock = 0)
149
150#else
151
152#define waitq_lock_to(wq, to) \
153 (hw_lock_to(&(wq)->waitq_interlock, to, &waitq_lck_grp))
154
155#define waitq_lock_unlock(wq) \
156 (hw_lock_unlock(&(wq)->waitq_interlock))
157
158#define waitq_lock_init(wq) \
159 (hw_lock_init(&(wq)->waitq_interlock))
160
161#endif /* __arm64__ */
162
163/*
164 * Prepost callback function for specially marked waitq sets
165 * (prepost alternative)
166 */
167extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
168
169#define DEFAULT_MIN_FREE_TABLE_ELEM 100
170static uint32_t g_min_free_table_elem;
171static uint32_t g_min_free_cache;
172
173
174/* ----------------------------------------------------------------------
175 *
176 * SetID Link Table Implementation
177 *
178 * ---------------------------------------------------------------------- */
179static struct link_table g_wqlinktable;
180
181enum wq_link_type {
182 WQL_ALL = -1,
183 WQL_FREE = LT_FREE,
184 WQL_WQS = LT_ELEM,
185 WQL_LINK = LT_LINK,
186};
187
188struct waitq_link {
189 struct lt_elem wqte;
190
191 union {
192 /* wqt_type == WQL_WQS (LT_ELEM) */
193 struct {
194 struct waitq_set *wql_set;
195 /* uint64_t sl_prepost_id; */
196 } wql_wqs;
197
198 /* wqt_type == WQL_LINK (LT_LINK) */
199 struct {
200 uint64_t left_setid;
201 uint64_t right_setid;
202 } wql_link;
203 };
204#ifdef KEEP_WAITQ_LINK_STATS
205 thread_t sl_alloc_th;
206 task_t sl_alloc_task;
207 uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
208 uint64_t sl_alloc_ts;
209 uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
210 uint64_t sl_invalidate_ts;
211 uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
212 uint64_t sl_mkvalid_ts;
213 uint64_t sl_free_ts;
214#endif
215};
216#if !defined(KEEP_WAITQ_LINK_STATS)
217static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,
218 "waitq_link struct must be a power of two!");
219#endif
220
221#define wql_refcnt(link) \
222 (lt_bits_refcnt((link)->wqte.lt_bits))
223
224#define wql_type(link) \
225 (lt_bits_type((link)->wqte.lt_bits))
226
227#define wql_mkvalid(link) \
228 do { \
229 lt_elem_mkvalid(&(link)->wqte); \
230 wql_do_mkvalid_stats(&(link)->wqte); \
231 } while (0)
232
233#define wql_is_valid(link) \
234 lt_bits_valid((link)->wqte.lt_bits)
235
236#define wql_setid wqte.lt_id
237
238#define WQL_WQS_POISON ((void *)(0xf00df00d))
239#define WQL_LINK_POISON (0x0bad0badffffffffull)
240
241static void
242wql_poison(struct link_table *table, struct lt_elem *elem)
243{
244 struct waitq_link *link = (struct waitq_link *)elem;
245 (void)table;
246
247 switch (wql_type(link)) {
248 case WQL_WQS:
249 link->wql_wqs.wql_set = WQL_WQS_POISON;
250 break;
251 case WQL_LINK:
252 link->wql_link.left_setid = WQL_LINK_POISON;
253 link->wql_link.right_setid = WQL_LINK_POISON;
254 break;
255 default:
256 break;
257 }
258#ifdef KEEP_WAITQ_LINK_STATS
259 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
260 link->sl_alloc_ts = 0;
261 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
262 link->sl_mkvalid_ts = 0;
263
264 link->sl_alloc_th = THREAD_NULL;
265 /* leave the sl_alloc_task in place for debugging */
266
267 link->sl_free_ts = mach_absolute_time();
268#endif
269}
270
271#ifdef KEEP_WAITQ_LINK_STATS
272static __inline__ void
273wql_do_alloc_stats(struct lt_elem *elem)
274{
275 if (elem) {
276 struct waitq_link *link = (struct waitq_link *)elem;
277 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
278 waitq_grab_backtrace(link->sl_alloc_bt, 0);
279 link->sl_alloc_th = current_thread();
280 link->sl_alloc_task = current_task();
281
282 assert(link->sl_alloc_ts == 0);
283 link->sl_alloc_ts = mach_absolute_time();
284
285 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
286 link->sl_invalidate_ts = 0;
287 }
288}
289
290static __inline__ void
291wql_do_invalidate_stats(struct lt_elem *elem)
292{
293 struct waitq_link *link = (struct waitq_link *)elem;
294
295 if (!elem) {
296 return;
297 }
298
299 assert(link->sl_mkvalid_ts > 0);
300
301 memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
302 link->sl_invalidate_ts = mach_absolute_time();
303 waitq_grab_backtrace(link->sl_invalidate_bt, 0);
304}
305
306static __inline__ void
307wql_do_mkvalid_stats(struct lt_elem *elem)
308{
309 struct waitq_link *link = (struct waitq_link *)elem;
310
311 if (!elem) {
312 return;
313 }
314
315 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
316 link->sl_mkvalid_ts = mach_absolute_time();
317 waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
318}
319#else
320#define wql_do_alloc_stats(e)
321#define wql_do_invalidate_stats(e)
322#define wql_do_mkvalid_stats(e)
323#endif /* KEEP_WAITQ_LINK_STATS */
324
325static void
326wql_init(void)
327{
328 uint32_t tablesz = 0, max_links = 0;
329
330 if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
331 tablesz = (uint32_t)g_lt_max_tbl_size;
332 }
333
334 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
335 max_links = tablesz / sizeof(struct waitq_link);
336 assert(max_links > 0 && tablesz > 0);
337
338 /* we have a restricted index range */
339 if (max_links > (LT_IDX_MAX + 1)) {
340 max_links = LT_IDX_MAX + 1;
341 }
342
343 wqinfo("init linktable with max:%d elements (%d bytes)",
344 max_links, tablesz);
345 ltable_init(&g_wqlinktable, "wqslab.wql", max_links,
346 sizeof(struct waitq_link), wql_poison);
347}
348
349static void
350wql_ensure_free_space(void)
351{
352 if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {
353 /*
354 * we don't hold locks on these values, so check for underflow
355 */
356 if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {
357 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
358 g_wqlinktable.nelem, g_wqlinktable.used_elem,
359 g_min_free_table_elem);
360 ltable_grow(&g_wqlinktable, g_min_free_table_elem);
361 }
362 }
363}
364
365static struct waitq_link *
366wql_alloc_link(int type)
367{
368 struct lt_elem *elem;
369
370 elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);
371 wql_do_alloc_stats(elem);
372 return (struct waitq_link *)elem;
373}
374
/*
 * Re-type an already-held link element in place via the link table's
 * realloc path. When link stats are enabled, the old allocation record
 * is scrubbed and a fresh one is taken for the new incarnation.
 */
static void
wql_realloc_link(struct waitq_link *link, int type)
{
	ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);
#ifdef KEEP_WAITQ_LINK_STATS
	/* wipe stale alloc stats so wql_do_alloc_stats' assert holds */
	memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
	link->sl_alloc_ts = 0;
	wql_do_alloc_stats(&link->wqte);

	memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
	link->sl_invalidate_ts = 0;
#endif
}
388
/*
 * Mark a link element invalid (its table slot remains referenced until
 * all outstanding references are put), then record invalidation stats.
 */
static void
wql_invalidate(struct waitq_link *link)
{
	lt_elem_invalidate(&link->wqte);
	wql_do_invalidate_stats(&link->wqte);
}
395
396static struct waitq_link *
397wql_get_link(uint64_t setid)
398{
399 struct lt_elem *elem;
400
401 elem = ltable_get_elem(&g_wqlinktable, setid);
402 return (struct waitq_link *)elem;
403}
404
405static void
406wql_put_link(struct waitq_link *link)
407{
408 if (!link) {
409 return;
410 }
411 ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);
412}
413
414static struct waitq_link *
415wql_get_reserved(uint64_t setid, int type)
416{
417 struct lt_elem *elem;
418
419 elem = lt_elem_list_first(&g_wqlinktable, setid);
420 if (!elem) {
421 return NULL;
422 }
423 ltable_realloc_elem(&g_wqlinktable, elem, type);
424 return (struct waitq_link *)elem;
425}
426
427
428static inline int waitq_maybe_remove_link(struct waitq *waitq,
429 uint64_t setid,
430 struct waitq_link *parent,
431 struct waitq_link *left,
432 struct waitq_link *right);
433
434enum {
435 LINK_WALK_ONE_LEVEL = 0,
436 LINK_WALK_FULL_DAG = 1,
437 LINK_WALK_FULL_DAG_UNLOCKED = 2,
438};
439
440typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,
441 struct waitq_link *link);
442
/**
 * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'
 *
 * Conditions:
 *	waitq is locked (or NULL)
 *	'setid' is managed by 'waitq'
 *	this could be direct (waitq->waitq_set_id == setid)
 *	OR indirect (setid is the left/right ID in a LINK chain,
 *	whose root is waitq->waitq_set_id)
 *
 * Notes:
 *	This function uses recursion to walk the set of table elements
 *	pointed to by 'setid'. For each element encountered, 'cb' will be
 *	called. If non-zero, the return value of this callback function can
 *	early-out of the table walk.
 *
 *	For each link element encountered, the function takes a reference to
 *	it. The reference is dropped only after the callback and any recursion
 *	has completed.
 *
 *	The assumed table/link/tree structure:
 *	                  'setid'
 *	                  /    \
 *	                 /      \
 *	             L(LINK)     R(LINK)
 *	              /\             /\
 *	             /  \           /  \
 *	            /    \       Rl(*) Rr(*)
 *	         Ll(*)  Lr(*)      /\    /\
 *	          /\      /\    ... ... ... ...
 *	       ... ... ... ...
 *	                    \
 *	                    WQS(wqset_q.waitq_setid == Sx)
 *	                    (this waitq set is a member of set 'Sx')
 *
 *	                    'Sx'
 *	                   /    \
 *	                  /      \
 *	              L(LINK)     R(LINK)
 *	               /\             /\
 *	             ... ...        ... ...
 *
 *	The basic algorithm is as follows:
 *	*) take a reference to the table object pointed to by 'setid'
 *	*) if appropriate, call 'cb' (potentially early-out on non-zero return)
 *	*) if the link object points to a waitq set, and the walk type
 *	   is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
 *	   the associated waitq set object and recursively walk all sets to
 *	   which that set belongs. This is a DFS of the tree structure.
 *	*) recurse down the left side of the tree (following the
 *	   'left_setid' pointer in the link object
 *	*) recurse down the right side of the tree (following the
 *	   'right_setid' pointer in the link object
 */
static __attribute__((noinline))
int
walk_waitq_links(int walk_type, struct waitq *waitq,
    uint64_t setid, int link_type,
    void *ctx, wql_callback_func cb)
{
	struct waitq_link *link;
	uint64_t nextid;
	int wqltype;

	link = wql_get_link(setid);

	/* invalid link: nothing reachable from here, keep iterating */
	if (!link) {
		return WQ_ITERATE_CONTINUE;
	}

	/*
	 * Capture the left/right child IDs now (LINK elements only);
	 * after the callback runs, 'link' may no longer be valid.
	 */
	setid = nextid = 0;
	wqltype = wql_type(link);
	if (wqltype == WQL_LINK) {
		setid = link->wql_link.left_setid;
		nextid = link->wql_link.right_setid;
	}

	/*
	 * Make the callback only on specified link_type (or all links)
	 * Note that after the callback, the link object may be
	 * invalid. The only valid thing we can do is put our
	 * reference to it (which may put it back on the free list)
	 */
	if (link_type == WQL_ALL || link_type == wqltype) {
		/* allow the callback to early-out */
		int ret = cb(waitq, ctx, link);
		if (ret != WQ_ITERATE_CONTINUE) {
			wql_put_link(link);
			return ret;
		}
	}

	if (wqltype == WQL_WQS &&
	    (walk_type == LINK_WALK_FULL_DAG ||
	    walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
		/*
		 * Recurse down any sets to which this wait queue set was
		 * added. We do this just before we put our reference to
		 * the link object (which may free it).
		 */
		struct waitq_set *wqset = link->wql_wqs.wql_set;
		int ret = WQ_ITERATE_CONTINUE;
		int should_unlock = 0;
		uint64_t wqset_setid = 0;

		/* only the locked (FULL_DAG) walk takes the set lock */
		if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
			assert(!waitq_irq_safe(&wqset->wqset_q));
			waitq_set_lock(wqset);
			should_unlock = 1;
		}

		/*
		 * verify the linked waitq set as it could have been
		 * invalidated before we grabbed the lock!
		 */
		if (wqset->wqset_id != link->wql_setid.id) {
			/* This is the bottom of the tree: just get out */
			if (should_unlock) {
				waitq_set_unlock(wqset);
			}
			wql_put_link(link);
			return WQ_ITERATE_CONTINUE;
		}

		wqset_setid = wqset->wqset_q.waitq_set_id;

		if (wqset_setid > 0) {
			ret = walk_waitq_links(walk_type, &wqset->wqset_q,
			    wqset_setid, link_type, ctx, cb);
		}
		if (should_unlock) {
			waitq_set_unlock(wqset);
		}
		if (ret != WQ_ITERATE_CONTINUE) {
			wql_put_link(link);
			return ret;
		}
	}

	wql_put_link(link);

	/* recurse down left side of the tree */
	if (setid) {
		int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);
		if (ret != WQ_ITERATE_CONTINUE) {
			return ret;
		}
	}

	/* recurse down right side of the tree */
	if (nextid) {
		return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);
	}

	return WQ_ITERATE_CONTINUE;
}
600
601/* ----------------------------------------------------------------------
602 *
603 * Prepost Link Table Implementation
604 *
605 * ---------------------------------------------------------------------- */
606static struct link_table g_prepost_table;
607
608enum wq_prepost_type {
609 WQP_FREE = LT_FREE,
610 WQP_WQ = LT_ELEM,
611 WQP_POST = LT_LINK,
612};
613
614struct wq_prepost {
615 struct lt_elem wqte;
616
617 union {
618 /* wqt_type == WQP_WQ (LT_ELEM) */
619 struct {
620 struct waitq *wqp_wq_ptr;
621 } wqp_wq;
622 /* wqt_type == WQP_POST (LT_LINK) */
623 struct {
624 uint64_t wqp_next_id;
625 uint64_t wqp_wq_id;
626 } wqp_post;
627 };
628#ifdef KEEP_WAITQ_PREPOST_STATS
629 thread_t wqp_alloc_th;
630 task_t wqp_alloc_task;
631 uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
632#endif
633};
634#if !defined(KEEP_WAITQ_PREPOST_STATS)
635static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
636 "wq_prepost struct must be a power of two!");
637#endif
638
639#define wqp_refcnt(wqp) \
640 (lt_bits_refcnt((wqp)->wqte.lt_bits))
641
642#define wqp_type(wqp) \
643 (lt_bits_type((wqp)->wqte.lt_bits))
644
645#define wqp_set_valid(wqp) \
646 lt_elem_mkvalid(&(wqp)->wqte)
647
648#define wqp_is_valid(wqp) \
649 lt_bits_valid((wqp)->wqte.lt_bits)
650
651#define wqp_prepostid wqte.lt_id
652
653#define WQP_WQ_POISON (0x0bad0badffffffffull)
654#define WQP_POST_POISON (0xf00df00df00df00d)
655
656static void
657wqp_poison(struct link_table *table, struct lt_elem *elem)
658{
659 struct wq_prepost *wqp = (struct wq_prepost *)elem;
660 (void)table;
661
662 switch (wqp_type(wqp)) {
663 case WQP_WQ:
664 break;
665 case WQP_POST:
666 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
667 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
668 break;
669 default:
670 break;
671 }
672}
673
674#ifdef KEEP_WAITQ_PREPOST_STATS
675static __inline__ void
676wqp_do_alloc_stats(struct lt_elem *elem)
677{
678 if (!elem) {
679 return;
680 }
681
682 struct wq_prepost *wqp = (struct wq_prepost *)elem;
683 uintptr_t alloc_bt[sizeof(wqp->wqp_alloc_bt)];
684
685 waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);
686
687 /* be sure the take stats for _all_ allocated objects */
688 for (;;) {
689 memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));
690 wqp->wqp_alloc_th = current_thread();
691 wqp->wqp_alloc_task = current_task();
692 wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);
693 if (!wqp) {
694 break;
695 }
696 }
697}
698#else
699#define wqp_do_alloc_stats(e)
#endif /* KEEP_WAITQ_PREPOST_STATS */
701
702static void
703wqp_init(void)
704{
705 uint32_t tablesz = 0, max_wqp = 0;
706
707 if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
708 tablesz = (uint32_t)g_lt_max_tbl_size;
709 }
710
711 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
712 max_wqp = tablesz / sizeof(struct wq_prepost);
713 assert(max_wqp > 0 && tablesz > 0);
714
715 /* we have a restricted index range */
716 if (max_wqp > (LT_IDX_MAX + 1)) {
717 max_wqp = LT_IDX_MAX + 1;
718 }
719
720 wqinfo("init prepost table with max:%d elements (%d bytes)",
721 max_wqp, tablesz);
722 ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,
723 sizeof(struct wq_prepost), wqp_poison);
724}
725
/*
 * Refill the per-CPU cache of reserved prepost elements with 'nalloc'
 * new elements.
 *
 * The table allocation may block, so it is performed with preemption
 * enabled; preemption is then disabled only to splice the new list
 * onto the current CPU's cache.
 */
static void
wq_prepost_refill_cpu_cache(uint32_t nalloc)
{
	struct lt_elem *new_head, *old_head;
	struct wqp_cache *cache;

	/* require preemption enabled to allocate elements */
	if (get_preemption_level() != 0) {
		return;
	}

	new_head = ltable_alloc_elem(&g_prepost_table,
	    LT_RESERVED, nalloc, 1);
	if (new_head == NULL) {
		return;
	}

	disable_preemption();
	cache = &PROCESSOR_DATA(current_processor(), wqp_cache);

	/*
	 * check once more before putting these elements on the list:
	 * another thread may have refilled this CPU's cache while we
	 * were allocating with preemption enabled
	 */
	if (cache->avail >= WQP_CACHE_MAX) {
		lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);
		enable_preemption();
		return;
	}

	cache->avail += nalloc;
	/* an empty cache is marked by head == 0 or LT_IDX_MAX */
	if (cache->head == 0 || cache->head == LT_IDX_MAX) {
		cache->head = new_head->lt_id.id;
		goto out;
	}

	/* splice the new elements in front of the existing cache list */
	old_head = lt_elem_list_first(&g_prepost_table, cache->head);
	(void)lt_elem_list_link(&g_prepost_table, new_head, old_head);
	cache->head = new_head->lt_id.id;

out:
	enable_preemption();
	return;
}
770
/*
 * Ensure there are enough free prepost elements for upcoming
 * reservations: top up the current CPU's reserved-element cache and,
 * if the global table is running low, force it to grow.
 */
static void
wq_prepost_ensure_free_space(void)
{
	uint32_t free_elem;
	uint32_t min_free;
	struct wqp_cache *cache;

	/* lazily computed: WQP_CACHE_MAX elements per CPU */
	if (g_min_free_cache == 0) {
		g_min_free_cache = (WQP_CACHE_MAX * ml_get_max_cpus());
	}

	/*
	 * Ensure that we always have a pool of per-CPU prepost elements
	 */
	disable_preemption();
	cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
	free_elem = cache->avail;
	enable_preemption();

	/* refill once the cache drops below 1/3 capacity */
	if (free_elem < (WQP_CACHE_MAX / 3)) {
		wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
	}

	/*
	 * Now ensure that we have a sufficient amount of free table space
	 */
	free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
	min_free = g_min_free_table_elem + g_min_free_cache;
	if (free_elem < min_free) {
		/*
		 * we don't hold locks on these values, so check for underflow
		 */
		if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
			wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
			    g_prepost_table.nelem, g_prepost_table.used_elem,
			    g_min_free_table_elem, g_min_free_cache);
			ltable_grow(&g_prepost_table, min_free);
		}
	}
}
811
/*
 * Allocate 'nelem' prepost elements of the given type.
 *
 * LT_RESERVED allocations are satisfied from the current CPU's cache
 * when it holds enough elements; all other requests (and cache misses)
 * fall back to the global prepost table. Returns NULL on failure or
 * when nelem == 0 for a reserved request.
 */
static struct wq_prepost *
wq_prepost_alloc(int type, int nelem)
{
	struct lt_elem *elem;
	struct wq_prepost *wqp;
	struct wqp_cache *cache;

	if (type != LT_RESERVED) {
		goto do_alloc;
	}
	if (nelem == 0) {
		return NULL;
	}

	/*
	 * First try to grab the elements from the per-CPU cache if we are
	 * allocating RESERVED elements
	 */
	disable_preemption();
	cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
	if (nelem <= (int)cache->avail) {
		struct lt_elem *first, *next = NULL;
		int nalloc = nelem;

		cache->avail -= nelem;

		/* grab the first element */
		first = lt_elem_list_first(&g_prepost_table, cache->head);

		/* find the last element and re-adjust the cache head */
		for (elem = first; elem != NULL && nalloc > 0; elem = next) {
			next = lt_elem_list_next(&g_prepost_table, elem);
			if (--nalloc == 0) {
				/* terminate the allocated list */
				elem->lt_next_idx = LT_IDX_MAX;
				break;
			}
		}
		assert(nalloc == 0);
		/* 'next' now heads what remains of the cache list */
		if (!next) {
			cache->head = LT_IDX_MAX;
		} else {
			cache->head = next->lt_id.id;
		}
		/* assert that we don't have mis-matched book keeping */
		assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));
		enable_preemption();
		elem = first;
		goto out;
	}
	enable_preemption();

do_alloc:
	/* fall-back to standard table allocation */
	elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);
	if (!elem) {
		return NULL;
	}

out:
	wqp = (struct wq_prepost *)elem;
	wqp_do_alloc_stats(elem);
	return wqp;
}
876
/*
 * Mark a prepost element invalid; its table slot is reclaimed once
 * all outstanding references have been put.
 */
static void
wq_prepost_invalidate(struct wq_prepost *wqp)
{
	lt_elem_invalidate(&wqp->wqte);
}
882
883static struct wq_prepost *
884wq_prepost_get(uint64_t wqp_id)
885{
886 struct lt_elem *elem;
887
888 elem = ltable_get_elem(&g_prepost_table, wqp_id);
889 return (struct wq_prepost *)elem;
890}
891
892static void
893wq_prepost_put(struct wq_prepost *wqp)
894{
895 ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);
896}
897
898static int
899wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
900{
901 return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
902}
903
/*
 * Get a referenced pointer to the element reverse-linked after 'head'
 * (i.e. the WQP_POST element whose wqp_next_id points back at 'head').
 *
 * Returns NULL if there is no such element, or if the element was
 * recycled/invalidated between the unreferenced peek and the
 * referenced lookup. The caller must put the returned reference.
 */
static struct wq_prepost *
wq_prepost_get_rnext(struct wq_prepost *head)
{
	struct lt_elem *elem;
	struct wq_prepost *wqp;
	uint64_t id;

	/* unreferenced peek at the rlink successor */
	elem = lt_elem_list_next(&g_prepost_table, &head->wqte);
	if (!elem) {
		return NULL;
	}
	id = elem->lt_id.id;
	/* now take a real reference via the table */
	elem = ltable_get_elem(&g_prepost_table, id);

	if (!elem) {
		return NULL;
	}
	wqp = (struct wq_prepost *)elem;
	/*
	 * re-validate: the slot may have been recycled before we got our
	 * reference, so it must still have the same ID, still be a POST
	 * element, and still point back at 'head'
	 */
	if (elem->lt_id.id != id ||
	    wqp_type(wqp) != WQP_POST ||
	    wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
		ltable_put_elem(&g_prepost_table, elem);
		return NULL;
	}

	return wqp;
}
931
/*
 * Break the reverse (lt_next_idx) link out of 'wqp', detaching it from
 * any rlink list it is on.
 */
static void
wq_prepost_reset_rnext(struct wq_prepost *wqp)
{
	(void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);
}
937
938
/**
 * remove 'wqp' from the prepost list on 'wqset'
 *
 * Conditions:
 *	wqset is locked
 *	caller holds a reference on wqp (and is responsible to release it)
 *
 * Result:
 *	wqp is invalidated, wqset is potentially updated with a new
 *	prepost ID, and the next element of the prepost list may be
 *	consumed as well (if the list contained only 2 objects)
 *
 * Returns:
 *	non-zero if the set still has preposts after the removal,
 *	0 if the removal left the set with no posted preposts
 */
static int
wq_prepost_remove(struct waitq_set *wqset,
    struct wq_prepost *wqp)
{
	int more_posts = 1;
	uint64_t next_id = wqp->wqp_post.wqp_next_id;
	uint64_t wqp_id = wqp->wqp_prepostid.id;
	struct wq_prepost *prev_wqp, *next_wqp;

	assert(wqp_type(wqp) == WQP_POST);
	assert(wqset->wqset_q.waitq_prepost == 1);

	if (next_id == wqp_id) {
		/* the list is singular and becoming empty */
		wqset->wqset_prepost_id = 0;
		more_posts = 0;
		goto out;
	}

	/* find our predecessor via the reverse link */
	prev_wqp = wq_prepost_get_rnext(wqp);
	assert(prev_wqp != NULL);
	assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
	assert(prev_wqp->wqp_prepostid.id != wqp_id);
	assert(wqp_type(prev_wqp) == WQP_POST);

	if (prev_wqp->wqp_prepostid.id == next_id) {
		/*
		 * There are two items in the list, and we're removing one. We
		 * only need to keep the WQP_WQ pointer from 'prev_wqp'
		 */
		wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
		wq_prepost_invalidate(prev_wqp);
		wq_prepost_put(prev_wqp);
		more_posts = 0;
		goto out;
	}

	/* prev->next = next */
	prev_wqp->wqp_post.wqp_next_id = next_id;

	/* next->prev = prev */
	next_wqp = wq_prepost_get(next_id);
	assert(next_wqp != NULL);
	assert(next_wqp != wqp);
	assert(next_wqp != prev_wqp);
	assert(wqp_type(next_wqp) == WQP_POST);

	wq_prepost_reset_rnext(next_wqp);
	wq_prepost_rlink(next_wqp, prev_wqp);

	/* If we remove the head of the list, update the wqset */
	if (wqp_id == wqset->wqset_prepost_id) {
		wqset->wqset_prepost_id = next_id;
	}

	wq_prepost_put(prev_wqp);
	wq_prepost_put(next_wqp);

out:
	/* detach and invalidate the removed element itself */
	wq_prepost_reset_rnext(wqp);
	wq_prepost_invalidate(wqp);
	return more_posts;
}
1014
1015static struct wq_prepost *
1016wq_prepost_rfirst(uint64_t id)
1017{
1018 struct lt_elem *elem;
1019 elem = lt_elem_list_first(&g_prepost_table, id);
1020 wqp_do_alloc_stats(elem);
1021 return (struct wq_prepost *)(void *)elem;
1022}
1023
1024static struct wq_prepost *
1025wq_prepost_rpop(uint64_t *id, int type)
1026{
1027 struct lt_elem *elem;
1028 elem = lt_elem_list_pop(&g_prepost_table, id, type);
1029 wqp_do_alloc_stats(elem);
1030 return (struct wq_prepost *)(void *)elem;
1031}
1032
/*
 * Release a list of reserved prepost elements headed by 'wqp'.
 *
 * The elements are preferentially parked on the current CPU's cache;
 * if that cache is already full they are returned to the global table.
 * Accepts NULL for caller convenience.
 */
static void
wq_prepost_release_rlist(struct wq_prepost *wqp)
{
	int nelem = 0;
	struct wqp_cache *cache;
	struct lt_elem *elem;

	if (!wqp) {
		return;
	}

	elem = &wqp->wqte;

	/*
	 * These are reserved elements: release them back to the per-cpu pool
	 * if our cache is running low.
	 */
	disable_preemption();
	cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
	if (cache->avail < WQP_CACHE_MAX) {
		struct lt_elem *tmp = NULL;
		/* splice the released list in front of the cache list */
		if (cache->head != LT_IDX_MAX) {
			tmp = lt_elem_list_first(&g_prepost_table, cache->head);
		}
		nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);
		cache->head = elem->lt_id.id;
		cache->avail += nelem;
		enable_preemption();
		return;
	}
	enable_preemption();

	/* release these elements back to the main table */
	nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);

#if CONFIG_WAITQ_STATS
	g_prepost_table.nreserved_releases += 1;
	OSDecrementAtomic64(&g_prepost_table.nreservations);
#endif
}
1073
1074typedef int (*wqp_callback_func)(struct waitq_set *wqset,
1075 void *ctx,
1076 struct wq_prepost *wqp,
1077 struct waitq *waitq);
1078
1079/**
1080 * iterate over a chain of preposts associated with a waitq set.
1081 *
1082 * Conditions:
1083 * wqset is locked
1084 *
1085 * Notes:
1086 * This loop performs automatic prepost chain management / culling, and
1087 * may reset or adjust the waitq set's prepost ID pointer. If you don't
1088 * want this extra processing, you can use wq_prepost_iterate().
1089 */
1090static int
1091wq_prepost_foreach_locked(struct waitq_set *wqset,
1092 void *ctx, wqp_callback_func cb)
1093{
1094 int ret = WQ_ITERATE_SUCCESS;
1095 struct wq_prepost *wqp, *tmp_wqp;
1096
1097 assert(cb != NULL);
1098
1099 if (!wqset || !waitq_set_maybe_preposted(wqset)) {
1100 return WQ_ITERATE_SUCCESS;
1101 }
1102
1103restart:
1104 wqp = wq_prepost_get(wqset->wqset_prepost_id);
1105 if (!wqp) {
1106 /*
1107 * The prepost object is no longer valid, reset the waitq
1108 * set's prepost id.
1109 */
1110 wqset->wqset_prepost_id = 0;
1111 return WQ_ITERATE_SUCCESS;
1112 }
1113
1114 if (wqp_type(wqp) == WQP_WQ) {
1115 uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;
1116
1117 ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1118
1119 switch (ret) {
1120 case WQ_ITERATE_INVALIDATE_CONTINUE:
1121 /* the caller wants to remove the only prepost here */
1122 assert(wqp_id == wqset->wqset_prepost_id);
1123 wqset->wqset_prepost_id = 0;
1124 /* fall through */
1125 case WQ_ITERATE_CONTINUE:
1126 wq_prepost_put(wqp);
1127 ret = WQ_ITERATE_SUCCESS;
1128 break;
1129 case WQ_ITERATE_RESTART:
1130 wq_prepost_put(wqp);
1131 /* fall through */
1132 case WQ_ITERATE_DROPPED:
1133 goto restart;
1134 default:
1135 wq_prepost_put(wqp);
1136 break;
1137 }
1138 return ret;
1139 }
1140
1141 assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
1142 assert(wqp_type(wqp) == WQP_POST);
1143
1144 /*
1145 * At this point we know we have a list of POST objects.
1146 * Grab a handle to the last element in the list and start
1147 * the iteration.
1148 */
1149 tmp_wqp = wq_prepost_get_rnext(wqp);
1150 assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);
1151
1152 uint64_t last_id = tmp_wqp->wqp_prepostid.id;
1153 wq_prepost_put(tmp_wqp);
1154
1155 ret = WQ_ITERATE_SUCCESS;
1156 for (;;) {
1157 uint64_t wqp_id, first_id, next_id;
1158
1159 wqp_id = wqp->wqp_prepostid.id;
1160 first_id = wqset->wqset_prepost_id;
1161 next_id = wqp->wqp_post.wqp_next_id;
1162
1163 /* grab the WQP_WQ object this _POST points to */
1164 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1165 if (!tmp_wqp) {
1166 /*
1167 * This WQP_POST object points to an invalid
1168 * WQP_WQ object - remove the POST object from
1169 * the list.
1170 */
1171 if (wq_prepost_remove(wqset, wqp) == 0) {
1172 wq_prepost_put(wqp);
1173 goto restart;
1174 }
1175 goto next_prepost;
1176 }
1177 assert(wqp_type(tmp_wqp) == WQP_WQ);
1178 /*
1179 * make the callback: note that this could remove 'wqp' or
1180 * drop the lock on our waitq set. We need to re-validate
1181 * our state when this function returns.
1182 */
1183 ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);
1184 wq_prepost_put(tmp_wqp);
1185
1186 switch (ret) {
1187 case WQ_ITERATE_CONTINUE:
1188 /* continue iteration */
1189 break;
1190 case WQ_ITERATE_INVALIDATE_CONTINUE:
1191 assert(next_id == wqp->wqp_post.wqp_next_id);
1192 if (wq_prepost_remove(wqset, wqp) == 0) {
1193 wq_prepost_put(wqp);
1194 goto restart;
1195 }
1196 goto next_prepost;
1197 case WQ_ITERATE_RESTART:
1198 wq_prepost_put(wqp);
1199 /* fall-through */
1200 case WQ_ITERATE_DROPPED:
1201 /* the callback dropped the ref to wqp: just restart */
1202 goto restart;
1203 default:
1204 /* break out of the iteration for some other reason */
1205 goto finish_prepost_foreach;
1206 }
1207
1208 /*
1209 * the set lock may have been dropped during callback,
1210 * if something looks different, restart the prepost iteration
1211 */
1212 if (!wqp_is_valid(wqp) ||
1213 (wqp->wqp_post.wqp_next_id != next_id) ||
1214 wqset->wqset_prepost_id != first_id) {
1215 wq_prepost_put(wqp);
1216 goto restart;
1217 }
1218
1219next_prepost:
1220 /* this was the last object in the list */
1221 if (wqp_id == last_id) {
1222 break;
1223 }
1224
1225 /* get the next object */
1226 tmp_wqp = wq_prepost_get(next_id);
1227 if (!tmp_wqp) {
1228 /*
1229 * At this point we've already checked our state
1230 * after the callback (which may have dropped the set
1231 * lock). If we find an invalid member of the list
1232 * then something is wrong.
1233 */
1234 panic("Invalid WQP_POST member 0x%llx in waitq set "
1235 "0x%llx prepost list (first:%llx, "
1236 "wqp:%p)",
1237 next_id, wqset->wqset_id, first_id, wqp);
1238 }
1239 wq_prepost_put(wqp);
1240 wqp = tmp_wqp;
1241
1242 assert(wqp_type(wqp) == WQP_POST);
1243 }
1244
1245finish_prepost_foreach:
1246 wq_prepost_put(wqp);
1247 if (ret == WQ_ITERATE_CONTINUE) {
1248 ret = WQ_ITERATE_SUCCESS;
1249 }
1250
1251 return ret;
1252}
1253
1254/**
1255 * Perform a simple loop over a chain of prepost objects
1256 *
1257 * Conditions:
1258 * If 'prepost_id' is associated with a waitq (set) then that object must
1259 * be locked before calling this function.
1260 * Callback function, 'cb', must be able to handle a NULL wqset pointer
1261 * and a NULL waitq pointer!
1262 *
1263 * Notes:
1264 * This prepost chain iteration will _not_ automatically adjust any chain
1265 * element or linkage. This is the responsibility of the caller! If you
1266 * want automatic prepost chain management (at a cost of extra CPU time),
1267 * you can use: wq_prepost_foreach_locked().
1268 */
1269static int
1270wq_prepost_iterate(uint64_t prepost_id,
1271 void *ctx, wqp_callback_func cb)
1272{
1273 int ret;
1274 struct wq_prepost *wqp;
1275
1276 if (!prepost_id) {
1277 return WQ_ITERATE_SUCCESS;
1278 }
1279
1280 wqp = wq_prepost_get(prepost_id);
1281 if (!wqp) {
1282 return WQ_ITERATE_SUCCESS;
1283 }
1284
1285 if (wqp_type(wqp) == WQP_WQ) {
1286 ret = WQ_ITERATE_SUCCESS;
1287 if (cb) {
1288 ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1289 }
1290
1291 if (ret != WQ_ITERATE_DROPPED) {
1292 wq_prepost_put(wqp);
1293 }
1294 return ret;
1295 }
1296
1297 assert(wqp->wqp_prepostid.id == prepost_id);
1298 assert(wqp_type(wqp) == WQP_POST);
1299
1300 /* at this point we know we have a list of POST objects */
1301 uint64_t next_id;
1302
1303 ret = WQ_ITERATE_CONTINUE;
1304 do {
1305 struct wq_prepost *tmp_wqp;
1306 struct waitq *wq = NULL;
1307
1308 next_id = wqp->wqp_post.wqp_next_id;
1309
1310 /* grab the WQP_WQ object this _POST points to */
1311 tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1312 if (tmp_wqp) {
1313 assert(wqp_type(tmp_wqp) == WQP_WQ);
1314 wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
1315 }
1316
1317 if (cb) {
1318 ret = cb(NULL, ctx, wqp, wq);
1319 }
1320 if (tmp_wqp) {
1321 wq_prepost_put(tmp_wqp);
1322 }
1323
1324 if (ret != WQ_ITERATE_CONTINUE) {
1325 break;
1326 }
1327
1328 tmp_wqp = wq_prepost_get(next_id);
1329 if (!tmp_wqp) {
1330 /*
1331 * the chain is broken: nothing we can do here besides
1332 * bail from the iteration.
1333 */
1334 ret = WQ_ITERATE_ABORTED;
1335 break;
1336 }
1337
1338 wq_prepost_put(wqp);
1339 wqp = tmp_wqp;
1340
1341 assert(wqp_type(wqp) == WQP_POST);
1342 } while (next_id != prepost_id);
1343
1344 if (ret != WQ_ITERATE_DROPPED) {
1345 wq_prepost_put(wqp);
1346 }
1347
1348 if (ret == WQ_ITERATE_CONTINUE) {
1349 ret = WQ_ITERATE_SUCCESS;
1350 }
1351 return ret;
1352}
1353
1354
/* context for wq_is_preposted_on_set_cb(), driven by wq_prepost_foreach_locked() */
struct _is_posted_ctx {
	struct waitq *posting_wq; /* the waitq whose prepost we're looking for */
	int did_prepost;          /* out: set to 1 if posting_wq was found */
};
1359
1360static int
1361wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
1362 struct wq_prepost *wqp, struct waitq *waitq)
1363{
1364 struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
1365
1366 (void)wqset;
1367 (void)wqp;
1368
1369 /*
1370 * Don't early-out, run through the _entire_ list:
1371 * This ensures that we retain a minimum number of invalid elements.
1372 */
1373 if (pctx->posting_wq == waitq) {
1374 pctx->did_prepost = 1;
1375 }
1376
1377 return WQ_ITERATE_CONTINUE;
1378}
1379
1380
1381/**
1382 * checks if 'waitq' has already preposted on 'wqset'
1383 *
1384 * Parameters:
1385 * waitq The waitq that's preposting
1386 * wqset The set onto which waitq may be preposted
1387 *
1388 * Conditions:
1389 * both waitq and wqset are locked
1390 *
1391 * Returns non-zero if 'waitq' has already preposted to 'wqset'
1392 */
1393static int
1394wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
1395{
1396 int ret;
1397 struct _is_posted_ctx pctx;
1398
1399 /*
1400 * If the set's only prepost matches the waitq's prepost ID,
1401 * then it obviously already preposted to the set.
1402 */
1403 if (waitq->waitq_prepost_id != 0 &&
1404 wqset->wqset_prepost_id == waitq->waitq_prepost_id) {
1405 return 1;
1406 }
1407
1408 /* use full prepost iteration: always trim the list */
1409 pctx.posting_wq = waitq;
1410 pctx.did_prepost = 0;
1411 ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
1412 wq_is_preposted_on_set_cb);
1413 return pctx.did_prepost;
1414}
1415
1416static struct wq_prepost *
1417wq_get_prepost_obj(uint64_t *reserved, int type)
1418{
1419 struct wq_prepost *wqp = NULL;
1420 /*
1421 * don't fail just because the caller doesn't have enough
1422 * reservations, we've kept a low-water mark on the prepost table,
1423 * so there should be some available for us.
1424 */
1425 if (reserved && *reserved) {
1426 wqp = wq_prepost_rpop(reserved, type);
1427 assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);
1428 } else {
1429 /*
1430 * TODO: if in interrupt context, grab from a special
1431 * region / reserved list!
1432 */
1433 wqp = wq_prepost_alloc(type, 1);
1434 }
1435
1436 if (wqp == NULL) {
1437 panic("Couldn't allocate prepost object!");
1438 }
1439 return wqp;
1440}
1441
1442
1443/**
1444 * prepost a waitq onto a waitq set
1445 *
1446 * Parameters:
1447 * wqset The set onto which waitq will be preposted
1448 * waitq The waitq that's preposting
1449 * reserved List (lt_elem_list_ style) of pre-allocated prepost elements
1450 * Could be NULL
1451 *
1452 * Conditions:
1453 * both wqset and waitq are locked
1454 *
1455 * Notes:
1456 * If reserved is NULL, this may block on prepost table growth.
1457 */
1458static void
1459wq_prepost_do_post_locked(struct waitq_set *wqset,
1460 struct waitq *waitq,
1461 uint64_t *reserved)
1462{
1463 struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;
1464
1465 assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));
1466
1467 /*
1468 * nothing to do if it's already preposted:
1469 * note that this also culls any invalid prepost objects
1470 */
1471 if (wq_is_preposted_on_set(waitq, wqset)) {
1472 return;
1473 }
1474
1475 assert(waitqs_is_linked(wqset));
1476
1477 /*
1478 * This function is called because an event is being posted to 'waitq'.
1479 * We need a prepost object associated with this queue. Allocate one
1480 * now if the waitq isn't already associated with one.
1481 */
1482 if (waitq->waitq_prepost_id == 0) {
1483 struct wq_prepost *wqp;
1484 wqp = wq_get_prepost_obj(reserved, WQP_WQ);
1485 wqp->wqp_wq.wqp_wq_ptr = waitq;
1486 wqp_set_valid(wqp);
1487 waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
1488 wq_prepost_put(wqp);
1489 }
1490
1491#if CONFIG_LTABLE_STATS
1492 g_prepost_table.npreposts += 1;
1493#endif
1494
1495 wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
1496 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
1497 waitq->waitq_prepost_id, wqset->wqset_id);
1498
1499 if (wqset->wqset_prepost_id == 0) {
1500 /* the set has no previous preposts */
1501 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1502 return;
1503 }
1504
1505 wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
1506 if (!wqp_head) {
1507 /* the previous prepost has become invalid */
1508 wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1509 return;
1510 }
1511
1512 assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);
1513
1514 /*
1515 * If we get here, we're going to need at least one new wq_prepost
1516 * object. If the previous wqset_prepost_id points to a WQP_WQ, we
1517 * actually need to allocate 2 wq_prepost objects because the WQP_WQ
1518 * is tied to the waitq and shared across all sets.
1519 */
1520 wqp_post = wq_get_prepost_obj(reserved, WQP_POST);
1521
1522 wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
1523 wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
1524 waitq->waitq_prepost_id);
1525
1526 if (wqp_type(wqp_head) == WQP_WQ) {
1527 /*
1528 * We must replace the wqset_prepost_id with a pointer
1529 * to two new WQP_POST objects
1530 */
1531 uint64_t wqp_id = wqp_head->wqp_prepostid.id;
1532 wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
1533 "replacing with two POST preposts",
1534 wqset->wqset_id, wqp_id);
1535
1536 /* drop the old reference */
1537 wq_prepost_put(wqp_head);
1538
1539 /* grab another new object (the 2nd of two) */
1540 wqp_head = wq_get_prepost_obj(reserved, WQP_POST);
1541
1542 /* point this one to the original WQP_WQ object */
1543 wqp_head->wqp_post.wqp_wq_id = wqp_id;
1544 wqdbg_v("POST 0x%llx :: WQ 0x%llx",
1545 wqp_head->wqp_prepostid.id, wqp_id);
1546
1547 /* link it to the new wqp_post object allocated earlier */
1548 wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1549 /* make the list a double-linked and circular */
1550 wq_prepost_rlink(wqp_head, wqp_post);
1551
1552 /*
1553 * Finish setting up the new prepost: point it back to the
1554 * POST object we allocated to replace the original wqset
1555 * WQ prepost object
1556 */
1557 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1558 wq_prepost_rlink(wqp_post, wqp_head);
1559
1560 /* mark objects valid, and reset the wqset prepost list head */
1561 wqp_set_valid(wqp_head);
1562 wqp_set_valid(wqp_post);
1563 wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;
1564
1565 /* release both references */
1566 wq_prepost_put(wqp_head);
1567 wq_prepost_put(wqp_post);
1568
1569 wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
1570 wqset->wqset_id, wqset->wqset_prepost_id,
1571 wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
1572 wqp_post->wqp_prepostid.id,
1573 wqp_post->wqp_post.wqp_next_id);
1574 return;
1575 }
1576
1577 assert(wqp_type(wqp_head) == WQP_POST);
1578
1579 /*
1580 * Add the new prepost to the end of the prepost list
1581 */
1582 wqp_tail = wq_prepost_get_rnext(wqp_head);
1583 assert(wqp_tail != NULL);
1584 assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);
1585
1586 /*
1587 * link the head to the new tail
1588 * NOTE: this needs to happen first in case wqp_tail == wqp_head
1589 */
1590 wq_prepost_reset_rnext(wqp_head);
1591 wq_prepost_rlink(wqp_head, wqp_post);
1592
1593 /* point the new object to the list head, and list tail */
1594 wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1595 wq_prepost_rlink(wqp_post, wqp_tail);
1596
1597 /* point the last item in the waitq set's list to the new object */
1598 wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1599
1600 wqp_set_valid(wqp_post);
1601
1602 wq_prepost_put(wqp_head);
1603 wq_prepost_put(wqp_tail);
1604 wq_prepost_put(wqp_post);
1605
1606 wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
1607 "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
1608 wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
1609 wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);
1610
1611 return;
1612}
1613
1614
1615/* ----------------------------------------------------------------------
1616 *
1617 * Stats collection / reporting
1618 *
1619 * ---------------------------------------------------------------------- */
1620#if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS
1621static void
1622wq_table_stats(struct link_table *table, struct wq_table_stats *stats)
1623{
1624 stats->version = WAITQ_STATS_VERSION;
1625 stats->table_elements = table->nelem;
1626 stats->table_used_elems = table->used_elem;
1627 stats->table_elem_sz = table->elem_sz;
1628 stats->table_slabs = table->nslabs;
1629 stats->table_slab_sz = table->slab_sz;
1630
1631 stats->table_num_allocs = table->nallocs;
1632 stats->table_num_preposts = table->npreposts;
1633 stats->table_num_reservations = table->nreservations;
1634
1635 stats->table_max_used = table->max_used;
1636 stats->table_avg_used = table->avg_used;
1637 stats->table_max_reservations = table->max_reservations;
1638 stats->table_avg_reservations = table->avg_reservations;
1639}
1640
1641void
1642waitq_link_stats(struct wq_table_stats *stats)
1643{
1644 if (!stats) {
1645 return;
1646 }
1647 wq_table_stats(&g_wqlinktable, stats);
1648}
1649
1650void
1651waitq_prepost_stats(struct wq_table_stats *stats)
1652{
1653 wq_table_stats(&g_prepost_table, stats);
1654}
1655#endif
1656
1657
1658/* ----------------------------------------------------------------------
1659 *
1660 * Global Wait Queues
1661 *
1662 * ---------------------------------------------------------------------- */
1663
1664static struct waitq g_boot_waitq;
1665static struct waitq *global_waitqs = &g_boot_waitq;
1666static uint32_t g_num_waitqs = 1;
1667
1668/*
1669 * Zero out the used MSBs of the event.
1670 */
1671#define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
1672
1673static __inline__ uint32_t
1674waitq_hash(char *key, size_t length)
1675{
1676 uint32_t hash = os_hash_jenkins(key, length);
1677
1678 hash &= (g_num_waitqs - 1);
1679 return hash;
1680}
1681
1682/* return a global waitq pointer corresponding to the given event */
1683struct waitq *
1684_global_eventq(char *event, size_t event_length)
1685{
1686 return &global_waitqs[waitq_hash(event, event_length)];
1687}
1688
1689/* return an indexed global waitq pointer */
1690struct waitq *
1691global_waitq(int index)
1692{
1693 return &global_waitqs[index % g_num_waitqs];
1694}
1695
1696
1697#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
1698/* this global is for lldb */
1699const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
1700
1701static __inline__ void
1702waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
1703{
1704 uintptr_t buf[NWAITQ_BTFRAMES + skip];
1705 if (skip < 0) {
1706 skip = 0;
1707 }
1708 memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
1709 backtrace(buf, g_nwaitq_btframes + skip, NULL);
1710 memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
1711}
1712#else /* no stats */
1713#define waitq_grab_backtrace(...)
1714#endif
1715
1716#if CONFIG_WAITQ_STATS
1717
1718struct wq_stats g_boot_stats;
1719struct wq_stats *g_waitq_stats = &g_boot_stats;
1720
1721static __inline__ struct wq_stats *
1722waitq_global_stats(struct waitq *waitq)
1723{
1724 struct wq_stats *wqs;
1725 uint32_t idx;
1726
1727 if (!waitq_is_global(waitq)) {
1728 return NULL;
1729 }
1730
1731 idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
1732 assert(idx < g_num_waitqs);
1733 wqs = &g_waitq_stats[idx];
1734 return wqs;
1735}
1736
1737static __inline__ void
1738waitq_stats_count_wait(struct waitq *waitq)
1739{
1740 struct wq_stats *wqs = waitq_global_stats(waitq);
1741 if (wqs != NULL) {
1742 wqs->waits++;
1743 waitq_grab_backtrace(wqs->last_wait, 2);
1744 }
1745}
1746
1747static __inline__ void
1748waitq_stats_count_wakeup(struct waitq *waitq)
1749{
1750 struct wq_stats *wqs = waitq_global_stats(waitq);
1751 if (wqs != NULL) {
1752 wqs->wakeups++;
1753 waitq_grab_backtrace(wqs->last_wakeup, 2);
1754 }
1755}
1756
1757static __inline__ void
1758waitq_stats_count_clear_wakeup(struct waitq *waitq)
1759{
1760 struct wq_stats *wqs = waitq_global_stats(waitq);
1761 if (wqs != NULL) {
1762 wqs->wakeups++;
1763 wqs->clears++;
1764 waitq_grab_backtrace(wqs->last_wakeup, 2);
1765 }
1766}
1767
1768static __inline__ void
1769waitq_stats_count_fail(struct waitq *waitq)
1770{
1771 struct wq_stats *wqs = waitq_global_stats(waitq);
1772 if (wqs != NULL) {
1773 wqs->failed_wakeups++;
1774 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
1775 }
1776}
1777#else /* !CONFIG_WAITQ_STATS */
1778#define waitq_stats_count_wait(q) do { } while (0)
1779#define waitq_stats_count_wakeup(q) do { } while (0)
1780#define waitq_stats_count_clear_wakeup(q) do { } while (0)
1781#define waitq_stats_count_fail(q) do { } while (0)
1782#endif
1783
1784int
1785waitq_is_valid(struct waitq *waitq)
1786{
1787 return (waitq != NULL) && waitq->waitq_isvalid;
1788}
1789
1790int
1791waitq_set_is_valid(struct waitq_set *wqset)
1792{
1793 return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);
1794}
1795
1796int
1797waitq_is_global(struct waitq *waitq)
1798{
1799 if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs) {
1800 return 1;
1801 }
1802 return 0;
1803}
1804
/*
 * Returns non-zero if this waitq may be manipulated from IRQ-disabled
 * context (the waitq_irq flag is set at initialization time).
 */
int
waitq_irq_safe(struct waitq *waitq)
{
	/* global wait queues have this bit set on initialization */
	return waitq->waitq_irq;
}
1811
1812static inline bool
1813waitq_empty(struct waitq *wq)
1814{
1815 if (waitq_is_turnstile_queue(wq)) {
1816 return priority_queue_empty(&wq->waitq_prio_queue);
1817 } else if (waitq_is_turnstile_proxy(wq)) {
1818 struct turnstile *ts = wq->waitq_ts;
1819 return ts == TURNSTILE_NULL ||
1820 priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
1821 } else {
1822 return queue_empty(&wq->waitq_queue);
1823 }
1824}
1825
1826static struct waitq *
1827waitq_get_safeq(struct waitq *waitq)
1828{
1829 /* Check if it's a port waitq */
1830 if (waitq_is_turnstile_proxy(waitq)) {
1831 struct turnstile *ts = waitq->waitq_ts;
1832 return ts ? &ts->ts_waitq : NULL;
1833 }
1834 return global_eventq(waitq);
1835}
1836
1837static uint32_t
1838waitq_hash_size(void)
1839{
1840 uint32_t hsize, queues;
1841
1842 if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) {
1843 return hsize;
1844 }
1845
1846 queues = thread_max / 5;
1847 hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
1848
1849 return hsize;
1850}
1851
1852/*
1853 * Since the priority ordered waitq uses basepri as the
1854 * ordering key assert that this value fits in a uint8_t.
1855 */
1856static_assert(MAXPRI <= UINT8_MAX);
1857
1858static inline void
1859waitq_thread_insert(struct waitq *wq,
1860 thread_t thread, boolean_t fifo)
1861{
1862 if (waitq_is_turnstile_queue(wq)) {
1863 turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
1864 turnstile_waitq_add_thread_priority_queue(wq, thread);
1865 } else {
1866 turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
1867 if (fifo) {
1868 enqueue_tail(&wq->waitq_queue, &thread->wait_links);
1869 } else {
1870 enqueue_head(&wq->waitq_queue, &thread->wait_links);
1871 }
1872 }
1873}
1874
/*
 * Remove 'thread' from the wait queue it is enqueued on.
 *
 * Conditions:
 *	wq is locked; thread is currently on wq's queue
 */
static inline void
waitq_thread_remove(struct waitq *wq,
    thread_t thread)
{
	if (waitq_is_turnstile_queue(wq)) {
		/* emit a kdebug trace for turnstile heap removals */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
		    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
		    thread_tid(thread),
		    0, 0, 0);
		priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
		    PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
	} else {
		/* plain doubly-linked queue removal */
		remqueue(&(thread->wait_links));
	}
}
1891
1892void
1893waitq_bootstrap(void)
1894{
1895 kern_return_t kret;
1896 uint32_t whsize, qsz, tmp32;
1897
1898 g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
1899 if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) {
1900 g_min_free_table_elem = tmp32;
1901 }
1902 wqdbg("Minimum free table elements: %d", tmp32);
1903
1904 lck_grp_init(&waitq_lck_grp, "waitq", LCK_GRP_ATTR_NULL);
1905
1906 /*
1907 * Determine the amount of memory we're willing to reserve for
1908 * the waitqueue hash table
1909 */
1910 whsize = waitq_hash_size();
1911
1912 /* Determine the number of waitqueues we can fit. */
1913 qsz = sizeof(struct waitq);
1914 whsize = ROUNDDOWN(whsize, qsz);
1915 g_num_waitqs = whsize / qsz;
1916
1917 /*
1918 * The hash algorithm requires that this be a power of 2, so we
1919 * just mask off all the low-order bits.
1920 */
1921 for (uint32_t i = 0; i < 31; i++) {
1922 uint32_t bit = (1 << i);
1923 if ((g_num_waitqs & bit) == g_num_waitqs) {
1924 break;
1925 }
1926 g_num_waitqs &= ~bit;
1927 }
1928 assert(g_num_waitqs > 0);
1929
1930 /* Now determine how much memory we really need. */
1931 whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
1932
1933 wqdbg("allocating %d global queues (%d bytes)", g_num_waitqs, whsize);
1934 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
1935 whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1936 if (kret != KERN_SUCCESS || global_waitqs == NULL) {
1937 panic("kernel_memory_allocate() failed to alloc global_waitqs"
1938 ", error: %d, whsize: 0x%x", kret, whsize);
1939 }
1940
1941#if CONFIG_WAITQ_STATS
1942 whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
1943 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
1944 whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1945 if (kret != KERN_SUCCESS || global_waitqs == NULL) {
1946 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
1947 ", error: %d, whsize: 0x%x", kret, whsize);
1948 }
1949 memset(g_waitq_stats, 0, whsize);
1950#endif
1951
1952 for (uint32_t i = 0; i < g_num_waitqs; i++) {
1953 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
1954 }
1955
1956 waitq_set_zone = zinit(sizeof(struct waitq_set),
1957 WAITQ_SET_MAX * sizeof(struct waitq_set),
1958 sizeof(struct waitq_set),
1959 "waitq sets");
1960 zone_change(waitq_set_zone, Z_NOENCRYPT, TRUE);
1961
1962 /* initialize the global waitq link table */
1963 wql_init();
1964
1965 /* initialize the global waitq prepost table */
1966 wqp_init();
1967}
1968
1969
1970/* ----------------------------------------------------------------------
1971 *
1972 * Wait Queue Implementation
1973 *
1974 * ---------------------------------------------------------------------- */
1975
1976/*
1977 * Double the standard lock timeout, because wait queues tend
1978 * to iterate over a number of threads - locking each. If there is
1979 * a problem with a thread lock, it normally times out at the wait
1980 * queue level first, hiding the real problem.
1981 */
1982/* For x86, the hardware timeout is in TSC units. */
1983#if defined(__i386__) || defined(__x86_64__)
1984#define hwLockTimeOut LockTimeOutTSC
1985#else
1986#define hwLockTimeOut LockTimeOut
1987#endif
1988
/*
 * Acquire the waitq lock, allowing twice the standard hardware lock
 * timeout (wait queues lock many threads in turn, so they get extra
 * slack). Panics on apparent deadlock once timeouts are re-enabled.
 */
void
waitq_lock(struct waitq *wq)
{
	if (__improbable(waitq_lock_to(wq,
	    hwLockTimeOut * 2) == 0)) {
		boolean_t wql_acquired = FALSE;

		/* keep retrying for as long as machine timeouts are suspended */
		while (machine_timeout_suspended()) {
			mp_enable_preemption();
			wql_acquired = waitq_lock_to(wq,
			    hwLockTimeOut * 2);
			if (wql_acquired) {
				break;
			}
		}
		if (wql_acquired == FALSE) {
			panic("waitq deadlock - waitq=%p, cpu=%d\n",
			    wq, cpu_number());
		}
	}
#if defined(__x86_64__)
	pltrace(FALSE);
#endif
	assert(waitq_held(wq));
}
2014
/*
 * Release the waitq lock. The caller must hold it.
 */
void
waitq_unlock(struct waitq *wq)
{
	assert(waitq_held(wq));
#if defined(__x86_64__)
	/* mark the end of the preemption-disabled (lock-held) region */
	pltrace(TRUE);
#endif
	waitq_lock_unlock(wq);
}
2024
2025
2026/**
2027 * clear the thread-related waitq state
2028 *
2029 * Conditions:
2030 * 'thread' is locked
2031 */
2032static inline void
2033thread_clear_waitq_state(thread_t thread)
2034{
2035 thread->waitq = NULL;
2036 thread->wait_event = NO_EVENT64;
2037 thread->at_safe_point = FALSE;
2038}
2039
2040
/*
 * Optional thread-selection refinement callback: given a candidate
 * 'thread' waiting on 'waitq', return the thread to take (typically
 * the same one) or THREAD_NULL to skip it.
 */
typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
    int is_global, thread_t thread);

/* argument bundle shared by the waitq selection routines */
struct waitq_select_args {
	/* input parameters */
	struct waitq *posted_waitq;   /* waitq on which the event was posted */
	struct waitq *waitq;          /* waitq currently being searched */
	event64_t event;              /* wait event to match */
	waitq_select_cb select_cb;    /* optional refinement callback (may be NULL) */
	void *select_ctx;             /* opaque context passed to select_cb */
	int priority;                 /* priority hint — semantics set by caller; TODO confirm */

	uint64_t *reserved_preposts;  /* pre-allocated prepost elements (may be NULL) */

	/* output parameters */
	queue_t threadq;              /* if non-NULL, selected threads are moved here (locked) */
	int max_threads;              /* stop after this many selections (<= 0: no limit) */
	int *nthreads;                /* running count of threads selected so far */
	spl_t *spl;                   /* saved interrupt state when threads are returned */
};
2061
2062static void do_waitq_select_n_locked(struct waitq_select_args *args);
2063
2064/**
2065 * callback invoked once for every waitq set to which a waitq belongs
2066 *
2067 * Conditions:
2068 * ctx->posted_waitq is locked
2069 * 'link' points to a valid waitq set
2070 *
2071 * Notes:
2072 * Takes the waitq set lock on the set pointed to by 'link'
2073 * Calls do_waitq_select_n_locked() which could recurse back into
2074 * this function if the waitq set is a member of other sets.
2075 * If no threads were selected, it preposts the input waitq
2076 * onto the waitq set pointed to by 'link'.
2077 */
2078static int
2079waitq_select_walk_cb(struct waitq *waitq, void *ctx,
2080 struct waitq_link *link)
2081{
2082 int ret = WQ_ITERATE_CONTINUE;
2083 struct waitq_select_args args = *((struct waitq_select_args *)ctx);
2084 struct waitq_set *wqset;
2085
2086 (void)waitq;
2087 assert(wql_type(link) == WQL_WQS);
2088
2089 wqset = link->wql_wqs.wql_set;
2090 args.waitq = &wqset->wqset_q;
2091
2092 assert(!waitq_irq_safe(waitq));
2093 assert(!waitq_irq_safe(&wqset->wqset_q));
2094
2095 waitq_set_lock(wqset);
2096 /*
2097 * verify that the link wasn't invalidated just before
2098 * we were able to take the lock.
2099 */
2100 if (wqset->wqset_id != link->wql_setid.id) {
2101 goto out_unlock;
2102 }
2103
2104 assert(waitqs_is_linked(wqset));
2105
2106 /*
2107 * Find any threads waiting on this wait queue set,
2108 * and recurse into any waitq set to which this set belongs.
2109 */
2110 do_waitq_select_n_locked(&args);
2111
2112 if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
2113 /* at least 1 thread was selected and returned: don't prepost */
2114 if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
2115 /* break out of the setid walk */
2116 ret = WQ_ITERATE_FOUND;
2117 }
2118 } else if (args.event == NO_EVENT64) {
2119 /*
2120 * No thread selected: prepost 'waitq' to 'wqset'
2121 * if wqset can handle preposts and the event is set to 0.
2122 * We also make sure to not post waitq sets to other sets.
2123 *
2124 * If the set doesn't support preposts, but does support
2125 * prepost callout/hook interaction, invoke the predefined
2126 * callout function and pass the set's 'prepost_hook.' This
2127 * could potentially release another thread to handle events.
2128 */
2129 if (waitq_set_can_prepost(wqset)) {
2130 wq_prepost_do_post_locked(
2131 wqset, waitq, args.reserved_preposts);
2132 } else if (waitq_set_has_prepost_hook(wqset)) {
2133 waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
2134
2135 /*
2136 * When calling out to the prepost hook,
2137 * we drop the waitq lock, to allow for the kevent
2138 * subsytem to call into the waitq subsystem again,
2139 * without risking a deadlock.
2140 *
2141 * However, we need to guard against wqset going away,
2142 * so we increment the prepost hook use count
2143 * while the lock is dropped.
2144 *
2145 * This lets waitq_set_deinit() know to wait for the
2146 * prepost hook call to be done before it can proceed.
2147 *
2148 * Note: we need to keep preemption disabled the whole
2149 * time as waitq_set_deinit will spin on this.
2150 */
2151
2152 disable_preemption();
2153 os_atomic_inc(hook, relaxed);
2154 waitq_set_unlock(wqset);
2155
2156 waitq_set__CALLING_PREPOST_HOOK__(hook);
2157
2158 /* Note: after this decrement, the wqset may be deallocated */
2159 os_atomic_dec(hook, relaxed);
2160 enable_preemption();
2161 return ret;
2162 }
2163 }
2164
2165out_unlock:
2166 waitq_set_unlock(wqset);
2167 return ret;
2168}
2169
2170/**
2171 * Routine to iterate over the waitq for non-priority ordered waitqs
2172 *
2173 * Conditions:
2174 * args->waitq (and args->posted_waitq) is locked
2175 *
2176 * Notes:
2177 * Uses the optional select callback function to refine the selection
2178 * of one or more threads from a waitq. The select callback is invoked
2179 * once for every thread that is found to be waiting on the input args->waitq.
2180 *
2181 * If one or more threads are selected, this may disable interrupts.
2182 * The previous interrupt state is returned in args->spl and should
2183 * be used in a call to splx() if threads are returned to the caller.
2184 */
2185static thread_t
2186waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
2187 spl_t spl, struct waitq_select_args *args,
2188 uint32_t *remaining_eventmask)
2189{
2190 int max_threads = args->max_threads;
2191 int *nthreads = args->nthreads;
2192 thread_t thread = THREAD_NULL;
2193 thread_t first_thread = THREAD_NULL;
2194
2195 qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
2196 thread_t t = THREAD_NULL;
2197 assert_thread_magic(thread);
2198
2199 /*
2200 * For non-priority ordered waitqs, we allow multiple events to be
2201 * mux'ed into the same waitq. Also safeqs may contain threads from
2202 * multiple waitqs. Only pick threads that match the
2203 * requested wait event.
2204 */
2205 if (thread->waitq == waitq && thread->wait_event == args->event) {
2206 t = thread;
2207 if (first_thread == THREAD_NULL) {
2208 first_thread = thread;
2209 }
2210
2211 /* allow the caller to futher refine the selection */
2212 if (args->select_cb) {
2213 t = args->select_cb(args->select_ctx, waitq,
2214 waitq_is_global(waitq), thread);
2215 }
2216 if (t != THREAD_NULL) {
2217 *nthreads += 1;
2218 if (args->threadq) {
2219 /* if output queue, add locked thread to it */
2220 if (*nthreads == 1) {
2221 *(args->spl) = (safeq != waitq) ? spl : splsched();
2222 }
2223 thread_lock(t);
2224 thread_clear_waitq_state(t);
2225 re_queue_tail(args->threadq, &t->wait_links);
2226 }
2227 /* only enqueue up to 'max' threads */
2228 if (*nthreads >= max_threads && max_threads > 0) {
2229 break;
2230 }
2231 }
2232 }
2233 /* thread wasn't selected so track it's event */
2234 if (t == THREAD_NULL) {
2235 *remaining_eventmask |= (thread->waitq != safeq) ?
2236 _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);
2237 }
2238 }
2239
2240 return first_thread;
2241}
2242
2243/**
2244 * Routine to iterate and remove threads from priority ordered waitqs
2245 *
2246 * Conditions:
2247 * args->waitq (and args->posted_waitq) is locked
2248 *
2249 * Notes:
2250 * The priority ordered waitqs only support maximum priority element removal.
2251 *
2252 * Also, the implementation makes sure that all threads in a priority ordered
2253 * waitq are waiting on the same wait event. This is not necessarily true for
2254 * non-priority ordered waitqs. If one or more threads are selected, this may
2255 * disable interrupts. The previous interrupt state is returned in args->spl
2256 * and should be used in a call to splx() if threads are returned to the caller.
2257 *
2258 * In the future, we could support priority ordered waitqs with multiple wait
2259 * events in the same queue. The way to implement that would be to keep removing
2260 * elements from the waitq and if the event does not match the requested one,
2261 * add it to a local list. This local list of elements needs to be re-inserted
2262 * into the priority queue at the end and the select_cb return value &
2263 * remaining_eventmask would need to be handled appropriately. The implementation
2264 * is not very efficient but would work functionally.
2265 */
static thread_t
waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    spl_t spl, struct waitq_select_args *args,
    uint32_t *remaining_eventmask)
{
	int max_threads = args->max_threads;
	int *nthreads = args->nthreads;
	thread_t first_thread = THREAD_NULL;
	thread_t thread = THREAD_NULL;

	/*
	 * The waitq select routines need to handle two cases:
	 * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)
	 *         Get the maximum priority thread from the waitq without removing it.
	 *         In that case args->threadq == NULL and max_threads == 1.
	 * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)
	 *         Get max_threads (if available) while removing them from the waitq.
	 *         In that case args->threadq != NULL and max_threads is one of {-1, 1}.
	 *
	 * The only possible values for remaining_eventmask for the priority queue
	 * waitq are either 0 (for the remove all threads case) or the original
	 * safeq->waitq_eventmask (for the lookup/remove one thread cases).
	 */
	*remaining_eventmask = safeq->waitq_eventmask;
	boolean_t remove_op = !!(args->threadq);

	/* max_threads <= 0 means "no limit": drain until the prioq is empty */
	while ((max_threads <= 0) || (*nthreads < max_threads)) {
		if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
			/* queue fully drained: no events can remain */
			*remaining_eventmask = 0;
			break;
		}

		if (remove_op) {
			thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
			    struct thread, wait_prioq_links,
			    PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
		} else {
			/* For the peek operation, the only valid value for max_threads is 1 */
			assert(max_threads == 1);
			thread = priority_queue_max(&safeq->waitq_prio_queue,
			    struct thread, wait_prioq_links);
		}
		/*
		 * Ensure the wait event matches since priority ordered waitqs do not
		 * support multiple events in the same waitq.
		 */
		assert((thread->waitq == waitq) && (thread->wait_event == args->event));

		if (args->select_cb) {
			/*
			 * Call the select_cb passed into the waitq_select args. The callback
			 * updates the select_ctx with information about the highest priority
			 * thread which is eventually used by the caller.
			 */
			thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,
			    waitq_is_global(waitq), thread);
			if (!remove_op) {
				/* For the peek operation, the thread should not be selected for addition */
				assert(ret_thread == THREAD_NULL);
			} else {
				/*
				 * For the remove operation, the select routine should always return a valid
				 * thread for priority waitqs. Since all threads in a prioq are equally
				 * eligible, it should match the thread removed from the prioq. If this
				 * invariant changes, the implementation would need to handle the
				 * remaining_eventmask here correctly.
				 */
				assert(ret_thread == thread);
			}
		}

		if (first_thread == THREAD_NULL) {
			first_thread = thread;
			/*
			 * turnstile_kernel_update_inheritor_on_wake_locked will lock
			 * first_thread, so call it before locking it.
			 */
			if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
				turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
			}
		}

		/* For the peek operation, break out early */
		if (!remove_op) {
			break;
		}

		/* Add the thread to the result thread list */
		*nthreads += 1;
		if (*nthreads == 1) {
			/* first selection: record the spl the caller must splx() with */
			*(args->spl) = (safeq != waitq) ? spl : splsched();
		}
		thread_lock(thread);
		thread_clear_waitq_state(thread);
		enqueue_tail(args->threadq, &(thread->wait_links));
	}

	return first_thread;
}
2365
2366/**
2367 * generic thread selection from a waitq (and sets to which the waitq belongs)
2368 *
2369 * Conditions:
2370 * args->waitq (and args->posted_waitq) is locked
2371 *
2372 * Notes:
2373 * Uses the optional select callback function to refine the selection
2374 * of one or more threads from a waitq and any set to which the waitq
2375 * belongs. The select callback is invoked once for every thread that
2376 * is found to be waiting on the input args->waitq.
2377 *
2378 * If one or more threads are selected, this may disable interrupts.
2379 * The previous interrupt state is returned in args->spl and should
2380 * be used in a call to splx() if threads are returned to the caller.
2381 */
static void
do_waitq_select_n_locked(struct waitq_select_args *args)
{
	struct waitq *waitq = args->waitq;
	int max_threads = args->max_threads;
	thread_t first_thread = THREAD_NULL;
	struct waitq *safeq;
	uint32_t remaining_eventmask = 0;
	uint32_t eventmask;
	int *nthreads = args->nthreads;
	spl_t spl = 0;

	assert(max_threads != 0);

	if (!waitq_irq_safe(waitq)) {
		/* JMM - add flag to waitq to avoid global lookup if no waiters */
		eventmask = _CAST_TO_EVENT_MASK(waitq);
		safeq = waitq_get_safeq(waitq);
		if (safeq == NULL) {
			/*
			 * in the WQT_TSPROXY case, if there's no turnstile,
			 * there's no queue and no waiters, so we can move straight
			 * to the waitq set recursion
			 */
			goto handle_waitq_set;
		}

		/*
		 * Only raise spl for the first safeq we visit; on recursive
		 * calls (*nthreads > 0) interrupts are already disabled.
		 */
		if (*nthreads == 0) {
			spl = splsched();
		}
		waitq_lock(safeq);
	} else {
		/* irq-safe waitqs hold their own threads; caller already at splsched */
		eventmask = _CAST_TO_EVENT_MASK(args->event);
		safeq = waitq;
	}

	/*
	 * If the safeq doesn't have an eventmask (not global) or the event
	 * we're looking for IS set in its eventmask, then scan the threads
	 * in that queue for ones that match the original <waitq,event> pair.
	 */
	if (!waitq_is_global(safeq) ||
	    (safeq->waitq_eventmask & eventmask) == eventmask) {
		if (waitq_is_turnstile_queue(safeq)) {
			first_thread = waitq_prioq_iterate_locked(safeq, waitq,
			    spl, args,
			    &remaining_eventmask);
		} else {
			first_thread = waitq_queue_iterate_locked(safeq, waitq,
			    spl, args,
			    &remaining_eventmask);
		}

		/*
		 * Update the eventmask of global queues we just scanned:
		 * - If we selected all the threads in the queue, we can clear its
		 *   eventmask.
		 *
		 * - If we didn't find enough threads to fill our needs, then we can
		 *   assume we looked at every thread in the queue and the mask we
		 *   computed is complete - so reset it.
		 */
		if (waitq_is_global(safeq)) {
			if (waitq_empty(safeq)) {
				safeq->waitq_eventmask = 0;
			} else if (max_threads < 0 || *nthreads < max_threads) {
				safeq->waitq_eventmask = remaining_eventmask;
			}
		}
	}

	/*
	 * Grab the first thread in the queue if no other thread was selected.
	 * We can guarantee that no one has manipulated this thread because
	 * it's waiting on the given waitq, and we have that waitq locked.
	 */
	if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
		/* we know this is the first (and only) thread */
		++(*nthreads);
		*(args->spl) = (safeq != waitq) ? spl : splsched();

		thread_lock(first_thread);
		thread_clear_waitq_state(first_thread);
		waitq_thread_remove(safeq, first_thread);
		enqueue_tail(args->threadq, &(first_thread->wait_links));

		/* update the eventmask on [now] empty global queues */
		if (waitq_is_global(safeq) && waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		}
	}

	/* unlock the safe queue if we locked one above */
	if (safeq != waitq) {
		waitq_unlock(safeq);
		/* no threads selected: nothing for the caller to splx(), drop it here */
		if (*nthreads == 0) {
			splx(spl);
		}
	}

	if (max_threads > 0 && *nthreads >= max_threads) {
		return;
	}

handle_waitq_set:
	/*
	 * wait queues that are not in any sets
	 * are the bottom of the recursion
	 */
	if (!waitq->waitq_set_id) {
		return;
	}

	/* check to see if the set ID for this wait queue is valid */
	struct waitq_link *link = wql_get_link(waitq->waitq_set_id);
	if (!link) {
		/* the waitq set to which this waitq belonged, has been invalidated */
		waitq->waitq_set_id = 0;
		return;
	}

	wql_put_link(link);

	/*
	 * If this waitq is a member of any wait queue sets, we need to look
	 * for waiting thread(s) in any of those sets, and prepost all sets that
	 * don't have active waiters.
	 *
	 * Note that we do a local walk of this waitq's links - we manually
	 * recurse down wait queue set's with non-zero wqset_q.waitq_set_id
	 */
	(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
	    WQL_WQS, (void *)args, waitq_select_walk_cb);
}
2516
2517/**
2518 * main entry point for thread selection from a waitq
2519 *
2520 * Conditions:
2521 * waitq is locked
2522 *
2523 * Returns:
2524 * The number of threads waiting on 'waitq' for 'event' which have
2525 * been placed onto the input 'threadq'
2526 *
2527 * Notes:
2528 * The 'select_cb' function is invoked for every thread found waiting on
2529 * 'waitq' for 'event'. The thread is _not_ locked upon callback
2530 * invocation. This parameter may be NULL.
2531 *
2532 * If one or more threads are returned in 'threadq' then the caller is
2533 * responsible to call splx() using the returned 'spl' value. Each
2534 * returned thread is locked.
2535 */
2536static __inline__ int
2537waitq_select_n_locked(struct waitq *waitq,
2538 event64_t event,
2539 waitq_select_cb select_cb,
2540 void *select_ctx,
2541 uint64_t *reserved_preposts,
2542 queue_t threadq,
2543 int max_threads, spl_t *spl,
2544 int priority)
2545{
2546 int nthreads = 0;
2547
2548 struct waitq_select_args args = {
2549 .posted_waitq = waitq,
2550 .waitq = waitq,
2551 .event = event,
2552 .select_cb = select_cb,
2553 .select_ctx = select_ctx,
2554 .priority = priority,
2555 .reserved_preposts = reserved_preposts,
2556 .threadq = threadq,
2557 .max_threads = max_threads,
2558 .nthreads = &nthreads,
2559 .spl = spl,
2560 };
2561
2562 do_waitq_select_n_locked(&args);
2563 return nthreads;
2564}
2565
2566/**
2567 * select from a waitq a single thread waiting for a given event
2568 *
2569 * Conditions:
2570 * 'waitq' is locked
2571 *
2572 * Returns:
2573 * A locked thread that's been removed from the waitq, but has not
2574 * yet been put on a run queue. Caller is responsible to call splx
2575 * with the '*spl' value.
2576 */
2577static thread_t
2578waitq_select_one_locked(struct waitq *waitq, event64_t event,
2579 uint64_t *reserved_preposts,
2580 int priority, spl_t *spl)
2581{
2582 int nthreads;
2583 queue_head_t threadq;
2584
2585 queue_init(&threadq);
2586
2587 nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
2588 reserved_preposts, &threadq, 1, spl, priority);
2589
2590 /* if we selected a thread, return it (still locked) */
2591 if (!queue_empty(&threadq)) {
2592 thread_t t;
2593 queue_entry_t qe = dequeue_head(&threadq);
2594 t = qe_element(qe, struct thread, wait_links);
2595 assert(queue_empty(&threadq)); /* there should be 1 entry */
2596 /* t has been locked and removed from all queues */
2597 return t;
2598 }
2599
2600 return THREAD_NULL;
2601}
2602
/*
 * Context handed to waitq_select_thread_cb while walking the sets a
 * waitq belongs to, looking for one specific waiting thread.
 */
struct select_thread_ctx {
	thread_t thread;        /* the specific thread being searched for */
	event64_t event;        /* the event it must be waiting on */
	spl_t *spl;             /* out: interrupt state if the thread is found (and left locked) */
};
2608
2609/**
2610 * link walk callback invoked once for each set to which a waitq belongs
2611 *
2612 * Conditions:
2613 * initial waitq is locked
2614 * ctx->thread is unlocked
2615 *
2616 * Notes:
2617 * This may disable interrupts and early-out of the full DAG link walk by
2618 * returning KERN_ALREADY_IN_SET. In this case, the returned thread has
2619 * been removed from the waitq, it's waitq state has been reset, and the
2620 * caller is responsible to call splx() with the returned interrupt state
2621 * in ctx->spl.
2622 */
static int
waitq_select_thread_cb(struct waitq *waitq, void *ctx,
    struct waitq_link *link)
{
	struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
	struct waitq_set *wqset;
	struct waitq *wqsetq;
	struct waitq *safeq;
	spl_t s;

	(void)waitq;

	thread_t thread = stctx->thread;
	event64_t event = stctx->event;

	/* only wait queue set links are interesting; skip everything else */
	if (wql_type(link) != WQL_WQS) {
		return WQ_ITERATE_CONTINUE;
	}

	wqset = link->wql_wqs.wql_set;
	wqsetq = &wqset->wqset_q;

	assert(!waitq_irq_safe(waitq));
	assert(!waitq_irq_safe(wqsetq));

	/*
	 * Lock ordering here is: wqset lock -> splsched -> safeq lock ->
	 * thread lock. The thread is physically enqueued on the set's safeq.
	 */
	waitq_set_lock(wqset);

	s = splsched();

	/* find and lock the interrupt-safe waitq the thread is thought to be on */
	safeq = waitq_get_safeq(wqsetq);
	waitq_lock(safeq);

	thread_lock(thread);

	if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
		/* found it: pull it off the set's queue while all locks are held */
		waitq_thread_remove(wqsetq, thread);
		if (waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		}
		thread_clear_waitq_state(thread);
		waitq_unlock(safeq);
		waitq_set_unlock(wqset);
		/*
		 * thread still locked,
		 * return non-zero to break out of WQS walk
		 */
		*(stctx->spl) = s;
		return WQ_ITERATE_FOUND;
	}

	/* not waiting here: drop everything and keep walking the DAG */
	thread_unlock(thread);
	waitq_set_unlock(wqset);
	waitq_unlock(safeq);
	splx(s);

	return WQ_ITERATE_CONTINUE;
}
2681
2682/**
2683 * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
2684 * on 'waitq' (or any set to which waitq belongs) for 'event'
2685 *
2686 * Conditions:
2687 * 'waitq' is locked
2688 * 'thread' is unlocked
2689 */
static kern_return_t
waitq_select_thread_locked(struct waitq *waitq,
    event64_t event,
    thread_t thread, spl_t *spl)
{
	struct waitq *safeq;
	struct waitq_link *link;
	struct select_thread_ctx ctx;
	kern_return_t kr;
	spl_t s;

	/* Find and lock the interrupts disabled queue the thread is actually on */
	if (!waitq_irq_safe(waitq)) {
		safeq = waitq_get_safeq(waitq);
		if (safeq == NULL) {
			/*
			 * in the WQT_TSPROXY case, if there's no turnstile,
			 * there's no queue and no waiters, so we can move straight
			 * to the waitq set recursion
			 */
			goto handle_waitq_set;
		}

		s = splsched();
		waitq_lock(safeq);
	} else {
		s = splsched();
		safeq = waitq;
	}

	thread_lock(thread);

	if ((thread->waitq == waitq) && (thread->wait_event == event)) {
		/* thread was waiting directly on this waitq - dequeue it */
		waitq_thread_remove(safeq, thread);
		if (waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		}
		thread_clear_waitq_state(thread);
		*spl = s;
		/* thread still locked */
		return KERN_SUCCESS;
	}

	thread_unlock(thread);

	if (safeq != waitq) {
		waitq_unlock(safeq);
	}

	splx(s);

handle_waitq_set:
	if (!waitq->waitq_set_id) {
		return KERN_NOT_WAITING;
	}

	/* check to see if the set ID for this wait queue is valid */
	link = wql_get_link(waitq->waitq_set_id);
	if (!link) {
		/* the waitq set to which this waitq belonged, has been invalidated */
		waitq->waitq_set_id = 0;
		return KERN_NOT_WAITING;
	}

	/*
	 * The thread may be waiting on a wait queue set to which
	 * the input 'waitq' belongs. Go look for the thread in
	 * all wait queue sets. If it's there, we'll remove it
	 * because it's equivalent to waiting directly on the input waitq.
	 */
	ctx.thread = thread;
	ctx.event = event;
	ctx.spl = spl;
	kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
	    WQL_WQS, (void *)&ctx, waitq_select_thread_cb);

	wql_put_link(link);

	/* we found a thread, return success */
	if (kr == WQ_ITERATE_FOUND) {
		return KERN_SUCCESS;
	}

	return KERN_NOT_WAITING;
}
2775
/*
 * wq_prepost_foreach_locked callback: short-circuit the prepost iteration
 * as soon as any valid prepost object is seen (used by the assert_wait
 * early-out path to detect an already-preposted set).
 */
static int
prepost_exists_cb(struct waitq_set __unused *wqset,
    void __unused *ctx,
    struct wq_prepost __unused *wqp,
    struct waitq __unused *waitq)
{
	/* if we get here, then we know that there is a valid prepost object! */
	return WQ_ITERATE_FOUND;
}
2785
2786/**
2787 * declare a thread's intent to wait on 'waitq' for 'wait_event'
2788 *
2789 * Conditions:
2790 * 'waitq' is locked
2791 */
wait_result_t
waitq_assert_wait64_locked(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway,
    thread_t thread)
{
	wait_result_t wait_result;
	int realtime = 0;
	struct waitq *safeq;
	uintptr_t eventmask;
	spl_t s;


	/*
	 * Warning: Do _not_ place debugging print statements here.
	 * The waitq is locked!
	 */
	assert(!thread->started || thread == current_thread());

	if (thread->waitq != NULL) {
		panic("thread already waiting on %p", thread->waitq);
	}

	if (waitq_is_set(waitq)) {
		struct waitq_set *wqset = (struct waitq_set *)waitq;
		/*
		 * early-out if the thread is waiting on a wait queue set
		 * that has already been pre-posted.
		 */
		if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
			int ret;
			/*
			 * Run through the list of potential preposts. Because
			 * this is a hot path, we short-circuit the iteration
			 * if we find just one prepost object.
			 */
			ret = wq_prepost_foreach_locked(wqset, NULL,
			    prepost_exists_cb);
			if (ret == WQ_ITERATE_FOUND) {
				/* already preposted: wake immediately, never enqueue */
				s = splsched();
				thread_lock(thread);
				thread->wait_result = THREAD_AWAKENED;
				thread_unlock(thread);
				splx(s);
				return THREAD_AWAKENED;
			}
		}
	}

	s = splsched();

	/*
	 * If already dealing with an irq safe wait queue, we are all set.
	 * Otherwise, determine a global queue to use and lock it.
	 */
	if (!waitq_irq_safe(waitq)) {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to assert_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq);
		}
		eventmask = _CAST_TO_EVENT_MASK(waitq);
		waitq_lock(safeq);
	} else {
		safeq = waitq;
		eventmask = _CAST_TO_EVENT_MASK(wait_event);
	}

	/* lock the thread now that we have the irq-safe waitq locked */
	thread_lock(thread);

	/*
	 * Realtime threads get priority for wait queue placements.
	 * This allows wait_queue_wakeup_one to prefer a waiting
	 * realtime thread, similar in principle to performing
	 * a wait_queue_wakeup_all and allowing scheduler prioritization
	 * to run the realtime thread, but without causing the
	 * lock contention of that scenario.
	 */
	if (thread->sched_pri >= BASEPRI_REALTIME) {
		realtime = 1;
	}

	/*
	 * This is the extent to which we currently take scheduling attributes
	 * into account. If the thread is vm privileged, we stick it at
	 * the front of the queue. Later, these queues will honor the policy
	 * value set at waitq_init time.
	 */
	wait_result = thread_mark_wait_locked(thread, interruptible);
	/* thread->wait_result has been set */
	if (wait_result == THREAD_WAITING) {
		/* LIFO queues, VM-privileged and realtime threads go to the head */
		if (!safeq->waitq_fifo
		    || (thread->options & TH_OPT_VMPRIV) || realtime) {
			waitq_thread_insert(safeq, thread, false);
		} else {
			waitq_thread_insert(safeq, thread, true);
		}

		/* mark the event and real waitq, even if enqueued on a global safeq */
		thread->wait_event = wait_event;
		thread->waitq = waitq;

		if (deadline != 0) {
			boolean_t act;

			act = timer_call_enter_with_leeway(&thread->wait_timer,
			    NULL,
			    deadline, leeway,
			    urgency, FALSE);
			if (!act) {
				thread->wait_timer_active++;
			}
			thread->wait_timer_is_set = TRUE;
		}

		if (waitq_is_global(safeq)) {
			safeq->waitq_eventmask |= eventmask;
		}

		waitq_stats_count_wait(waitq);
	}

	/* unlock the thread */
	thread_unlock(thread);

	/* update the inheritor's thread priority if the waitq is embedded in turnstile */
	if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {
		turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
		turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
	}

	/* unlock the safeq if we locked it here */
	if (safeq != waitq) {
		waitq_unlock(safeq);
	}

	splx(s);

	return wait_result;
}
2936
2937/**
2938 * remove 'thread' from its current blocking state on 'waitq'
2939 *
2940 * Conditions:
2941 * 'thread' is locked
2942 *
2943 * Notes:
2944 * This function is primarily used by clear_wait_internal in
2945 * sched_prim.c from the thread timer wakeup path
2946 * (i.e. the thread was waiting on 'waitq' with a timeout that expired)
2947 */
int
waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
{
	struct waitq *safeq;

	assert_thread_magic(thread);
	assert(thread->waitq == waitq);

	/* Find the interrupts disabled queue thread is waiting on */
	if (!waitq_irq_safe(waitq)) {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to clear_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq);
		}
	} else {
		safeq = waitq;
	}

	/*
	 * thread is already locked so have to try for the waitq lock.
	 * (Taking the waitq lock after the thread lock would invert the
	 * usual order, so a blocking acquire could deadlock; the caller
	 * retries when we return 0.)
	 */
	if (!waitq_lock_try(safeq)) {
		return 0;
	}

	waitq_thread_remove(safeq, thread);
	thread_clear_waitq_state(thread);
	waitq_stats_count_clear_wakeup(waitq);

	/* clear the global event mask if this was the last thread there! */
	if (waitq_is_global(safeq) && waitq_empty(safeq)) {
		safeq->waitq_eventmask = 0;
		/* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
	}

	waitq_unlock(safeq);

	return 1;
}
2986
2987
2988static __inline__
2989void
2990maybe_adjust_thread_pri(thread_t thread,
2991 int priority,
2992 __kdebug_only struct waitq *waitq)
2993{
2994 /*
2995 * If the caller is requesting the waitq subsystem to promote the
2996 * priority of the awoken thread, then boost the thread's priority to
2997 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
2998 * higher priority). This boost must be removed via a call to
2999 * waitq_clear_promotion_locked before the thread waits again.
3000 *
3001 * WAITQ_PROMOTE_PRIORITY is -2.
3002 * Anything above 0 represents a mutex promotion.
3003 * The default 'no action' value is -1.
3004 * TODO: define this in a header
3005 */
3006 if (priority == WAITQ_PROMOTE_PRIORITY) {
3007 uintptr_t trace_waitq = 0;
3008 if (__improbable(kdebug_enable)) {
3009 trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);
3010 }
3011
3012 sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
3013 }
3014}
3015
3016/*
3017 * Clear a potential thread priority promotion from a waitq wakeup
3018 * with WAITQ_PROMOTE_PRIORITY.
3019 *
3020 * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.
3021 */
void
waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
{
	spl_t s;

	assert(waitq_held(waitq));
	assert(thread != THREAD_NULL);
	assert(thread == current_thread());

	/* This flag is only cleared by the thread itself, so safe to check outside lock */
	if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
		return;
	}

	/*
	 * 's' is only set/used when the waitq is not irq-safe: holding an
	 * irq-safe waitq lock already implies interrupts are disabled.
	 */
	if (!waitq_irq_safe(waitq)) {
		s = splsched();
	}
	thread_lock(thread);

	sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);

	thread_unlock(thread);
	if (!waitq_irq_safe(waitq)) {
		splx(s);
	}
}
3048
3049/**
3050 * wakeup all threads waiting on 'waitq' for 'wake_event'
3051 *
3052 * Conditions:
3053 * 'waitq' is locked
3054 *
3055 * Notes:
3056 * May temporarily disable and re-enable interrupts
3057 * and re-adjust thread priority of each awoken thread.
3058 *
3059 * If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
3060 * been unlocked before calling thread_go() on any returned threads, and
3061 * is guaranteed to be unlocked upon function return.
3062 */
kern_return_t
waitq_wakeup64_all_locked(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    uint64_t *reserved_preposts,
    int priority,
    waitq_lock_state_t lock_state)
{
	kern_return_t ret;
	thread_t thread;
	spl_t th_spl;
	int nthreads;
	queue_head_t wakeup_queue;

	assert(waitq_held(waitq));
	queue_init(&wakeup_queue);

	/* max_threads == -1: select every thread waiting for wake_event */
	nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
	    reserved_preposts,
	    &wakeup_queue, -1, &th_spl, priority);

	/* set each thread running */
	ret = KERN_NOT_WAITING;

#if CONFIG_WAITQ_STATS
	qe_foreach_element(thread, &wakeup_queue, wait_links)
	waitq_stats_count_wakeup(waitq);
#endif
	/* optionally drop the waitq lock BEFORE making the threads runnable */
	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	/* each selected thread is locked; go/unlock them one at a time */
	qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
		assert_thread_magic(thread);
		remqueue(&thread->wait_links);
		maybe_adjust_thread_pri(thread, priority, waitq);
		ret = thread_go(thread, result);
		assert(ret == KERN_SUCCESS);
		thread_unlock(thread);
	}
	if (nthreads > 0) {
		/* th_spl was set by waitq_select_n_locked when nthreads > 0 */
		splx(th_spl);
	} else {
		waitq_stats_count_fail(waitq);
	}

	return ret;
}
3111
3112/**
3113 * wakeup one thread waiting on 'waitq' for 'wake_event'
3114 *
3115 * Conditions:
3116 * 'waitq' is locked
3117 *
3118 * Notes:
3119 * May temporarily disable and re-enable interrupts.
3120 */
3121kern_return_t
3122waitq_wakeup64_one_locked(struct waitq *waitq,
3123 event64_t wake_event,
3124 wait_result_t result,
3125 uint64_t *reserved_preposts,
3126 int priority,
3127 waitq_lock_state_t lock_state)
3128{
3129 thread_t thread;
3130 spl_t th_spl;
3131
3132 assert(waitq_held(waitq));
3133
3134 thread = waitq_select_one_locked(waitq, wake_event,
3135 reserved_preposts,
3136 priority, &th_spl);
3137
3138 if (thread != THREAD_NULL) {
3139 waitq_stats_count_wakeup(waitq);
3140 } else {
3141 waitq_stats_count_fail(waitq);
3142 }
3143
3144 if (lock_state == WAITQ_UNLOCK) {
3145 waitq_unlock(waitq);
3146 }
3147
3148 if (thread != THREAD_NULL) {
3149 maybe_adjust_thread_pri(thread, priority, waitq);
3150 kern_return_t ret = thread_go(thread, result);
3151 assert(ret == KERN_SUCCESS);
3152 thread_unlock(thread);
3153 splx(th_spl);
3154 return ret;
3155 }
3156
3157 return KERN_NOT_WAITING;
3158}
3159
3160/**
3161 * wakeup one thread waiting on 'waitq' for 'wake_event'
3162 *
3163 * Conditions:
3164 * 'waitq' is locked
3165 *
3166 * Returns:
3167 * A locked, runnable thread.
3168 * If return value is non-NULL, interrupts have also
3169 * been disabled, and the caller is responsible to call
3170 * splx() with the returned '*spl' value.
3171 */
3172thread_t
3173waitq_wakeup64_identify_locked(struct waitq *waitq,
3174 event64_t wake_event,
3175 wait_result_t result,
3176 spl_t *spl,
3177 uint64_t *reserved_preposts,
3178 int priority,
3179 waitq_lock_state_t lock_state)
3180{
3181 thread_t thread;
3182
3183 assert(waitq_held(waitq));
3184
3185 thread = waitq_select_one_locked(waitq, wake_event,
3186 reserved_preposts,
3187 priority, spl);
3188
3189 if (thread != THREAD_NULL) {
3190 waitq_stats_count_wakeup(waitq);
3191 } else {
3192 waitq_stats_count_fail(waitq);
3193 }
3194
3195 if (lock_state == WAITQ_UNLOCK) {
3196 waitq_unlock(waitq);
3197 }
3198
3199 if (thread != THREAD_NULL) {
3200 kern_return_t __assert_only ret;
3201 ret = thread_go(thread, result);
3202 assert(ret == KERN_SUCCESS);
3203 }
3204
3205 return thread; /* locked if not NULL (caller responsible for spl) */
3206}
3207
3208/**
3209 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
3210 *
3211 * Conditions:
3212 * 'waitq' is locked
3213 * 'thread' is unlocked
3214 *
3215 * Notes:
3216 * May temporarily disable and re-enable interrupts
3217 *
3218 * If the input lock_state == WAITQ_UNLOCK then the waitq will have been
3219 * unlocked before calling thread_go() if 'thread' is to be awoken, and
3220 * is guaranteed to be unlocked upon function return.
3221 */
kern_return_t
waitq_wakeup64_thread_locked(struct waitq *waitq,
    event64_t wake_event,
    thread_t thread,
    wait_result_t result,
    waitq_lock_state_t lock_state)
{
	kern_return_t ret;
	spl_t th_spl;

	assert(waitq_held(waitq));
	assert_thread_magic(thread);

	/*
	 * See if the thread was still waiting there. If so, it got
	 * dequeued and returned locked.
	 */
	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);

	if (ret == KERN_SUCCESS) {
		waitq_stats_count_wakeup(waitq);
	} else {
		waitq_stats_count_fail(waitq);
	}

	/* optionally drop the waitq lock before calling thread_go() */
	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	if (ret != KERN_SUCCESS) {
		return KERN_NOT_WAITING;
	}

	/* thread is locked; th_spl was set by the successful select above */
	ret = thread_go(thread, result);
	assert(ret == KERN_SUCCESS);
	thread_unlock(thread);
	splx(th_spl);

	return ret;
}
3262
3263
3264
3265/* ----------------------------------------------------------------------
3266 *
3267 * In-Kernel API
3268 *
3269 * ---------------------------------------------------------------------- */
3270
3271/**
3272 * initialize a waitq object
3273 */
3274kern_return_t
3275waitq_init(struct waitq *waitq, int policy)
3276{
3277 assert(waitq != NULL);
3278
3279 /* only FIFO and LIFO for now */
3280 if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0) {
3281 return KERN_INVALID_ARGUMENT;
3282 }
3283
3284 waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
3285 waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
3286 waitq->waitq_prepost = 0;
3287 if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3288 waitq->waitq_type = WQT_TSPROXY;
3289 } else {
3290 waitq->waitq_type = WQT_QUEUE;
3291 }
3292 waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);
3293 waitq->waitq_eventmask = 0;
3294
3295 waitq->waitq_set_id = 0;
3296 waitq->waitq_prepost_id = 0;
3297
3298 waitq_lock_init(waitq);
3299 if (waitq_is_turnstile_queue(waitq)) {
3300 /* For turnstile, initialize it as a priority queue */
3301 priority_queue_init(&waitq->waitq_prio_queue,
3302 PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
3303 assert(waitq->waitq_fifo == 0);
3304 } else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3305 waitq->waitq_ts = TURNSTILE_NULL;
3306 waitq->waitq_tspriv = NULL;
3307 } else {
3308 queue_init(&waitq->waitq_queue);
3309 }
3310
3311 waitq->waitq_isvalid = 1;
3312 return KERN_SUCCESS;
3313}
3314
/*
 * Context handed to waitq_unlink_prepost_cb while culling the preposts
 * a waitq has made into a waitq set (see waitq_unlink_all_cb).
 */
struct wq_unlink_ctx {
	struct waitq *unlink_wq;        /* waitq whose preposts are being removed */
	struct waitq_set *unlink_wqset; /* set whose prepost list is scanned (locked) */
};

static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
    struct wq_prepost *wqp, struct waitq *waitq);
3322
3323/**
3324 * walk_waitq_links callback to invalidate 'link' parameter
3325 *
3326 * Conditions:
3327 * Called from walk_waitq_links.
3328 * Note that unlink other callbacks, this one make no assumptions about
3329 * the 'waitq' parameter, specifically it does not have to be locked or
3330 * even valid.
3331 */
3332static int
3333waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
3334 struct waitq_link *link)
3335{
3336 (void)waitq;
3337 (void)ctx;
3338 if (wql_type(link) == WQL_LINK && wql_is_valid(link)) {
3339 wql_invalidate(link);
3340 }
3341
3342 if (wql_type(link) == WQL_WQS) {
3343 struct waitq_set *wqset;
3344 struct wq_unlink_ctx ulctx;
3345
3346 /*
3347 * When destroying the waitq, take the time to clear out any
3348 * preposts it may have made. This could potentially save time
3349 * on the IPC send path which would otherwise have to iterate
3350 * over lots of dead port preposts.
3351 */
3352 if (waitq->waitq_prepost_id == 0) {
3353 goto out;
3354 }
3355
3356 wqset = link->wql_wqs.wql_set;
3357 assert(wqset != NULL);
3358 assert(!waitq_irq_safe(&wqset->wqset_q));
3359
3360 waitq_set_lock(wqset);
3361
3362 if (!waitq_set_is_valid(wqset)) {
3363 /* someone raced us to teardown */
3364 goto out_unlock;
3365 }
3366 if (!waitq_set_maybe_preposted(wqset)) {
3367 goto out_unlock;
3368 }
3369
3370 ulctx.unlink_wq = waitq;
3371 ulctx.unlink_wqset = wqset;
3372 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3373 waitq_unlink_prepost_cb);
3374out_unlock:
3375 waitq_set_unlock(wqset);
3376 }
3377
3378out:
3379 return WQ_ITERATE_CONTINUE;
3380}
3381
3382
3383/**
3384 * cleanup any link/prepost table resources associated with a waitq
3385 */
3386void
3387waitq_deinit(struct waitq *waitq)
3388{
3389 spl_t s;
3390
3391 assert(waitq);
3392 if (!waitq_is_valid(waitq)) {
3393 return;
3394 }
3395
3396 if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {
3397 return;
3398 }
3399
3400 if (waitq_irq_safe(waitq)) {
3401 s = splsched();
3402 }
3403 waitq_lock(waitq);
3404
3405 if (waitq_valid(waitq)) {
3406 waitq->waitq_isvalid = 0;
3407 if (!waitq_irq_safe(waitq)) {
3408 waitq_unlink_all_unlock(waitq);
3409 /* waitq unlocked and set links deallocated */
3410 goto out;
3411 }
3412 }
3413
3414 waitq_unlock(waitq);
3415 if (waitq_irq_safe(waitq)) {
3416 splx(s);
3417 }
3418
3419out:
3420#if MACH_ASSERT
3421 if (waitq_is_turnstile_queue(waitq)) {
3422 assert(priority_queue_empty(&waitq->waitq_prio_queue));
3423 } else if (waitq_is_turnstile_proxy(waitq)) {
3424 assert(waitq->waitq_ts == TURNSTILE_NULL);
3425 } else {
3426 assert(queue_empty(&waitq->waitq_queue));
3427 }
3428#else
3429 (void)0;
3430#endif // MACH_ASSERT
3431}
3432
/**
 * mark a waitq as invalid, without releasing any of its resources
 *
 * Conditions:
 *	'waitq' is locked by the caller and currently valid
 */
void
waitq_invalidate_locked(struct waitq *waitq)
{
	assert(waitq_held(waitq));
	assert(waitq_is_valid(waitq));
	waitq->waitq_isvalid = 0;
}
3440
3441/**
3442 * invalidate the given wq_prepost object
3443 *
3444 * Conditions:
3445 * Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3446 */
3447static int
3448wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3449 void __unused *ctx,
3450 struct wq_prepost *wqp,
3451 struct waitq __unused *waitq)
3452{
3453 if (wqp_type(wqp) == WQP_POST) {
3454 wq_prepost_invalidate(wqp);
3455 }
3456 return WQ_ITERATE_CONTINUE;
3457}
3458
3459
3460/**
3461 * allocate and initialize a waitq set object
3462 *
3463 * Conditions:
3464 * may block
3465 *
3466 * Returns:
3467 * allocated / initialized waitq_set object.
3468 * the waits_set object returned does not have
3469 * a waitq_link associated.
3470 *
3471 * NULL on failure
3472 */
3473struct waitq_set *
3474waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
3475{
3476 struct waitq_set *wqset;
3477
3478 wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3479 if (!wqset) {
3480 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3481 }
3482
3483 kern_return_t ret;
3484 ret = waitq_set_init(wqset, policy, NULL, prepost_hook);
3485 if (ret != KERN_SUCCESS) {
3486 zfree(waitq_set_zone, wqset);
3487 wqset = NULL;
3488 }
3489
3490 return wqset;
3491}
3492
3493/**
3494 * initialize a waitq set object
3495 *
3496 * if no 'reserved_link' object is passed
3497 * the waitq_link will be lazily allocated
3498 * on demand through waitq_set_lazy_init_link.
3499 */
3500kern_return_t
3501waitq_set_init(struct waitq_set *wqset,
3502 int policy, uint64_t *reserved_link,
3503 waitq_set_prepost_hook_t *prepost_hook)
3504{
3505 struct waitq_link *link;
3506 kern_return_t ret;
3507
3508 memset(wqset, 0, sizeof(*wqset));
3509
3510 ret = waitq_init(&wqset->wqset_q, policy);
3511 if (ret != KERN_SUCCESS) {
3512 return ret;
3513 }
3514
3515 wqset->wqset_q.waitq_type = WQT_SET;
3516 if (policy & SYNC_POLICY_PREPOST) {
3517 wqset->wqset_q.waitq_prepost = 1;
3518 wqset->wqset_prepost_id = 0;
3519 assert(prepost_hook == NULL);
3520 } else {
3521 wqset->wqset_q.waitq_prepost = 0;
3522 wqset->wqset_prepost_hook = prepost_hook;
3523 }
3524
3525 if (reserved_link && *reserved_link != 0) {
3526 link = wql_get_reserved(*reserved_link, WQL_WQS);
3527
3528 if (!link) {
3529 panic("Can't allocate link object for waitq set: %p", wqset);
3530 }
3531
3532 /* always consume the caller's reference */
3533 *reserved_link = 0;
3534
3535 link->wql_wqs.wql_set = wqset;
3536 wql_mkvalid(link);
3537
3538 wqset->wqset_id = link->wql_setid.id;
3539 wql_put_link(link);
3540 } else {
3541 /*
3542 * Lazy allocate the link only when an actual id is needed.
3543 */
3544 wqset->wqset_id = WQSET_NOT_LINKED;
3545 }
3546
3547 return KERN_SUCCESS;
3548}
3549
3550#if DEVELOPMENT || DEBUG
3551
/* sysctl support: report the number of elements in the global link table */
int
sysctl_helper_waitq_set_nelem(void)
{
	return ltable_nelem(&g_wqlinktable);
}
3557
3558#endif
3559
3560/**
3561 * initialize a waitq set link.
3562 *
3563 * Conditions:
3564 * may block
3565 * locks and unlocks the waiq set lock
3566 *
3567 */
3568void
3569waitq_set_lazy_init_link(struct waitq_set *wqset)
3570{
3571 struct waitq_link *link;
3572
3573 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3574
3575 waitq_set_lock(wqset);
3576 if (!waitq_set_should_lazy_init_link(wqset)) {
3577 waitq_set_unlock(wqset);
3578 return;
3579 }
3580
3581 assert(wqset->wqset_id == WQSET_NOT_LINKED);
3582 waitq_set_unlock(wqset);
3583
3584 link = wql_alloc_link(WQL_WQS);
3585 if (!link) {
3586 panic("Can't allocate link object for waitq set: %p", wqset);
3587 }
3588
3589 link->wql_wqs.wql_set = wqset;
3590
3591 waitq_set_lock(wqset);
3592 if (waitq_set_should_lazy_init_link(wqset)) {
3593 wql_mkvalid(link);
3594 wqset->wqset_id = link->wql_setid.id;
3595 }
3596
3597 assert(wqset->wqset_id != 0);
3598 assert(wqset->wqset_id != WQSET_NOT_LINKED);
3599
3600 waitq_set_unlock(wqset);
3601
3602 wql_put_link(link);
3603
3604 return;
3605}
3606
3607/**
3608 * checks if a waitq set needs to be linked.
3609 *
3610 */
3611boolean_t
3612waitq_set_should_lazy_init_link(struct waitq_set *wqset)
3613{
3614 if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {
3615 return FALSE;
3616 }
3617 return TRUE;
3618}
3619
3620/**
3621 * clear out / release any resources associated with a waitq set
3622 *
3623 * Conditions:
3624 * may block
3625 * Note:
3626 * This will render the waitq set invalid, and it must
3627 * be re-initialized with waitq_set_init before it can be used again
3628 */
3629void
3630waitq_set_deinit(struct waitq_set *wqset)
3631{
3632 struct waitq_link *link = NULL;
3633 uint64_t set_id, prepost_id;
3634
3635 if (!waitqs_is_set(wqset)) {
3636 panic("trying to de-initialize an invalid wqset @%p", wqset);
3637 }
3638
3639 assert(!waitq_irq_safe(&wqset->wqset_q));
3640
3641 waitq_set_lock(wqset);
3642
3643 if (waitq_set_has_prepost_hook(wqset)) {
3644 waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
3645 /*
3646 * If the wqset_prepost_hook value is non 0,
3647 * then another core is currently posting to this waitq set
3648 * and we need for it to finish what it's doing.
3649 */
3650 while (os_atomic_load(hook, relaxed) != 0) {
3651 waitq_set_unlock(wqset);
3652 delay(1);
3653 waitq_set_lock(wqset);
3654 }
3655 }
3656
3657 set_id = wqset->wqset_id;
3658
3659 if (waitqs_is_linked(wqset) || set_id == 0) {
3660 /* grab the set's link object */
3661 link = wql_get_link(set_id);
3662 if (link) {
3663 wql_invalidate(link);
3664 }
3665 /* someone raced us to deinit */
3666 if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
3667 if (link) {
3668 wql_put_link(link);
3669 }
3670 waitq_set_unlock(wqset);
3671 return;
3672 }
3673
3674 /* the link should be a valid link object at this point */
3675 assert(link != NULL && wql_type(link) == WQL_WQS);
3676
3677 wqset->wqset_id = 0;
3678 }
3679
3680 /*
3681 * This set may have a lot of preposts, or may have been a member of
3682 * many other sets. To minimize spinlock hold times, we clear out the
3683 * waitq set data structure under the lock-hold, but don't clear any
3684 * table objects. We keep handles to the prepost and set linkage
3685 * objects and free those outside the critical section.
3686 */
3687 prepost_id = 0;
3688 if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
3689 assert(link != NULL);
3690 prepost_id = wqset->wqset_prepost_id;
3691 }
3692 /* else { TODO: notify kqueue subsystem? } */
3693 wqset->wqset_prepost_id = 0;
3694
3695 wqset->wqset_q.waitq_fifo = 0;
3696 wqset->wqset_q.waitq_prepost = 0;
3697 wqset->wqset_q.waitq_isvalid = 0;
3698
3699 /* don't clear the 'waitq_irq' bit: it's used in locking! */
3700 wqset->wqset_q.waitq_eventmask = 0;
3701
3702 waitq_unlink_all_unlock(&wqset->wqset_q);
3703 /* wqset->wqset_q unlocked and set links deallocated */
3704
3705
3706 if (link) {
3707 /*
3708 * walk_waitq_links may race with us for access to the waitq set.
3709 * If walk_waitq_links has a reference to the set, then we should wait
3710 * until the link's refcount goes to 1 (our reference) before we exit
3711 * this function. That way we ensure that the waitq set memory will
3712 * remain valid even though it's been cleared out.
3713 */
3714 while (wql_refcnt(link) > 1) {
3715 delay(1);
3716 }
3717 wql_put_link(link);
3718 }
3719
3720 /* drop / unlink all the prepost table objects */
3721 /* JMM - can this happen before the delay? */
3722 if (prepost_id) {
3723 (void)wq_prepost_iterate(prepost_id, NULL,
3724 wqset_clear_prepost_chain_cb);
3725 }
3726}
3727
3728/**
3729 * de-initialize and free an allocated waitq set object
3730 *
3731 * Conditions:
3732 * may block
3733 */
3734kern_return_t
3735waitq_set_free(struct waitq_set *wqset)
3736{
3737 waitq_set_deinit(wqset);
3738
3739 memset(wqset, 0, sizeof(*wqset));
3740 zfree(waitq_set_zone, wqset);
3741
3742 return KERN_SUCCESS;
3743}
3744
3745#if DEVELOPMENT || DEBUG
3746#if CONFIG_WAITQ_DEBUG
3747/**
3748 * return the set ID of 'wqset'
3749 */
3750uint64_t
3751wqset_id(struct waitq_set *wqset)
3752{
3753 if (!wqset) {
3754 return 0;
3755 }
3756
3757 assert(waitqs_is_set(wqset));
3758
3759 if (!waitqs_is_linked(wqset)) {
3760 waitq_set_lazy_init_link(wqset);
3761 }
3762
3763 return wqset->wqset_id;
3764}
3765
3766/**
3767 * returns a pointer to the waitq object embedded in 'wqset'
3768 */
3769struct waitq *
3770wqset_waitq(struct waitq_set *wqset)
3771{
3772 if (!wqset) {
3773 return NULL;
3774 }
3775
3776 assert(waitqs_is_set(wqset));
3777
3778 return &wqset->wqset_q;
3779}
3780#endif /* CONFIG_WAITQ_DEBUG */
3781#endif /* DEVELOPMENT || DEBUG */
3782
3783
3784/**
3785 * clear all preposts originating from 'waitq'
3786 *
3787 * Conditions:
3788 * 'waitq' locked
3789 * may (rarely) spin waiting for another on-core thread to
3790 * release the last reference to the waitq's prepost link object
3791 *
3792 * NOTE:
3793 * If this function needs to spin, it will drop the waitq lock!
3794 * The return value of the function indicates whether or not this
3795 * happened: 1 == lock was dropped, 0 == lock held
3796 */
3797int
3798waitq_clear_prepost_locked(struct waitq *waitq)
3799{
3800 struct wq_prepost *wqp;
3801 int dropped_lock = 0;
3802
3803 assert(!waitq_irq_safe(waitq));
3804
3805 if (waitq->waitq_prepost_id == 0) {
3806 return 0;
3807 }
3808
3809 wqp = wq_prepost_get(waitq->waitq_prepost_id);
3810 waitq->waitq_prepost_id = 0;
3811 if (wqp) {
3812 uint64_t wqp_id = wqp->wqp_prepostid.id;
3813 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
3814 wqp->wqp_prepostid.id, wqp_refcnt(wqp));
3815 wq_prepost_invalidate(wqp);
3816 while (wqp_refcnt(wqp) > 1) {
3817 /*
3818 * Some other thread must have raced us to grab a link
3819 * object reference before we invalidated it. This
3820 * means that they are probably trying to access the
3821 * waitq to which the prepost object points. We need
3822 * to wait here until the other thread drops their
3823 * reference. We know that no one else can get a
3824 * reference (the object has been invalidated), and
3825 * that prepost references are short-lived (dropped on
3826 * a call to wq_prepost_put). We also know that no one
3827 * blocks while holding a reference therefore the
3828 * other reference holder must be on-core. We'll just
3829 * sit and wait for the other reference to be dropped.
3830 */
3831 disable_preemption();
3832
3833 waitq_unlock(waitq);
3834 dropped_lock = 1;
3835 /*
3836 * don't yield here, just spin and assume the other
3837 * consumer is already on core...
3838 */
3839 delay(1);
3840
3841 waitq_lock(waitq);
3842
3843 enable_preemption();
3844 }
3845 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id) {
3846 wq_prepost_put(wqp);
3847 }
3848 }
3849
3850 return dropped_lock;
3851}
3852
3853/**
3854 * clear all preposts originating from 'waitq'
3855 *
3856 * Conditions:
3857 * 'waitq' is not locked
3858 * may disable and re-enable interrupts
3859 */
3860void
3861waitq_clear_prepost(struct waitq *waitq)
3862{
3863 assert(waitq_valid(waitq));
3864 assert(!waitq_irq_safe(waitq));
3865
3866 waitq_lock(waitq);
3867 /* it doesn't matter to us if the lock is dropped here */
3868 (void)waitq_clear_prepost_locked(waitq);
3869 waitq_unlock(waitq);
3870}
3871
3872/**
3873 * return a the waitq's prepost object ID (allocate if necessary)
3874 *
3875 * Conditions:
3876 * 'waitq' is unlocked
3877 */
3878uint64_t
3879waitq_get_prepost_id(struct waitq *waitq)
3880{
3881 struct wq_prepost *wqp;
3882 uint64_t wqp_id = 0;
3883
3884 if (!waitq_valid(waitq)) {
3885 return 0;
3886 }
3887
3888 assert(!waitq_irq_safe(waitq));
3889
3890 waitq_lock(waitq);
3891
3892 if (!waitq_valid(waitq)) {
3893 goto out_unlock;
3894 }
3895
3896 if (waitq->waitq_prepost_id) {
3897 wqp_id = waitq->waitq_prepost_id;
3898 goto out_unlock;
3899 }
3900
3901 /* don't hold a spinlock while allocating a prepost object */
3902 waitq_unlock(waitq);
3903
3904 wqp = wq_prepost_alloc(WQP_WQ, 1);
3905 if (!wqp) {
3906 return 0;
3907 }
3908
3909 /* re-acquire the waitq lock */
3910 waitq_lock(waitq);
3911
3912 if (!waitq_valid(waitq)) {
3913 wq_prepost_put(wqp);
3914 wqp_id = 0;
3915 goto out_unlock;
3916 }
3917
3918 if (waitq->waitq_prepost_id) {
3919 /* we were beat by someone else */
3920 wq_prepost_put(wqp);
3921 wqp_id = waitq->waitq_prepost_id;
3922 goto out_unlock;
3923 }
3924
3925 wqp->wqp_wq.wqp_wq_ptr = waitq;
3926
3927 wqp_set_valid(wqp);
3928 wqp_id = wqp->wqp_prepostid.id;
3929 waitq->waitq_prepost_id = wqp_id;
3930
3931 wq_prepost_put(wqp);
3932
3933out_unlock:
3934 waitq_unlock(waitq);
3935
3936 return wqp_id;
3937}
3938
3939
3940static int
3941waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
3942{
3943 uint64_t setid = *(uint64_t *)ctx;
3944 int wqltype = wql_type(link);
3945 (void)waitq;
3946 if (wqltype == WQL_WQS && link->wql_setid.id == setid) {
3947 wqdbg_v(" waitq already in set 0x%llx", setid);
3948 return WQ_ITERATE_FOUND;
3949 } else if (wqltype == WQL_LINK) {
3950 /*
3951 * break out early if we see a link that points to the setid
3952 * in question. This saves us a step in the
3953 * iteration/recursion
3954 */
3955 wqdbg_v(" waitq already in set 0x%llx (WQL_LINK)", setid);
3956 if (link->wql_link.left_setid == setid ||
3957 link->wql_link.right_setid == setid) {
3958 return WQ_ITERATE_FOUND;
3959 }
3960 }
3961
3962 return WQ_ITERATE_CONTINUE;
3963}
3964
3965/**
3966 * determine if 'waitq' is a member of 'wqset'
3967 *
3968 * Conditions:
3969 * neither 'waitq' nor 'wqset' is not locked
3970 * may disable and re-enable interrupts while locking 'waitq'
3971 */
3972boolean_t
3973waitq_member(struct waitq *waitq, struct waitq_set *wqset)
3974{
3975 kern_return_t kr = WQ_ITERATE_SUCCESS;
3976 uint64_t setid;
3977
3978 if (!waitq_valid(waitq)) {
3979 panic("Invalid waitq: %p", waitq);
3980 }
3981 assert(!waitq_irq_safe(waitq));
3982
3983 if (!waitqs_is_set(wqset)) {
3984 return FALSE;
3985 }
3986
3987 waitq_lock(waitq);
3988
3989 if (!waitqs_is_linked(wqset)) {
3990 goto out_unlock;
3991 }
3992
3993 setid = wqset->wqset_id;
3994
3995 /* fast path: most waitqs are members of only 1 set */
3996 if (waitq->waitq_set_id == setid) {
3997 waitq_unlock(waitq);
3998 return TRUE;
3999 }
4000
4001 /* walk the link table and look for the Set ID of wqset */
4002 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4003 WQL_ALL, (void *)&setid, waitq_inset_cb);
4004
4005out_unlock:
4006 waitq_unlock(waitq);
4007 return kr == WQ_ITERATE_FOUND;
4008}
4009
4010/**
4011 * Returns true is the given waitq is a member of at least 1 set
4012 */
4013boolean_t
4014waitq_in_set(struct waitq *waitq)
4015{
4016 struct waitq_link *link;
4017 boolean_t inset = FALSE;
4018
4019 if (waitq_irq_safe(waitq)) {
4020 return FALSE;
4021 }
4022
4023 waitq_lock(waitq);
4024
4025 if (!waitq->waitq_set_id) {
4026 goto out_unlock;
4027 }
4028
4029 link = wql_get_link(waitq->waitq_set_id);
4030 if (link) {
4031 /* if we get here, the waitq is in _at_least_one_ set */
4032 inset = TRUE;
4033 wql_put_link(link);
4034 } else {
4035 /* we can just optimize this for next time */
4036 waitq->waitq_set_id = 0;
4037 }
4038
4039out_unlock:
4040 waitq_unlock(waitq);
4041 return inset;
4042}
4043
4044
4045/**
4046 * pre-allocate a waitq link structure from the link table
4047 *
4048 * Conditions:
4049 * 'waitq' is not locked
4050 * may (rarely) block if link table needs to grow
4051 */
4052uint64_t
4053waitq_link_reserve(struct waitq *waitq)
4054{
4055 struct waitq_link *link;
4056 uint64_t reserved_id = 0;
4057
4058 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
4059
4060 /*
4061 * We've asserted that the caller can block, so we enforce a
4062 * minimum-free table element policy here.
4063 */
4064 wql_ensure_free_space();
4065
4066 (void)waitq;
4067 link = wql_alloc_link(LT_RESERVED);
4068 if (!link) {
4069 return 0;
4070 }
4071
4072 reserved_id = link->wql_setid.id;
4073
4074 return reserved_id;
4075}
4076
4077/**
4078 * release a pre-allocated waitq link structure
4079 */
4080void
4081waitq_link_release(uint64_t id)
4082{
4083 struct waitq_link *link;
4084
4085 if (id == 0) {
4086 return;
4087 }
4088
4089 link = wql_get_reserved(id, WQL_LINK);
4090 if (!link) {
4091 return;
4092 }
4093
4094 /*
4095 * if we successfully got a link object, then we know
4096 * it's not been marked valid, and can be released with
4097 * a standard wql_put_link() which should free the element.
4098 */
4099 wql_put_link(link);
4100#if CONFIG_LTABLE_STATS
4101 g_wqlinktable.nreserved_releases += 1;
4102#endif
4103}
4104
4105/**
4106 * link 'waitq' to the set identified by 'setid' using the 'link' structure
4107 *
4108 * Conditions:
4109 * 'waitq' is locked
4110 * caller should have a reference to the 'link' object
4111 */
4112static kern_return_t
4113waitq_link_internal(struct waitq *waitq,
4114 uint64_t setid, struct waitq_link *link)
4115{
4116 struct waitq_link *qlink;
4117 kern_return_t kr;
4118
4119 assert(waitq_held(waitq));
4120 assert(setid != 0);
4121 assert(setid != WQSET_NOT_LINKED);
4122
4123 /*
4124 * If the waitq_set_id field is empty, then this waitq is not
4125 * a member of any other set. All we have to do is update the
4126 * field.
4127 */
4128 if (!waitq->waitq_set_id) {
4129 waitq->waitq_set_id = setid;
4130 return KERN_SUCCESS;
4131 }
4132
4133 qlink = wql_get_link(waitq->waitq_set_id);
4134 if (!qlink) {
4135 /*
4136 * The set to which this wait queue belonged has been
4137 * destroyed / invalidated. We can re-use the waitq field.
4138 */
4139 waitq->waitq_set_id = setid;
4140 return KERN_SUCCESS;
4141 }
4142 wql_put_link(qlink);
4143
4144 /*
4145 * Check to see if it's already a member of the set.
4146 *
4147 * TODO: check for cycles!
4148 */
4149 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4150 WQL_ALL, (void *)&setid, waitq_inset_cb);
4151 if (kr == WQ_ITERATE_FOUND) {
4152 return KERN_ALREADY_IN_SET;
4153 }
4154
4155 /*
4156 * This wait queue is a member of at least one set already,
4157 * and _not_ a member of the given set. Use our previously
4158 * allocated link object, and hook it up to the wait queue.
4159 * Note that it's possible that one or more of the wait queue sets to
4160 * which the wait queue belongs was invalidated before we allocated
4161 * this link object. That's OK because the next time we use that
4162 * object we'll just ignore it.
4163 */
4164 link->wql_link.left_setid = setid;
4165 link->wql_link.right_setid = waitq->waitq_set_id;
4166 wql_mkvalid(link);
4167
4168 waitq->waitq_set_id = link->wql_setid.id;
4169
4170 return KERN_SUCCESS;
4171}
4172
4173/**
4174 * link 'waitq' to 'wqset'
4175 *
4176 * Conditions:
4177 * if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
4178 * Otherwise, 'waitq' must be locked.
4179 *
4180 * may (rarely) block on link table allocation if the table has to grow,
4181 * and no 'reserved_link' object is passed.
4182 *
4183 * may block and acquire wqset lock if the wqset passed has no link.
4184 *
4185 * Notes:
4186 * The caller can guarantee that this function will never block by
4187 * - pre-allocating a link table object and passing its ID in 'reserved_link'
4188 * - and pre-allocating the waitq set link calling waitq_set_lazy_init_link.
4189 * It is not possible to provide a reserved_link without having also linked
4190 * the wqset.
4191 */
4192kern_return_t
4193waitq_link(struct waitq *waitq, struct waitq_set *wqset,
4194 waitq_lock_state_t lock_state, uint64_t *reserved_link)
4195{
4196 kern_return_t kr;
4197 struct waitq_link *link;
4198 int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
4199
4200 if (!waitq_valid(waitq) || waitq_irq_safe(waitq)) {
4201 panic("Invalid waitq: %p", waitq);
4202 }
4203
4204 if (!waitqs_is_set(wqset)) {
4205 return KERN_INVALID_ARGUMENT;
4206 }
4207
4208 if (!reserved_link || *reserved_link == 0) {
4209 if (!waitqs_is_linked(wqset)) {
4210 waitq_set_lazy_init_link(wqset);
4211 }
4212 }
4213
4214 wqdbg_v("Link waitq %p to wqset 0x%llx",
4215 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4216
4217 /*
4218 * We _might_ need a new link object here, so we'll grab outside
4219 * the lock because the alloc call _might_ block.
4220 *
4221 * If the caller reserved a link beforehand, then wql_get_link
4222 * is guaranteed not to block because the caller holds an extra
4223 * reference to the link which, in turn, hold a reference to the
4224 * link table.
4225 */
4226 if (reserved_link && *reserved_link != 0) {
4227 link = wql_get_reserved(*reserved_link, WQL_LINK);
4228 /* always consume the caller's reference */
4229 *reserved_link = 0;
4230 } else {
4231 link = wql_alloc_link(WQL_LINK);
4232 }
4233 if (!link) {
4234 return KERN_NO_SPACE;
4235 }
4236
4237 if (should_lock) {
4238 waitq_lock(waitq);
4239 }
4240
4241 kr = waitq_link_internal(waitq, wqset->wqset_id, link);
4242
4243 if (should_lock) {
4244 waitq_unlock(waitq);
4245 }
4246
4247 wql_put_link(link);
4248
4249 return kr;
4250}
4251
4252/**
4253 * helper: unlink 'waitq' from waitq set identified by 'setid'
4254 * this function also prunes invalid objects from the tree
4255 *
4256 * Conditions:
4257 * MUST be called from walk_waitq_links link table walk
4258 * 'waitq' is locked
4259 *
4260 * Notes:
4261 * This is a helper function which compresses the link table by culling
4262 * unused or unnecessary links. See comments below for different
4263 * scenarios.
4264 */
4265static inline int
4266waitq_maybe_remove_link(struct waitq *waitq,
4267 uint64_t setid,
4268 struct waitq_link *parent,
4269 struct waitq_link *left,
4270 struct waitq_link *right)
4271{
4272 uint64_t *wq_setid = &waitq->waitq_set_id;
4273
4274 /*
4275 * There are two scenarios:
4276 *
4277 * Scenario 1:
4278 * --------------------------------------------------------------------
4279 * waitq->waitq_set_id == parent
4280 *
4281 * parent(LINK)
4282 * / \
4283 * / \
4284 * / \
4285 * L(LINK/WQS_l) R(LINK/WQS_r)
4286 *
4287 * In this scenario, we assert that the original waitq points to the
4288 * parent link we were passed in. If WQS_l (or WQS_r) is the waitq
4289 * set we're looking for, we can set the corresponding parent
4290 * link id (left or right) to 0. To compress the tree, we can reset the
4291 * waitq_set_id of the original waitq to point to the side of the
4292 * parent that is still valid. We then discard the parent link object.
4293 */
4294 if (*wq_setid == parent->wql_setid.id) {
4295 if (!left && !right) {
4296 /* completely invalid children */
4297 wql_invalidate(parent);
4298 wqdbg_v("S1, L+R");
4299 *wq_setid = 0;
4300 return WQ_ITERATE_INVALID;
4301 } else if (!left || left->wql_setid.id == setid) {
4302 /*
4303 * left side matches we know it points either to the
4304 * WQS we're unlinking, or to an invalid object:
4305 * no need to invalidate it
4306 */
4307 *wq_setid = right ? right->wql_setid.id : 0;
4308 wql_invalidate(parent);
4309 wqdbg_v("S1, L");
4310 return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4311 } else if (!right || right->wql_setid.id == setid) {
4312 /*
4313 * if right side matches we know it points either to the
4314 * WQS we're unlinking, or to an invalid object:
4315 * no need to invalidate it
4316 */
4317 *wq_setid = left ? left->wql_setid.id : 0;
4318 wql_invalidate(parent);
4319 wqdbg_v("S1, R");
4320 return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4321 }
4322 }
4323
4324 /*
4325 * the tree walk starts at the top-of-tree and moves down,
4326 * so these are safe asserts.
4327 */
4328 assert(left || right); /* one of them has to be valid at this point */
4329
4330 /*
4331 * Scenario 2:
4332 * --------------------------------------------------------------------
4333 * waitq->waitq_set_id == ... (OR parent)
4334 *
4335 * ...
4336 * |
4337 * parent
4338 * / \
4339 * / \
4340 * L(LINK) R(LINK)
4341 * /\ /\
4342 * / \ / \
4343 * / \ Rl(*) Rr(*)
4344 * Ll(WQS) Lr(WQS)
4345 *
4346 * In this scenario, a leaf node of either the left or right side
4347 * could be the wait queue set we're looking to unlink. We also handle
4348 * the case where one of these links is invalid. If a leaf node is
4349 * invalid or it's the set we're looking for, we can safely remove the
4350 * middle link (left or right) and point the parent link directly to
4351 * the remaining leaf node.
4352 */
4353 if (left && wql_type(left) == WQL_LINK) {
4354 uint64_t Ll, Lr;
4355 struct waitq_link *linkLl, *linkLr;
4356 assert(left->wql_setid.id != setid);
4357 Ll = left->wql_link.left_setid;
4358 Lr = left->wql_link.right_setid;
4359 linkLl = wql_get_link(Ll);
4360 linkLr = wql_get_link(Lr);
4361 if (!linkLl && !linkLr) {
4362 /*
4363 * The left object points to two invalid objects!
4364 * We can invalidate the left w/o touching the parent.
4365 */
4366 wql_invalidate(left);
4367 wqdbg_v("S2, Ll+Lr");
4368 return WQ_ITERATE_INVALID;
4369 } else if (!linkLl || Ll == setid) {
4370 /* Ll is invalid and/or the wait queue set we're looking for */
4371 parent->wql_link.left_setid = Lr;
4372 wql_invalidate(left);
4373 wql_put_link(linkLl);
4374 wql_put_link(linkLr);
4375 wqdbg_v("S2, Ll");
4376 return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4377 } else if (!linkLr || Lr == setid) {
4378 /* Lr is invalid and/or the wait queue set we're looking for */
4379 parent->wql_link.left_setid = Ll;
4380 wql_invalidate(left);
4381 wql_put_link(linkLr);
4382 wql_put_link(linkLl);
4383 wqdbg_v("S2, Lr");
4384 return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4385 }
4386 wql_put_link(linkLl);
4387 wql_put_link(linkLr);
4388 }
4389
4390 if (right && wql_type(right) == WQL_LINK) {
4391 uint64_t Rl, Rr;
4392 struct waitq_link *linkRl, *linkRr;
4393 assert(right->wql_setid.id != setid);
4394 Rl = right->wql_link.left_setid;
4395 Rr = right->wql_link.right_setid;
4396 linkRl = wql_get_link(Rl);
4397 linkRr = wql_get_link(Rr);
4398 if (!linkRl && !linkRr) {
4399 /*
4400 * The right object points to two invalid objects!
4401 * We can invalidate the right w/o touching the parent.
4402 */
4403 wql_invalidate(right);
4404 wqdbg_v("S2, Rl+Rr");
4405 return WQ_ITERATE_INVALID;
4406 } else if (!linkRl || Rl == setid) {
4407 /* Rl is invalid and/or the wait queue set we're looking for */
4408 parent->wql_link.right_setid = Rr;
4409 wql_invalidate(right);
4410 wql_put_link(linkRl);
4411 wql_put_link(linkRr);
4412 wqdbg_v("S2, Rl");
4413 return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4414 } else if (!linkRr || Rr == setid) {
4415 /* Rr is invalid and/or the wait queue set we're looking for */
4416 parent->wql_link.right_setid = Rl;
4417 wql_invalidate(right);
4418 wql_put_link(linkRl);
4419 wql_put_link(linkRr);
4420 wqdbg_v("S2, Rr");
4421 return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4422 }
4423 wql_put_link(linkRl);
4424 wql_put_link(linkRr);
4425 }
4426
4427 return WQ_ITERATE_CONTINUE;
4428}
4429
4430/**
4431 * link table walk callback that unlinks 'waitq' from 'ctx->setid'
4432 *
4433 * Conditions:
4434 * called from walk_waitq_links
4435 * 'waitq' is locked
4436 *
4437 * Notes:
4438 * uses waitq_maybe_remove_link() to compress the linktable and
4439 * perform the actual unlinking
4440 */
4441static int
4442waitq_unlink_cb(struct waitq *waitq, void *ctx,
4443 struct waitq_link *link)
4444{
4445 uint64_t setid = *((uint64_t *)ctx);
4446 struct waitq_link *right, *left;
4447 int ret = 0;
4448
4449 if (wql_type(link) != WQL_LINK) {
4450 return WQ_ITERATE_CONTINUE;
4451 }
4452
4453 do {
4454 left = wql_get_link(link->wql_link.left_setid);
4455 right = wql_get_link(link->wql_link.right_setid);
4456
4457 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
4458
4459 wql_put_link(left);
4460 wql_put_link(right);
4461
4462 if (!wql_is_valid(link)) {
4463 return WQ_ITERATE_INVALID;
4464 }
4465 /* A ret value of UNLINKED will break us out of table walk */
4466 } while (ret == WQ_ITERATE_INVALID);
4467
4468 return ret;
4469}
4470
4471
4472/**
4473 * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
4474 *
4475 * Conditions:
4476 * Called from wq_prepost_foreach_locked OR wq_prepost_iterate
4477 * 'wqset' may be NULL
4478 * (ctx)->unlink_wqset is locked
4479 */
4480static int
4481waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
4482 struct wq_prepost *wqp, struct waitq *waitq)
4483{
4484 struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
4485
4486 if (waitq != ulctx->unlink_wq) {
4487 return WQ_ITERATE_CONTINUE;
4488 }
4489
4490 if (wqp_type(wqp) == WQP_WQ &&
4491 wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
4492 /* this is the only prepost on this wait queue set */
4493 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
4494 ulctx->unlink_wqset->wqset_prepost_id = 0;
4495 return WQ_ITERATE_BREAK;
4496 }
4497
4498 assert(wqp_type(wqp) == WQP_POST);
4499
4500 /*
4501 * The prepost object 'wqp' points to a waitq which should no longer
4502 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
4503 * object from the list and break out of the iteration. Using the
4504 * context object in this way allows this same callback function to be
4505 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
4506 */
4507 wq_prepost_remove(ulctx->unlink_wqset, wqp);
4508 return WQ_ITERATE_BREAK;
4509}
4510
/**
 * unlink 'waitq' from 'wqset'
 *
 * Conditions:
 *	'waitq' is locked
 *	'wqset' is _not_ locked
 *	may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
 *	(see waitq_clear_prepost_locked)
 *
 * Returns:
 *	KERN_SUCCESS if 'waitq' was a member of 'wqset' and was unlinked
 *	KERN_NOT_IN_SET otherwise
 */
static kern_return_t
waitq_unlink_locked(struct waitq *waitq,
    struct waitq_set *wqset)
{
	uint64_t setid;
	kern_return_t kr;

	assert(!waitq_irq_safe(waitq));

	if (waitq->waitq_set_id == 0) {
		/*
		 * TODO:
		 * it doesn't belong to anyone, and it has a prepost object?
		 * This is an artifact of not cleaning up after kqueues when
		 * they prepost into select sets...
		 */
		if (waitq->waitq_prepost_id != 0) {
			(void)waitq_clear_prepost_locked(waitq);
		}
		return KERN_NOT_IN_SET;
	}

	if (!waitqs_is_linked(wqset)) {
		/*
		 * No link has been allocated for the wqset,
		 * so no waitq could have been linked to it.
		 */
		return KERN_NOT_IN_SET;
	}

	setid = wqset->wqset_id;

	if (waitq->waitq_set_id == setid) {
		/* direct, single-set membership: just clear the ID */
		waitq->waitq_set_id = 0;
		/*
		 * This was the only set to which the waitq belonged: we can
		 * safely release the waitq's prepost object. It doesn't
		 * matter if this function drops and re-acquires the lock
		 * because we're not manipulating waitq state any more.
		 */
		(void)waitq_clear_prepost_locked(waitq);
		return KERN_SUCCESS;
	}

	/*
	 * The waitq was a member of more than 1 set, so we need to
	 * handle potentially compressing the link table, and
	 * adjusting the waitq->waitq_set_id value.
	 *
	 * Note: we can't free the waitq's associated prepost object (if any)
	 *       because it may be in use by the one or more _other_ sets to
	 *       which this queue belongs.
	 *
	 * Note: This function only handles a single level of the queue linkage.
	 *       Removing a waitq from a set to which it does not directly
	 *       belong is undefined. For example, if a waitq belonged to set
	 *       A, and set A belonged to set B. You can't remove the waitq
	 *       from set B.
	 */
	/* NB: the walk returns a WQ_ITERATE_* code into a kern_return_t */
	kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
	    WQL_LINK, (void *)&setid, waitq_unlink_cb);

	if (kr == WQ_ITERATE_UNLINKED) {
		struct wq_unlink_ctx ulctx;

		kr = KERN_SUCCESS; /* found it and dis-associated it */

		/* don't look for preposts if it's not prepost-enabled */
		if (!wqset->wqset_q.waitq_prepost) {
			goto out;
		}

		assert(!waitq_irq_safe(&wqset->wqset_q));

		waitq_set_lock(wqset);
		/*
		 * clear out any prepost from waitq into wqset
		 * TODO: this could be more efficient than a linear search of
		 * the waitq set's prepost list.
		 */
		ulctx.unlink_wq = waitq;
		ulctx.unlink_wqset = wqset;
		(void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
		    waitq_unlink_prepost_cb);
		waitq_set_unlock(wqset);
	} else {
		kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
	}

out:
	return kr;
}
4612
4613/**
4614 * unlink 'waitq' from 'wqset'
4615 *
4616 * Conditions:
4617 * neither 'waitq' nor 'wqset' is locked
4618 * may disable and re-enable interrupts
4619 * may (rarely) spin in prepost clear
4620 * (see waitq_clear_prepost_locked)
4621 */
4622kern_return_t
4623waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4624{
4625 kern_return_t kr = KERN_SUCCESS;
4626
4627 assert(waitqs_is_set(wqset));
4628
4629 /*
4630 * we allow the waitq to be invalid because the caller may be trying
4631 * to clear out old/dirty state
4632 */
4633 if (!waitq_valid(waitq)) {
4634 return KERN_INVALID_ARGUMENT;
4635 }
4636
4637 wqdbg_v("unlink waitq %p from set 0x%llx",
4638 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4639
4640 assert(!waitq_irq_safe(waitq));
4641
4642 waitq_lock(waitq);
4643
4644 kr = waitq_unlink_locked(waitq, wqset);
4645
4646 waitq_unlock(waitq);
4647 return kr;
4648}
4649
/**
 * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
 *
 * Conditions:
 *	'wqset' is unlocked
 *	wqp_id may be valid or invalid
 *
 * Notes:
 *	silently does nothing if 'wqp_id' is no longer a valid prepost
 *	object, or if the waitq it refers to has been torn down
 */
void
waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
{
	struct wq_prepost *wqp;

	/* preemption stays disabled while we hold a prepost table reference */
	disable_preemption();
	wqp = wq_prepost_get(wqp_id);
	if (wqp) {
		struct waitq *wq;

		wq = wqp->wqp_wq.wqp_wq_ptr;

		/*
		 * lock the waitq, then release our prepost ID reference, then
		 * unlink the waitq from the wqset: this ensures that we don't
		 * hold a prepost ID reference during the unlink, but we also
		 * complete the unlink operation atomically to avoid a race
		 * with waitq_unlink[_all].
		 */
		assert(!waitq_irq_safe(wq));

		waitq_lock(wq);
		wq_prepost_put(wqp);

		if (!waitq_valid(wq)) {
			/* someone already tore down this waitq! */
			waitq_unlock(wq);
			enable_preemption();
			return;
		}

		/* this _may_ drop the wq lock, but that's OK */
		waitq_unlink_locked(wq, wqset);

		waitq_unlock(wq);
	}
	enable_preemption();
	return;
}
4696
4697
4698/**
4699 * reference and lock a waitq by its prepost ID
4700 *
4701 * Conditions:
4702 * wqp_id may be valid or invalid
4703 *
4704 * Returns:
4705 * a locked waitq if wqp_id was valid
4706 * NULL on failure
4707 */
4708struct waitq *
4709waitq_lock_by_prepost_id(uint64_t wqp_id)
4710{
4711 struct waitq *wq = NULL;
4712 struct wq_prepost *wqp;
4713
4714 disable_preemption();
4715 wqp = wq_prepost_get(wqp_id);
4716 if (wqp) {
4717 wq = wqp->wqp_wq.wqp_wq_ptr;
4718
4719 assert(!waitq_irq_safe(wq));
4720
4721 waitq_lock(wq);
4722 wq_prepost_put(wqp);
4723
4724 if (!waitq_valid(wq)) {
4725 /* someone already tore down this waitq! */
4726 waitq_unlock(wq);
4727 enable_preemption();
4728 return NULL;
4729 }
4730 }
4731 enable_preemption();
4732 return wq;
4733}
4734
4735
/**
 * unlink 'waitq' from all sets to which it belongs
 *
 * Conditions:
 *	'waitq' is locked on entry
 *	returns with waitq lock dropped
 *
 * Notes:
 *	may (rarely) spin (see waitq_clear_prepost_locked)
 */
kern_return_t
waitq_unlink_all_unlock(struct waitq *waitq)
{
	uint64_t old_set_id = 0;
	wqdbg_v("unlink waitq %p from all sets",
	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
	assert(!waitq_irq_safe(waitq));

	/* it's not a member of any sets */
	if (waitq->waitq_set_id == 0) {
		waitq_unlock(waitq);
		return KERN_SUCCESS;
	}

	/* snapshot and clear the set linkage while the lock is still held */
	old_set_id = waitq->waitq_set_id;
	waitq->waitq_set_id = 0;

	/*
	 * invalidate the prepost entry for this waitq.
	 * This may drop and re-acquire the waitq lock, but that's OK because
	 * if it was added to another set and preposted to that set in the
	 * time we drop the lock, the state will remain consistent.
	 */
	(void)waitq_clear_prepost_locked(waitq);

	waitq_unlock(waitq);

	if (old_set_id) {
		/*
		 * Walk the link table and invalidate each LINK object that
		 * used to connect this waitq to one or more sets: this works
		 * because WQL_LINK objects are private to each wait queue
		 * (which is also why the walk is safe after the unlock above)
		 */
		(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,
		    WQL_LINK, NULL, waitq_unlink_all_cb);
	}

	return KERN_SUCCESS;
}
4785
4786/**
4787 * unlink 'waitq' from all sets to which it belongs
4788 *
4789 * Conditions:
4790 * 'waitq' is not locked
4791 * may disable and re-enable interrupts
4792 * may (rarely) spin
4793 * (see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4794 */
4795kern_return_t
4796waitq_unlink_all(struct waitq *waitq)
4797{
4798 kern_return_t kr = KERN_SUCCESS;
4799
4800 if (!waitq_valid(waitq)) {
4801 panic("Invalid waitq: %p", waitq);
4802 }
4803
4804 assert(!waitq_irq_safe(waitq));
4805 waitq_lock(waitq);
4806 if (!waitq_valid(waitq)) {
4807 waitq_unlock(waitq);
4808 return KERN_SUCCESS;
4809 }
4810
4811 kr = waitq_unlink_all_unlock(waitq);
4812 /* waitq unlocked and set links deallocated */
4813
4814 return kr;
4815}
4816
4817
/**
 * unlink all waitqs from 'wqset'
 *
 * Conditions:
 *	'wqset' is locked on entry
 *	'wqset' is unlocked on exit and spl is restored
 *
 * Note:
 *	may (rarely) spin/block (see waitq_clear_prepost_locked)
 */
kern_return_t
waitq_set_unlink_all_unlock(struct waitq_set *wqset)
{
	struct waitq_link *link;
	uint64_t prepost_id;

	wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);

	/*
	 * This operation does not require interaction with any of the set's
	 * constituent wait queues. All we have to do is invalidate the SetID
	 */

	if (waitqs_is_linked(wqset)) {
		/* invalidate and re-alloc the link object first */
		link = wql_get_link(wqset->wqset_id);

		/* we may have raced with a waitq_set_deinit: handle this */
		if (!link) {
			waitq_set_unlock(wqset);
			return KERN_SUCCESS;
		}

		wql_invalidate(link);

		/* re-alloc the object to get a new generation ID */
		wql_realloc_link(link, WQL_WQS);
		link->wql_wqs.wql_set = wqset;

		/*
		 * adopting the re-allocated ID means any waitq still holding
		 * the old setid no longer matches this set
		 */
		wqset->wqset_id = link->wql_setid.id;
		wql_mkvalid(link);
		wql_put_link(link);
	}

	/* clear any preposts attached to this set */
	prepost_id = 0;
	if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
		prepost_id = wqset->wqset_prepost_id;
	}
	/* else { TODO: notify kqueue subsystem? } */
	wqset->wqset_prepost_id = 0;

	/*
	 * clear set linkage and prepost object associated with this set:
	 * waitq sets may prepost to other sets if, for example, they are
	 * associated with a kqueue which is in a select set.
	 *
	 * This releases all the set link objects
	 * (links to other sets to which this set was previously added)
	 */
	waitq_unlink_all_unlock(&wqset->wqset_q);
	/* wqset->wqset_q unlocked */

	/* drop / unlink all the prepost table objects */
	if (prepost_id) {
		(void)wq_prepost_iterate(prepost_id, NULL,
		    wqset_clear_prepost_chain_cb);
	}

	return KERN_SUCCESS;
}
4889
4890/**
4891 * unlink all waitqs from 'wqset'
4892 *
4893 * Conditions:
4894 * 'wqset' is not locked
4895 * may (rarely) spin/block (see waitq_clear_prepost_locked)
4896 */
4897kern_return_t
4898waitq_set_unlink_all(struct waitq_set *wqset)
4899{
4900 assert(waitqs_is_set(wqset));
4901 assert(!waitq_irq_safe(&wqset->wqset_q));
4902
4903 waitq_set_lock(wqset);
4904 return waitq_set_unlink_all_unlock(wqset);
4905 /* wqset unlocked and set links and preposts deallocated */
4906}
4907
4908static int
4909waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
4910 struct waitq_link *link)
4911{
4912 uint32_t *num = (uint32_t *)ctx;
4913 (void)waitq;
4914
4915 /*
4916 * In the worst case, we'll have to allocate 2 prepost objects
4917 * per waitq set (if the set was already preposted by another
4918 * waitq).
4919 */
4920 if (wql_type(link) == WQL_WQS) {
4921 /*
4922 * check to see if the associated waitq actually supports
4923 * preposting
4924 */
4925 if (waitq_set_can_prepost(link->wql_wqs.wql_set)) {
4926 *num += 2;
4927 }
4928 }
4929 return WQ_ITERATE_CONTINUE;
4930}
4931
/**
 * allocate a chain of 'nalloc' reserved prepost objects, linking them onto
 * any chain already passed in via '*wqp'
 *
 * Conditions:
 *	'waitq', if non-NULL, is locked on entry
 *
 * Returns:
 *	'nalloc'; '*wqp' points to the head of the (possibly extended) chain.
 *	'*did_unlock' is set to 1 iff the waitq lock had to be dropped (and
 *	re-acquired) to perform the allocation.
 *	Panics if the allocation fails.
 */
static int
waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
    int *did_unlock, struct wq_prepost **wqp)
{
	struct wq_prepost *tmp;
	struct wqp_cache *cache;

	*did_unlock = 0;

	/*
	 * Before we unlock the waitq, check the per-processor prepost object
	 * cache to see if there's enough there for us. If so, do the
	 * allocation, keep the lock and save an entire iteration over the set
	 * linkage!
	 */
	if (waitq) {
		disable_preemption();
		cache = &PROCESSOR_DATA(current_processor(), wqp_cache);
		if (nalloc <= (int)cache->avail) {
			/* NB: preemption stays disabled across do_alloc */
			goto do_alloc;
		}
		enable_preemption();

		/* unlock the waitq to perform the allocation */
		*did_unlock = 1;
		waitq_unlock(waitq);
	}

do_alloc:
	tmp = wq_prepost_alloc(LT_RESERVED, nalloc);
	if (!tmp) {
		panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
		    nalloc, waitq, *wqp);
	}
	if (*wqp) {
		/* link the two lists */
		int __assert_only rc;
		rc = wq_prepost_rlink(tmp, *wqp);
		assert(rc == nalloc);
	}
	*wqp = tmp;

	/*
	 * If the caller can block, then enforce a minimum-free table element
	 * policy here. This helps ensure that we will have enough prepost
	 * objects for callers such as selwakeup() that can be called with
	 * spin locks held.
	 */
	if (get_preemption_level() == 0) {
		wq_prepost_ensure_free_space();
	}

	if (waitq) {
		if (*did_unlock == 0) {
			/* decrement the preemption count if alloc from cache */
			enable_preemption();
		} else {
			/* otherwise: re-lock the waitq */
			waitq_lock(waitq);
		}
	}

	return nalloc;
}
4996
4997static int
4998waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
4999{
5000 int npreposts = 0;
5001
5002 /*
5003 * If the waitq is not currently part of a set, and we're not asked to
5004 * keep the waitq locked then we'll want to have 3 in reserve
5005 * just-in-case it becomes part of a set while we unlock and reserve.
5006 * We may need up to 1 object for the waitq, and 2 for the set.
5007 */
5008 if (waitq->waitq_set_id == 0) {
5009 npreposts = 3;
5010 } else {
5011 /* this queue has never been preposted before */
5012 if (waitq->waitq_prepost_id == 0) {
5013 npreposts = 3;
5014 }
5015
5016 /*
5017 * Walk the set of table linkages associated with this waitq
5018 * and count the worst-case number of prepost objects that
5019 * may be needed during a wakeup_all. We can walk this without
5020 * locking each set along the way because the table-based IDs
5021 * disconnect us from the set pointers themselves, and the
5022 * table walking is careful to read the setid values only once.
5023 * Locking each set up the chain also doesn't guarantee that
5024 * their membership won't change between the time we unlock
5025 * that set and when we actually go to prepost, so our
5026 * situation is no worse than before and we've alleviated lock
5027 * contention on any sets to which this waitq belongs.
5028 */
5029 (void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,
5030 waitq, waitq->waitq_set_id,
5031 WQL_WQS, (void *)&npreposts,
5032 waitq_prepost_reserve_cb);
5033 }
5034
5035 if (extra > 0) {
5036 npreposts += extra;
5037 }
5038
5039 if (npreposts == 0 && !keep_locked) {
5040 /*
5041 * If we get here, we were asked to reserve some prepost
5042 * objects for a waitq that's previously preposted, and is not
5043 * currently a member of any sets. We have also been
5044 * instructed to unlock the waitq when we're done. In this
5045 * case, we pre-allocated enough reserved objects to handle
5046 * the case where the waitq gets added to a single set when
5047 * the lock is released.
5048 */
5049 npreposts = 3;
5050 }
5051
5052 return npreposts;
5053}
5054
5055
/**
 * pre-allocate prepost objects for 'waitq'
 *
 * Conditions:
 *	'waitq' is not locked
 *
 * Returns:
 *	panic on error
 *
 *	the prepost ID of the head of a singly-linked list of pre-allocated
 *	prepost objects, or 0 if no reservation was necessary.
 *
 * Notes:
 *	If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
 *	atomically and returns 'waitq' locked.
 *
 *	This function attempts to pre-allocate precisely enough prepost
 *	objects based on the current set membership of 'waitq'. If the
 *	operation is performed atomically, then the caller
 *	is guaranteed to have enough pre-allocated prepost object to avoid
 *	any (rare) blocking in the wakeup path.
 */
uint64_t
waitq_prepost_reserve(struct waitq *waitq, int extra,
    waitq_lock_state_t lock_state)
{
	uint64_t reserved = 0;
	uint64_t prev_setid = 0, prev_prepostid = 0;
	struct wq_prepost *wqp = NULL;
	int nalloc = 0, npreposts = 0;
	int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
	int unlocked = 0;

	wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);

	if (waitq == NULL && extra > 0) {
		/*
		 * Simple prepost object allocation:
		 * we'll add 2 more because the waitq might need an object,
		 * and the set itself may need a new POST object in addition
		 * to the number of preposts requested by the caller
		 */
		nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,
		    &unlocked, &wqp);
		assert(nalloc == extra + 2);
		return wqp->wqp_prepostid.id;
	}

	assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);

	assert(!waitq_irq_safe(waitq));

	waitq_lock(waitq);

	/* remember the set ID that we started with */
	prev_setid = waitq->waitq_set_id;
	prev_prepostid = waitq->waitq_prepost_id;

	/*
	 * If the waitq is not part of a set, and we're asked to
	 * keep the set locked, then we don't have to reserve
	 * anything!
	 */
	if (prev_setid == 0 && keep_locked) {
		goto out;
	}

	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

	/* nothing for us to do! */
	if (npreposts == 0) {
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

try_alloc:
	/* this _may_ unlock and relock the waitq! */
	nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,
	    &unlocked, &wqp);

	if (!unlocked) {
		/* allocation held the waitq lock: we're done! */
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

	/*
	 * Before we return, if the allocation had to unlock the waitq, we
	 * must check one more time to see if we have enough. If not, we'll
	 * try to allocate the difference. If the caller requests it, we'll
	 * also leave the waitq locked so that the use of the pre-allocated
	 * prepost objects can be guaranteed to be enough if a wakeup_all is
	 * performed before unlocking the waitq.
	 */

	/*
	 * If the waitq is no longer associated with a set, or if the waitq's
	 * set/prepostid has not changed since we first walked its linkage,
	 * we're done.
	 */
	if ((waitq->waitq_set_id == 0) ||
	    (waitq->waitq_set_id == prev_setid &&
	    waitq->waitq_prepost_id == prev_prepostid)) {
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

	if (npreposts > nalloc) {
		/* membership grew while we were unlocked: top up the chain */
		prev_setid = waitq->waitq_set_id;
		prev_prepostid = waitq->waitq_prepost_id;
		npreposts = npreposts - nalloc; /* only allocate the diff */
		goto try_alloc;
	}

	if (keep_locked) {
		goto out;
	}

out_unlock:
	waitq_unlock(waitq);
out:
	if (wqp) {
		reserved = wqp->wqp_prepostid.id;
	}

	return reserved;
}
5192
5193/**
5194 * release a linked list of prepost objects allocated via _prepost_reserve
5195 *
5196 * Conditions:
5197 * may (rarely) spin waiting for prepost table growth memcpy
5198 */
5199void
5200waitq_prepost_release_reserve(uint64_t id)
5201{
5202 struct wq_prepost *wqp;
5203
5204 wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);
5205
5206 wqp = wq_prepost_rfirst(id);
5207 if (!wqp) {
5208 return;
5209 }
5210
5211 wq_prepost_release_rlist(wqp);
5212}
5213
5214
/**
 * clear all preposts from 'wqset'
 *
 * Conditions:
 *	'wqset' is not locked
 */
void
waitq_set_clear_preposts(struct waitq_set *wqset)
{
	uint64_t prepost_id;
	spl_t spl;

	assert(waitqs_is_set(wqset));

	/* nothing to do for non-prepost sets, or sets with no preposts */
	if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id) {
		return;
	}

	wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
	    wqset->wqset_id);

	/* 'spl' is only initialized (and restored) when the set is IRQ-safe */
	if (waitq_irq_safe(&wqset->wqset_q)) {
		spl = splsched();
	}
	waitq_set_lock(wqset);
	/* detach the prepost chain under the lock; tear it down after unlock */
	prepost_id = wqset->wqset_prepost_id;
	wqset->wqset_prepost_id = 0;
	waitq_set_unlock(wqset);
	if (waitq_irq_safe(&wqset->wqset_q)) {
		splx(spl);
	}

	/* drop / unlink all the prepost table objects */
	if (prepost_id) {
		(void)wq_prepost_iterate(prepost_id, NULL,
		    wqset_clear_prepost_chain_cb);
	}
}
5253
5254
5255/* ----------------------------------------------------------------------
5256 *
5257 * Iteration: waitq -> sets / waitq_set -> preposts
5258 *
5259 * ---------------------------------------------------------------------- */
5260
/* context threaded through the link/prepost walkers to an external iterator */
struct wq_it_ctx {
	void *input;            /* the waitq or waitq_set that started the iteration */
	void *ctx;              /* caller's opaque context, passed through to 'it' */
	waitq_iterator_t it;    /* external iterator callback */
};
5266
5267static int
5268waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
5269 struct waitq_link *link)
5270{
5271 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5272 struct waitq_set *wqset;
5273 int ret;
5274
5275 (void)waitq;
5276 assert(!waitq_irq_safe(waitq));
5277 assert(wql_type(link) == WQL_WQS);
5278
5279 /*
5280 * the waitq is locked, so we can just take the set lock
5281 * and call the iterator function
5282 */
5283 wqset = link->wql_wqs.wql_set;
5284 assert(wqset != NULL);
5285 assert(!waitq_irq_safe(&wqset->wqset_q));
5286 waitq_set_lock(wqset);
5287
5288 ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
5289
5290 waitq_set_unlock(wqset);
5291 return ret;
5292}
5293
5294/**
5295 * call external iterator function for each prepost object in wqset
5296 *
5297 * Conditions:
5298 * Called from wq_prepost_foreach_locked
5299 * (wqset locked, waitq _not_ locked)
5300 */
5301static int
5302wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
5303 struct wq_prepost *wqp, struct waitq *waitq)
5304{
5305 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5306 uint64_t wqp_id;
5307 int ret;
5308
5309 (void)wqp;
5310
5311 /*
5312 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
5313 * Taking the 'waitq' lock is a lock order violation, so we need to be
5314 * careful. We also must realize that we may have taken a reference to
5315 * the 'wqp' just as the associated waitq was being torn down (or
5316 * clearing all its preposts) - see waitq_clear_prepost_locked(). If
5317 * the 'wqp' is valid and we can get the waitq lock, then we are good
5318 * to go. If not, we need to back off, check that the 'wqp' hasn't
5319 * been invalidated, and try to re-take the locks.
5320 */
5321 assert(!waitq_irq_safe(waitq));
5322
5323 if (waitq_lock_try(waitq)) {
5324 goto call_iterator;
5325 }
5326
5327 if (!wqp_is_valid(wqp)) {
5328 return WQ_ITERATE_RESTART;
5329 }
5330
5331 /* We are passed a prepost object with a reference on it. If neither
5332 * the waitq set nor the waitq require interrupts disabled, then we
5333 * may block on the delay(1) call below. We can't hold a prepost
5334 * object reference while blocking, so we have to give that up as well
5335 * and re-acquire it when we come back.
5336 */
5337 wqp_id = wqp->wqp_prepostid.id;
5338 wq_prepost_put(wqp);
5339 waitq_set_unlock(wqset);
5340 wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
5341 wqset, wqp, wqp->wqp_prepostid.id, waitq);
5342 delay(1);
5343 waitq_set_lock(wqset);
5344 wqp = wq_prepost_get(wqp_id);
5345 if (!wqp) {
5346 /* someone cleared preposts while we slept! */
5347 return WQ_ITERATE_DROPPED;
5348 }
5349
5350 /*
5351 * TODO:
5352 * This differs slightly from the logic in ipc_mqueue.c:
5353 * ipc_mqueue_receive_on_thread(). There, if the waitq lock
5354 * can't be obtained, the prepost link is placed on the back of
5355 * the chain, and the iteration starts from the beginning. Here,
5356 * we just restart from the beginning.
5357 */
5358 return WQ_ITERATE_RESTART;
5359
5360call_iterator:
5361 if (!wqp_is_valid(wqp)) {
5362 ret = WQ_ITERATE_RESTART;
5363 goto out_unlock;
5364 }
5365
5366 /* call the external callback */
5367 ret = wctx->it(wctx->ctx, waitq, wqset);
5368
5369 if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
5370 ret = WQ_ITERATE_BREAK;
5371 goto out;
5372 }
5373
5374out_unlock:
5375 waitq_unlock(waitq);
5376out:
5377 return ret;
5378}
5379
/**
 * iterator over all sets to which the given waitq has been linked
 *
 * Conditions:
 *	'waitq' is locked
 *
 * Returns:
 *	WQ_ITERATE_SUCCESS, or the WQ_ITERATE_* code from the callback.
 *	NOTE(review): returns KERN_INVALID_ARGUMENT (a kern_return_t) for
 *	NULL parameters, mixing error domains with the WQ_ITERATE_* codes —
 *	compare waitq_set_iterate_preposts, which returns WQ_ITERATE_INVALID.
 *	Confirm no caller tests for a specific failure value before changing.
 */
int
waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
{
	int ret;
	struct wq_it_ctx wctx = {
		.input = (void *)waitq,
		.ctx = ctx,
		.it = it,
	};
	if (!it || !waitq) {
		return KERN_INVALID_ARGUMENT;
	}

	/* one-level walk: only sets this waitq is directly linked into */
	ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
	    WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);
	if (ret == WQ_ITERATE_CONTINUE) {
		ret = WQ_ITERATE_SUCCESS;
	}
	return ret;
}
5406
5407/**
5408 * iterator over all preposts in the given wqset
5409 *
5410 * Conditions:
5411 * 'wqset' is locked
5412 */
5413int
5414waitq_set_iterate_preposts(struct waitq_set *wqset,
5415 void *ctx, waitq_iterator_t it)
5416{
5417 struct wq_it_ctx wctx = {
5418 .input = (void *)wqset,
5419 .ctx = ctx,
5420 .it = it,
5421 };
5422 if (!it || !wqset) {
5423 return WQ_ITERATE_INVALID;
5424 }
5425
5426 assert(waitq_held(&wqset->wqset_q));
5427
5428 return wq_prepost_foreach_locked(wqset, (void *)&wctx,
5429 wqset_iterate_prepost_cb);
5430}
5431
5432
5433/* ----------------------------------------------------------------------
5434 *
5435 * Higher-level APIs
5436 *
5437 * ---------------------------------------------------------------------- */
5438
5439
5440/**
5441 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5442 *
5443 * Conditions:
5444 * 'waitq' is not locked
5445 */
5446wait_result_t
5447waitq_assert_wait64(struct waitq *waitq,
5448 event64_t wait_event,
5449 wait_interrupt_t interruptible,
5450 uint64_t deadline)
5451{
5452 thread_t thread = current_thread();
5453 wait_result_t ret;
5454 spl_t s;
5455
5456 if (!waitq_valid(waitq)) {
5457 panic("Invalid waitq: %p", waitq);
5458 }
5459
5460 if (waitq_irq_safe(waitq)) {
5461 s = splsched();
5462 }
5463
5464 waitq_lock(waitq);
5465 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5466 TIMEOUT_URGENCY_SYS_NORMAL,
5467 deadline, TIMEOUT_NO_LEEWAY, thread);
5468 waitq_unlock(waitq);
5469
5470 if (waitq_irq_safe(waitq)) {
5471 splx(s);
5472 }
5473
5474 return ret;
5475}
5476
5477/**
5478 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5479 *
5480 * Conditions:
5481 * 'waitq' is not locked
5482 * will disable and re-enable interrupts while locking current_thread()
5483 */
5484wait_result_t
5485waitq_assert_wait64_leeway(struct waitq *waitq,
5486 event64_t wait_event,
5487 wait_interrupt_t interruptible,
5488 wait_timeout_urgency_t urgency,
5489 uint64_t deadline,
5490 uint64_t leeway)
5491{
5492 wait_result_t ret;
5493 thread_t thread = current_thread();
5494 spl_t s;
5495
5496 if (!waitq_valid(waitq)) {
5497 panic("Invalid waitq: %p", waitq);
5498 }
5499
5500 if (waitq_irq_safe(waitq)) {
5501 s = splsched();
5502 }
5503
5504 waitq_lock(waitq);
5505 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5506 urgency, deadline, leeway, thread);
5507 waitq_unlock(waitq);
5508
5509 if (waitq_irq_safe(waitq)) {
5510 splx(s);
5511 }
5512
5513 return ret;
5514}
5515
5516/**
5517 * wakeup a single thread from a waitq that's waiting for a given event
5518 *
5519 * Conditions:
5520 * 'waitq' is not locked
5521 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5522 * may disable and re-enable interrupts
5523 *
5524 * Notes:
5525 * will _not_ block if waitq is global (or not a member of any set)
5526 */
5527kern_return_t
5528waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
5529 wait_result_t result, int priority)
5530{
5531 kern_return_t kr;
5532 uint64_t reserved_preposts = 0;
5533 spl_t spl;
5534
5535 if (!waitq_valid(waitq)) {
5536 panic("Invalid waitq: %p", waitq);
5537 }
5538
5539 if (!waitq_irq_safe(waitq)) {
5540 /* reserve preposts in addition to locking the waitq */
5541 reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5542 } else {
5543 spl = splsched();
5544 waitq_lock(waitq);
5545 }
5546
5547 /* waitq is locked upon return */
5548 kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
5549 &reserved_preposts, priority, WAITQ_UNLOCK);
5550
5551 if (waitq_irq_safe(waitq)) {
5552 splx(spl);
5553 }
5554
5555 /* release any left-over prepost object (won't block/lock anything) */
5556 waitq_prepost_release_reserve(reserved_preposts);
5557
5558 return kr;
5559}
5560
5561/**
5562 * wakeup all threads from a waitq that are waiting for a given event
5563 *
5564 * Conditions:
5565 * 'waitq' is not locked
5566 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5567 * may disable and re-enable interrupts
5568 *
5569 * Notes:
5570 * will _not_ block if waitq is global (or not a member of any set)
5571 */
5572kern_return_t
5573waitq_wakeup64_all(struct waitq *waitq,
5574 event64_t wake_event,
5575 wait_result_t result,
5576 int priority)
5577{
5578 kern_return_t ret;
5579 uint64_t reserved_preposts = 0;
5580 spl_t s;
5581
5582 if (!waitq_valid(waitq)) {
5583 panic("Invalid waitq: %p", waitq);
5584 }
5585
5586 if (!waitq_irq_safe(waitq)) {
5587 /* reserve preposts in addition to locking waitq */
5588 reserved_preposts = waitq_prepost_reserve(waitq, 0,
5589 WAITQ_KEEP_LOCKED);
5590 } else {
5591 s = splsched();
5592 waitq_lock(waitq);
5593 }
5594
5595 ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
5596 &reserved_preposts, priority,
5597 WAITQ_UNLOCK);
5598
5599 if (waitq_irq_safe(waitq)) {
5600 splx(s);
5601 }
5602
5603 waitq_prepost_release_reserve(reserved_preposts);
5604
5605 return ret;
5606}
5607
/**
 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
 *
 * Conditions:
 *	'waitq' is not locked
 *
 * Notes:
 *	May temporarily disable and re-enable interrupts
 *
 * Returns:
 *	KERN_SUCCESS if 'thread' was waiting and has been set running
 *	KERN_NOT_WAITING otherwise
 */
kern_return_t
waitq_wakeup64_thread(struct waitq *waitq,
    event64_t wake_event,
    thread_t thread,
    wait_result_t result)
{
	kern_return_t ret;
	spl_t s, th_spl;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	/* 's' is only initialized (and restored) when the waitq is IRQ-safe */
	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
	/* on success, returns 'thread' locked */

	waitq_unlock(waitq);

	if (ret == KERN_SUCCESS) {
		/* set the thread running, then drop the thread lock/spl */
		ret = thread_go(thread, result);
		assert(ret == KERN_SUCCESS);
		thread_unlock(thread);
		splx(th_spl);
		waitq_stats_count_wakeup(waitq);
	} else {
		ret = KERN_NOT_WAITING;
		waitq_stats_count_fail(waitq);
	}

	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}
5657
/**
 * wakeup a single thread from a waitq that's waiting for a given event
 * and return a reference to that thread
 * returns THREAD_NULL if no thread was waiting
 *
 * Conditions:
 *	'waitq' is not locked
 *	may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
 *	may disable and re-enable interrupts
 *
 * Notes:
 *	will _not_ block if waitq is global (or not a member of any set)
 */
thread_t
waitq_wakeup64_identify(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    int priority)
{
	uint64_t reserved_preposts = 0;
	spl_t thread_spl = 0;
	thread_t thread;
	spl_t spl;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	if (!waitq_irq_safe(waitq)) {
		/* reserve preposts in addition to locking waitq */
		reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
	} else {
		spl = splsched();
		waitq_lock(waitq);
	}

	thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
	    &thread_spl, &reserved_preposts,
	    priority, WAITQ_UNLOCK);
	/* waitq is unlocked, thread is locked */

	if (thread != THREAD_NULL) {
		/* take the +1 reference before dropping the thread lock/spl */
		thread_reference(thread);
		thread_unlock(thread);
		splx(thread_spl);
	}

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	/* release any left-over prepost object (won't block/lock anything) */
	waitq_prepost_release_reserve(reserved_preposts);

	/* returns +1 ref to running thread or THREAD_NULL */
	return thread;
}