]> git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/kern/waitq.c
xnu-7195.101.1.tar.gz
[apple/xnu.git] / osfmk / kern / waitq.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_FREE_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57/*
58 * un-comment the following lines to debug the link/prepost tables
59 * NOTE: this expands each element by ~40 bytes
60 */
61//#define KEEP_WAITQ_LINK_STATS
62//#define KEEP_WAITQ_PREPOST_STATS
63
64#include <kern/ast.h>
65#include <kern/backtrace.h>
66#include <kern/kern_types.h>
67#include <kern/ltable.h>
68#include <kern/mach_param.h>
69#include <kern/percpu.h>
70#include <kern/queue.h>
71#include <kern/sched_prim.h>
72#include <kern/simple_lock.h>
73#include <kern/spl.h>
74#include <kern/waitq.h>
75#include <kern/zalloc.h>
76#include <kern/policy_internal.h>
77#include <kern/turnstile.h>
78
79#include <os/hash.h>
80#include <libkern/OSAtomic.h>
81#include <mach/sync_policy.h>
82#include <vm/vm_kern.h>
83
84#include <sys/kdebug.h>
85
86#if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)
87# if !CONFIG_LTABLE_STATS
88# error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"
89# endif
90# if !CONFIG_WAITQ_STATS
91# error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"
92# endif
93#endif
94
95#if CONFIG_WAITQ_DEBUG
96#define wqdbg(fmt, ...) \
97 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
98#else
99#define wqdbg(fmt, ...) do { } while (0)
100#endif
101
102#ifdef WAITQ_VERBOSE_DEBUG
103#define wqdbg_v(fmt, ...) \
104 printf("WQ[v:%s]: " fmt "\n", __func__, ## __VA_ARGS__)
105#else
106#define wqdbg_v(fmt, ...) do { } while (0)
107#endif
108
109#define wqinfo(fmt, ...) \
110 printf("WQ[%s]: " fmt "\n", __func__, ## __VA_ARGS__)
111
112#define wqerr(fmt, ...) \
113 printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
114
115/*
116 * file-static functions / data
117 */
118static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
119 uint64_t *reserved_preposts,
120 int priority, spl_t *spl);
121
122static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
123 event64_t event,
124 thread_t thread, spl_t *spl);
125
126ZONE_DECLARE(waitq_set_zone, "waitq sets",
127 sizeof(struct waitq_set), ZC_NOENCRYPT);
128
129/* waitq prepost cache */
130#define WQP_CACHE_MAX 50
131struct wqp_cache {
132 uint64_t head;
133 unsigned int avail;
134};
135static struct wqp_cache PERCPU_DATA(wqp_cache);
136
137#define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
138#define ROUNDDOWN(x, y) (((x)/(y))*(y))
139
140
141#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
142static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
143#endif
144
145LCK_GRP_DECLARE(waitq_lck_grp, "waitq");
146
147#if __arm64__
148
149#define waitq_lock_to(wq, to) \
150 (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to, &waitq_lck_grp))
151
152#define waitq_lock_unlock(wq) \
153 (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
154
155#define waitq_lock_init(wq) \
156 (wq->waitq_interlock = 0)
157
158#else
159
160#define waitq_lock_to(wq, to) \
161 (hw_lock_to(&(wq)->waitq_interlock, to, &waitq_lck_grp))
162
163#define waitq_lock_unlock(wq) \
164 (hw_lock_unlock(&(wq)->waitq_interlock))
165
166#define waitq_lock_init(wq) \
167 (hw_lock_init(&(wq)->waitq_interlock))
168
169#endif /* __arm64__ */
170
171/*
172 * Prepost callback function for specially marked waitq sets
173 * (prepost alternative)
174 */
175extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
176
177#define DEFAULT_MIN_FREE_TABLE_ELEM 100
178static uint32_t g_min_free_table_elem;
179static uint32_t g_min_free_cache;
180
181
182/* ----------------------------------------------------------------------
183 *
184 * SetID Link Table Implementation
185 *
186 * ---------------------------------------------------------------------- */
187static struct link_table g_wqlinktable;
188
189enum wq_link_type {
190 WQL_ALL = -1,
191 WQL_FREE = LT_FREE,
192 WQL_WQS = LT_ELEM,
193 WQL_LINK = LT_LINK,
194};
195
struct waitq_link {
	struct lt_elem wqte;    /* common link-table element header (holds lt_id / lt_bits) */

	union {
		/* wqt_type == WQL_WQS (LT_ELEM): leaf pointing at a waitq set */
		struct {
			struct waitq_set *wql_set;
			/* uint64_t sl_prepost_id; */
		} wql_wqs;

		/* wqt_type == WQL_LINK (LT_LINK): interior node of the set-id tree */
		struct {
			uint64_t left_setid;
			uint64_t right_setid;
		} wql_link;
	};
#ifdef KEEP_WAITQ_LINK_STATS
	/* debug-only alloc/invalidate/mkvalid tracking (adds ~40 bytes/element) */
	thread_t sl_alloc_th;
	task_t sl_alloc_task;
	uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
	uint64_t sl_alloc_ts;
	uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
	uint64_t sl_invalidate_ts;
	uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
	uint64_t sl_mkvalid_ts;
	uint64_t sl_free_ts;
#endif
};
224#if !defined(KEEP_WAITQ_LINK_STATS)
225static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,
226 "waitq_link struct must be a power of two!");
227#endif
228
229#define wql_refcnt(link) \
230 (lt_bits_refcnt((link)->wqte.lt_bits))
231
232#define wql_type(link) \
233 (lt_bits_type((link)->wqte.lt_bits))
234
235#define wql_mkvalid(link) \
236 do { \
237 lt_elem_mkvalid(&(link)->wqte); \
238 wql_do_mkvalid_stats(&(link)->wqte); \
239 } while (0)
240
241#define wql_is_valid(link) \
242 lt_bits_valid((link)->wqte.lt_bits)
243
244#define wql_setid wqte.lt_id
245
246#define WQL_WQS_POISON ((void *)(0xf00df00d))
247#define WQL_LINK_POISON (0x0bad0badffffffffull)
248
249static void
250wql_poison(struct link_table *table, struct lt_elem *elem)
251{
252 struct waitq_link *link = (struct waitq_link *)elem;
253 (void)table;
254
255 switch (wql_type(link)) {
256 case WQL_WQS:
257 link->wql_wqs.wql_set = WQL_WQS_POISON;
258 break;
259 case WQL_LINK:
260 link->wql_link.left_setid = WQL_LINK_POISON;
261 link->wql_link.right_setid = WQL_LINK_POISON;
262 break;
263 default:
264 break;
265 }
266#ifdef KEEP_WAITQ_LINK_STATS
267 memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
268 link->sl_alloc_ts = 0;
269 memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
270 link->sl_mkvalid_ts = 0;
271
272 link->sl_alloc_th = THREAD_NULL;
273 /* leave the sl_alloc_task in place for debugging */
274
275 link->sl_free_ts = mach_absolute_time();
276#endif
277}
278
#ifdef KEEP_WAITQ_LINK_STATS
/* record allocation backtrace/thread/task for a freshly allocated link */
static __inline__ void
wql_do_alloc_stats(struct lt_elem *elem)
{
	if (elem) {
		struct waitq_link *link = (struct waitq_link *)elem;
		memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
		waitq_grab_backtrace(link->sl_alloc_bt, 0);
		link->sl_alloc_th = current_thread();
		link->sl_alloc_task = current_task();

		/* element must have been reset by wql_poison on its last free */
		assert(link->sl_alloc_ts == 0);
		link->sl_alloc_ts = mach_absolute_time();

		memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
		link->sl_invalidate_ts = 0;
	}
}

/* record the time and backtrace at which a link was invalidated */
static __inline__ void
wql_do_invalidate_stats(struct lt_elem *elem)
{
	struct waitq_link *link = (struct waitq_link *)elem;

	if (!elem) {
		return;
	}

	/* a link must have been made valid before it can be invalidated */
	assert(link->sl_mkvalid_ts > 0);

	memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
	link->sl_invalidate_ts = mach_absolute_time();
	waitq_grab_backtrace(link->sl_invalidate_bt, 0);
}

/* record the time and backtrace at which a link was made valid */
static __inline__ void
wql_do_mkvalid_stats(struct lt_elem *elem)
{
	struct waitq_link *link = (struct waitq_link *)elem;

	if (!elem) {
		return;
	}

	memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
	link->sl_mkvalid_ts = mach_absolute_time();
	waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
}
#else
#define wql_do_alloc_stats(e)
#define wql_do_invalidate_stats(e)
#define wql_do_mkvalid_stats(e)
#endif /* KEEP_WAITQ_LINK_STATS */
332
333static void
334wql_init(void)
335{
336 uint32_t tablesz = 0, max_links = 0;
337
338 if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
339 tablesz = (uint32_t)g_lt_max_tbl_size;
340 }
341
342 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
343 max_links = tablesz / sizeof(struct waitq_link);
344 assert(max_links > 0 && tablesz > 0);
345
346 /* we have a restricted index range */
347 if (max_links > (LT_IDX_MAX + 1)) {
348 max_links = LT_IDX_MAX + 1;
349 }
350
351 wqinfo("init linktable with max:%d elements (%d bytes)",
352 max_links, tablesz);
353 ltable_init(&g_wqlinktable, "wqslab.wql", max_links,
354 sizeof(struct waitq_link), wql_poison);
355}
356
357static void
358wql_ensure_free_space(void)
359{
360 if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {
361 /*
362 * we don't hold locks on these values, so check for underflow
363 */
364 if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {
365 wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
366 g_wqlinktable.nelem, g_wqlinktable.used_elem,
367 g_min_free_table_elem);
368 ltable_grow(&g_wqlinktable, g_min_free_table_elem);
369 }
370 }
371}
372
373static struct waitq_link *
374wql_alloc_link(int type)
375{
376 struct lt_elem *elem;
377
378 elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);
379 wql_do_alloc_stats(elem);
380 return (struct waitq_link *)elem;
381}
382
/* re-type an already-held link element in place (caller keeps its reference) */
static void
wql_realloc_link(struct waitq_link *link, int type)
{
	ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);
#ifdef KEEP_WAITQ_LINK_STATS
	/* reset the debug tracking as if this were a fresh allocation */
	memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
	link->sl_alloc_ts = 0;
	wql_do_alloc_stats(&link->wqte);

	memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
	link->sl_invalidate_ts = 0;
#endif
}
396
/* mark a link invalid; storage is reclaimed when the last reference drops */
static void
wql_invalidate(struct waitq_link *link)
{
	lt_elem_invalidate(&link->wqte);
	wql_do_invalidate_stats(&link->wqte);
}
403
404static struct waitq_link *
405wql_get_link(uint64_t setid)
406{
407 struct lt_elem *elem;
408
409 elem = ltable_get_elem(&g_wqlinktable, setid);
410 return (struct waitq_link *)elem;
411}
412
413static void
414wql_put_link(struct waitq_link *link)
415{
416 if (!link) {
417 return;
418 }
419 ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);
420}
421
422static struct waitq_link *
423wql_get_reserved(uint64_t setid, int type)
424{
425 struct lt_elem *elem;
426
427 elem = lt_elem_list_first(&g_wqlinktable, setid);
428 if (!elem) {
429 return NULL;
430 }
431 ltable_realloc_elem(&g_wqlinktable, elem, type);
432 return (struct waitq_link *)elem;
433}
434
435
436static inline int waitq_maybe_remove_link(struct waitq *waitq,
437 uint64_t setid,
438 struct waitq_link *parent,
439 struct waitq_link *left,
440 struct waitq_link *right);
441
442enum {
443 LINK_WALK_ONE_LEVEL = 0,
444 LINK_WALK_FULL_DAG = 1,
445 LINK_WALK_FULL_DAG_UNLOCKED = 2,
446};
447
448typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,
449 struct waitq_link *link);
450
451/**
452 * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'
453 *
454 * Conditions:
455 * waitq is locked (or NULL)
456 * 'setid' is managed by 'waitq'
457 * this could be direct (waitq->waitq_set_id == setid)
458 * OR indirect (setid is the left/right ID in a LINK chain,
459 * whose root is waitq->waitq_set_id)
460 *
461 * Notes:
462 * This function uses recursion to walk the set of table elements
463 * pointed to by 'setid'. For each element encountered, 'cb' will be
464 * called. If non-zero, the return value of this callback function can
465 * early-out of the table walk.
466 *
467 * For each link element encountered, the function takes a reference to
468 * it. The reference is dropped only after the callback and any recursion
469 * has completed.
470 *
471 * The assumed table/link/tree structure:
472 * 'setid'
473 * / \
474 * / \
475 * L(LINK) R(LINK)
476 * /\ /\
477 * / \ / \
478 * / \ Rl(*) Rr(*)
479 * Ll(*) Lr(*) /\ /\
480 * /\ /\ ... ... ... ...
481 * ... ... ... ...
482 * \
483 * WQS(wqset_q.waitq_setid == Sx)
 484 * (waitq set is a member of set 'Sx')
485 *
486 * 'Sx'
487 * / \
488 * / \
489 * L(LINK) R(LINK)
490 * /\ /\
491 * ... ... ... ...
492 *
493 * The basic algorithm is as follows:
494 * *) take a reference to the table object pointed to by 'setid'
495 * *) if appropriate, call 'cb' (potentially early-out on non-zero return)
496 * *) if the link object points to a waitq set, and the walk type
497 * is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
498 * the associated waitq set object and recursively walk all sets to
499 * which that set belongs. This is a DFS of the tree structure.
500 * *) recurse down the left side of the tree (following the
501 * 'left_setid' pointer in the link object
502 * *) recurse down the right side of the tree (following the
503 * 'right_setid' pointer in the link object
504 */
static __attribute__((noinline))
int
walk_waitq_links(int walk_type, struct waitq *waitq,
    uint64_t setid, int link_type,
    void *ctx, wql_callback_func cb)
{
	struct waitq_link *link;
	uint64_t nextid;
	int wqltype;

	/* take a reference to the element named by 'setid' */
	link = wql_get_link(setid);

	/* invalid link: treat as an empty subtree and keep iterating */
	if (!link) {
		return WQ_ITERATE_CONTINUE;
	}

	/*
	 * Capture the left/right child IDs now: after the callback runs,
	 * the link object may be invalidated and its fields poisoned.
	 */
	setid = nextid = 0;
	wqltype = wql_type(link);
	if (wqltype == WQL_LINK) {
		setid = link->wql_link.left_setid;
		nextid = link->wql_link.right_setid;
	}

	/*
	 * Make the callback only on specified link_type (or all links)
	 * Note that after the callback, the link object may be
	 * invalid. The only valid thing we can do is put our
	 * reference to it (which may put it back on the free list)
	 */
	if (link_type == WQL_ALL || link_type == wqltype) {
		/* allow the callback to early-out */
		int ret = cb(waitq, ctx, link);
		if (ret != WQ_ITERATE_CONTINUE) {
			wql_put_link(link);
			return ret;
		}
	}

	if (wqltype == WQL_WQS &&
	    (walk_type == LINK_WALK_FULL_DAG ||
	    walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
		/*
		 * Recurse down any sets to which this wait queue set was
		 * added. We do this just before we put our reference to
		 * the link object (which may free it).
		 */
		struct waitq_set *wqset = link->wql_wqs.wql_set;
		int ret = WQ_ITERATE_CONTINUE;
		int should_unlock = 0;
		uint64_t wqset_setid = 0;

		/* only the locked (FULL_DAG) variant takes the set lock */
		if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
			assert(!waitq_irq_safe(&wqset->wqset_q));
			waitq_set_lock(wqset);
			should_unlock = 1;
		}

		/*
		 * verify the linked waitq set as it could have been
		 * invalidated before we grabbed the lock!
		 */
		if (wqset->wqset_id != link->wql_setid.id) {
			/* This is the bottom of the tree: just get out */
			if (should_unlock) {
				waitq_set_unlock(wqset);
			}
			wql_put_link(link);
			return WQ_ITERATE_CONTINUE;
		}

		wqset_setid = wqset->wqset_q.waitq_set_id;

		/* DFS into the sets that this set is itself a member of */
		if (wqset_setid > 0) {
			ret = walk_waitq_links(walk_type, &wqset->wqset_q,
			    wqset_setid, link_type, ctx, cb);
		}
		if (should_unlock) {
			waitq_set_unlock(wqset);
		}
		if (ret != WQ_ITERATE_CONTINUE) {
			wql_put_link(link);
			return ret;
		}
	}

	wql_put_link(link);

	/* recurse down left side of the tree */
	if (setid) {
		int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);
		if (ret != WQ_ITERATE_CONTINUE) {
			return ret;
		}
	}

	/* recurse down right side of the tree */
	if (nextid) {
		return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);
	}

	return WQ_ITERATE_CONTINUE;
}
608
609/* ----------------------------------------------------------------------
610 *
611 * Prepost Link Table Implementation
612 *
613 * ---------------------------------------------------------------------- */
614static struct link_table g_prepost_table;
615
616enum wq_prepost_type {
617 WQP_FREE = LT_FREE,
618 WQP_WQ = LT_ELEM,
619 WQP_POST = LT_LINK,
620};
621
struct wq_prepost {
	struct lt_elem wqte;    /* common link-table element header (holds lt_id / lt_bits) */

	union {
		/* wqt_type == WQP_WQ (LT_ELEM): anchor pointing back at a waitq */
		struct {
			struct waitq *wqp_wq_ptr;
		} wqp_wq;
		/* wqt_type == WQP_POST (LT_LINK): node in a list of preposts */
		struct {
			uint64_t wqp_next_id;
			uint64_t wqp_wq_id;
		} wqp_post;
	};
#ifdef KEEP_WAITQ_PREPOST_STATS
	/* debug-only allocation tracking */
	thread_t wqp_alloc_th;
	task_t wqp_alloc_task;
	uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
#endif
};
642#if !defined(KEEP_WAITQ_PREPOST_STATS)
643static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
644 "wq_prepost struct must be a power of two!");
645#endif
646
647#define wqp_refcnt(wqp) \
648 (lt_bits_refcnt((wqp)->wqte.lt_bits))
649
650#define wqp_type(wqp) \
651 (lt_bits_type((wqp)->wqte.lt_bits))
652
653#define wqp_set_valid(wqp) \
654 lt_elem_mkvalid(&(wqp)->wqte)
655
656#define wqp_is_valid(wqp) \
657 lt_bits_valid((wqp)->wqte.lt_bits)
658
659#define wqp_prepostid wqte.lt_id
660
661#define WQP_WQ_POISON (0x0bad0badffffffffull)
662#define WQP_POST_POISON (0xf00df00df00df00d)
663
664static void
665wqp_poison(struct link_table *table, struct lt_elem *elem)
666{
667 struct wq_prepost *wqp = (struct wq_prepost *)elem;
668 (void)table;
669
670 switch (wqp_type(wqp)) {
671 case WQP_WQ:
672 break;
673 case WQP_POST:
674 wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
675 wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
676 break;
677 default:
678 break;
679 }
680}
681
#ifdef KEEP_WAITQ_PREPOST_STATS
/*
 * Record allocation stats (backtrace, thread, task) for 'elem' and for
 * every element chained after it on the reserved-element list.
 */
static __inline__ void
wqp_do_alloc_stats(struct lt_elem *elem)
{
	if (!elem) {
		return;
	}

	struct wq_prepost *wqp = (struct wq_prepost *)elem;
	/*
	 * Fix: the scratch buffer needs NWAITQ_BTFRAMES entries.  The old
	 * declaration, uintptr_t alloc_bt[sizeof(wqp->wqp_alloc_bt)], sized
	 * the array in *bytes* interpreted as an element count (8x too
	 * large), and the memcpy below — bounded by sizeof(alloc_bt) —
	 * then overflowed the wqp_alloc_bt destination array.
	 */
	uintptr_t alloc_bt[NWAITQ_BTFRAMES];

	/*
	 * NOTE(review): the second argument of waitq_grab_backtrace() is
	 * 'skip' per its prototype, and other call sites pass 0 — confirm
	 * that NWAITQ_BTFRAMES is really the intended skip count here.
	 */
	waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);

	/* be sure to take stats for _all_ allocated objects */
	for (;;) {
		memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));
		wqp->wqp_alloc_th = current_thread();
		wqp->wqp_alloc_task = current_task();
		wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);
		if (!wqp) {
			break;
		}
	}
}
#else
#define wqp_do_alloc_stats(e)
#endif /* KEEP_WAITQ_PREPOST_STATS */
709
710static void
711wqp_init(void)
712{
713 uint32_t tablesz = 0, max_wqp = 0;
714
715 if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
716 tablesz = (uint32_t)g_lt_max_tbl_size;
717 }
718
719 tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
720 max_wqp = tablesz / sizeof(struct wq_prepost);
721 assert(max_wqp > 0 && tablesz > 0);
722
723 /* we have a restricted index range */
724 if (max_wqp > (LT_IDX_MAX + 1)) {
725 max_wqp = LT_IDX_MAX + 1;
726 }
727
728 wqinfo("init prepost table with max:%d elements (%d bytes)",
729 max_wqp, tablesz);
730 ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,
731 sizeof(struct wq_prepost), wqp_poison);
732}
733
/*
 * Refill the per-CPU cache with up to 'nalloc' reserved prepost elements.
 * Silently does nothing if called with preemption disabled or if the
 * table cannot supply the elements.
 */
static void
wq_prepost_refill_cpu_cache(uint32_t nalloc)
{
	struct lt_elem *new_head, *old_head;
	struct wqp_cache *cache;

	/* require preemption enabled to allocate elements */
	if (get_preemption_level() != 0) {
		return;
	}

	/* grab 'nalloc' elements as a pre-linked reserved list */
	new_head = ltable_alloc_elem(&g_prepost_table,
	    LT_RESERVED, nalloc, 1);
	if (new_head == NULL) {
		return;
	}

	disable_preemption();
	cache = PERCPU_GET(wqp_cache);

	/* check once more before putting these elements on the list */
	if (cache->avail >= WQP_CACHE_MAX) {
		/* someone else refilled this CPU while we allocated: give back */
		lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);
		enable_preemption();
		return;
	}

	cache->avail += nalloc;
	if (cache->head == 0 || cache->head == LT_IDX_MAX) {
		/* cache was empty: the new list becomes the entire cache */
		cache->head = new_head->lt_id.id;
		goto out;
	}

	/* splice the new list in front of the existing cached elements */
	old_head = lt_elem_list_first(&g_prepost_table, cache->head);
	(void)lt_elem_list_link(&g_prepost_table, new_head, old_head);
	cache->head = new_head->lt_id.id;

out:
	enable_preemption();
	return;
}
778
/*
 * Ensure both the local CPU's prepost cache and the global prepost table
 * have a healthy reserve of free elements, refilling/growing as needed.
 */
static void
wq_prepost_ensure_free_space(void)
{
	uint32_t free_elem;
	uint32_t min_free;
	struct wqp_cache *cache;

	/* lazily compute the global cache floor: one full cache per CPU */
	if (g_min_free_cache == 0) {
		g_min_free_cache = (WQP_CACHE_MAX * ml_wait_max_cpus());
	}

	/*
	 * Ensure that we always have a pool of per-CPU prepost elements
	 */
	disable_preemption();
	cache = PERCPU_GET(wqp_cache);
	free_elem = cache->avail;
	enable_preemption();

	/* refill when this CPU's cache drops below a third of capacity */
	if (free_elem < (WQP_CACHE_MAX / 3)) {
		wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
	}

	/*
	 * Now ensure that we have a sufficient amount of free table space
	 */
	free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
	min_free = g_min_free_table_elem + g_min_free_cache;
	if (free_elem < min_free) {
		/*
		 * we don't hold locks on these values, so check for underflow
		 */
		if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
			wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
			    g_prepost_table.nelem, g_prepost_table.used_elem,
			    g_min_free_table_elem, g_min_free_cache);
			ltable_grow(&g_prepost_table, min_free);
		}
	}
}
819
/*
 * Allocate 'nelem' prepost elements of the given type. LT_RESERVED
 * requests are satisfied from the per-CPU cache when possible; all
 * others (and cache misses) fall back to the global table.
 * Returns the head of the allocated list, or NULL.
 */
static struct wq_prepost *
wq_prepost_alloc(int type, int nelem)
{
	struct lt_elem *elem;
	struct wq_prepost *wqp;
	struct wqp_cache *cache;

	/* only LT_RESERVED allocations may come from the per-CPU cache */
	if (type != LT_RESERVED) {
		goto do_alloc;
	}
	if (nelem == 0) {
		return NULL;
	}

	/*
	 * First try to grab the elements from the per-CPU cache if we are
	 * allocating RESERVED elements
	 */
	disable_preemption();
	cache = PERCPU_GET(wqp_cache);
	if (nelem <= (int)cache->avail) {
		struct lt_elem *first, *next = NULL;
		int nalloc = nelem;

		cache->avail -= nelem;

		/* grab the first element */
		first = lt_elem_list_first(&g_prepost_table, cache->head);

		/* find the last element and re-adjust the cache head */
		for (elem = first; elem != NULL && nalloc > 0; elem = next) {
			next = lt_elem_list_next(&g_prepost_table, elem);
			if (--nalloc == 0) {
				/* terminate the allocated list */
				elem->lt_next_idx = LT_IDX_MAX;
				break;
			}
		}
		assert(nalloc == 0);
		if (!next) {
			/* we consumed the entire cache */
			cache->head = LT_IDX_MAX;
		} else {
			cache->head = next->lt_id.id;
		}
		/* assert that we don't have mis-matched book keeping */
		assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));
		enable_preemption();
		elem = first;
		goto out;
	}
	enable_preemption();

do_alloc:
	/* fall-back to standard table allocation */
	elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);
	if (!elem) {
		return NULL;
	}

out:
	wqp = (struct wq_prepost *)elem;
	wqp_do_alloc_stats(elem);
	return wqp;
}
884
/* mark a prepost element invalid; reclaimed when the last reference drops */
static void
wq_prepost_invalidate(struct wq_prepost *wqp)
{
	lt_elem_invalidate(&wqp->wqte);
}
890
891static struct wq_prepost *
892wq_prepost_get(uint64_t wqp_id)
893{
894 struct lt_elem *elem;
895
896 elem = ltable_get_elem(&g_prepost_table, wqp_id);
897 return (struct wq_prepost *)elem;
898}
899
/* drop a reference on 'wqp' (may free it back to the table) */
static void
wq_prepost_put(struct wq_prepost *wqp)
{
	ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);
}
905
/* link 'child' after 'parent' on the reverse (rnext) list; returns count linked */
static int
wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
{
	return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
}
911
912static struct wq_prepost *
913wq_prepost_get_rnext(struct wq_prepost *head)
914{
915 struct lt_elem *elem;
916 struct wq_prepost *wqp;
917 uint64_t id;
918
919 elem = lt_elem_list_next(&g_prepost_table, &head->wqte);
920 if (!elem) {
921 return NULL;
922 }
923 id = elem->lt_id.id;
924 elem = ltable_get_elem(&g_prepost_table, id);
925
926 if (!elem) {
927 return NULL;
928 }
929 wqp = (struct wq_prepost *)elem;
930 if (elem->lt_id.id != id ||
931 wqp_type(wqp) != WQP_POST ||
932 wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
933 ltable_put_elem(&g_prepost_table, elem);
934 return NULL;
935 }
936
937 return wqp;
938}
939
/* sever 'wqp's reverse (rnext) linkage */
static void
wq_prepost_reset_rnext(struct wq_prepost *wqp)
{
	(void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);
}
945
946
947/**
948 * remove 'wqp' from the prepost list on 'wqset'
949 *
950 * Conditions:
951 * wqset is locked
952 * caller holds a reference on wqp (and is responsible to release it)
953 *
954 * Result:
955 * wqp is invalidated, wqset is potentially updated with a new
956 * prepost ID, and the next element of the prepost list may be
957 * consumed as well (if the list contained only 2 objects)
958 */
static int
wq_prepost_remove(struct waitq_set *wqset,
    struct wq_prepost *wqp)
{
	int more_posts = 1;
	uint64_t next_id = wqp->wqp_post.wqp_next_id;
	uint64_t wqp_id = wqp->wqp_prepostid.id;
	struct wq_prepost *prev_wqp, *next_wqp;

	assert(wqp_type(wqp) == WQP_POST);
	assert(wqset->wqset_q.waitq_prepost == 1);

	if (next_id == wqp_id) {
		/* the list is singular and becoming empty */
		wqset->wqset_prepost_id = 0;
		more_posts = 0;
		goto out;
	}

	/* find the element whose next pointer leads to 'wqp' */
	prev_wqp = wq_prepost_get_rnext(wqp);
	assert(prev_wqp != NULL);
	assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
	assert(prev_wqp->wqp_prepostid.id != wqp_id);
	assert(wqp_type(prev_wqp) == WQP_POST);

	if (prev_wqp->wqp_prepostid.id == next_id) {
		/*
		 * There are two items in the list, and we're removing one. We
		 * only need to keep the WQP_WQ pointer from 'prev_wqp'
		 */
		wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
		wq_prepost_invalidate(prev_wqp);
		wq_prepost_put(prev_wqp);
		more_posts = 0;
		goto out;
	}

	/* prev->next = next */
	prev_wqp->wqp_post.wqp_next_id = next_id;

	/* next->prev = prev */
	next_wqp = wq_prepost_get(next_id);
	assert(next_wqp != NULL);
	assert(next_wqp != wqp);
	assert(next_wqp != prev_wqp);
	assert(wqp_type(next_wqp) == WQP_POST);

	wq_prepost_reset_rnext(next_wqp);
	wq_prepost_rlink(next_wqp, prev_wqp);

	/* If we remove the head of the list, update the wqset */
	if (wqp_id == wqset->wqset_prepost_id) {
		wqset->wqset_prepost_id = next_id;
	}

	/* drop the references taken while re-linking neighbors */
	wq_prepost_put(prev_wqp);
	wq_prepost_put(next_wqp);

out:
	/* in all cases: unlink and invalidate the removed element itself */
	wq_prepost_reset_rnext(wqp);
	wq_prepost_invalidate(wqp);
	return more_posts;
}
1022
1023static struct wq_prepost *
1024wq_prepost_rfirst(uint64_t id)
1025{
1026 struct lt_elem *elem;
1027 elem = lt_elem_list_first(&g_prepost_table, id);
1028 wqp_do_alloc_stats(elem);
1029 return (struct wq_prepost *)(void *)elem;
1030}
1031
1032static struct wq_prepost *
1033wq_prepost_rpop(uint64_t *id, int type)
1034{
1035 struct lt_elem *elem;
1036 elem = lt_elem_list_pop(&g_prepost_table, id, type);
1037 wqp_do_alloc_stats(elem);
1038 return (struct wq_prepost *)(void *)elem;
1039}
1040
/*
 * Release a reserved list of prepost elements headed by 'wqp', preferring
 * the local CPU's cache (when not full) over the global table.
 */
static void
wq_prepost_release_rlist(struct wq_prepost *wqp)
{
	int nelem = 0;
	struct wqp_cache *cache;
	struct lt_elem *elem;

	if (!wqp) {
		return;
	}

	elem = &wqp->wqte;

	/*
	 * These are reserved elements: release them back to the per-cpu pool
	 * if our cache is running low.
	 */
	disable_preemption();
	cache = PERCPU_GET(wqp_cache);
	if (cache->avail < WQP_CACHE_MAX) {
		struct lt_elem *tmp = NULL;
		if (cache->head != LT_IDX_MAX) {
			tmp = lt_elem_list_first(&g_prepost_table, cache->head);
		}
		/* splice the released list in front of the cached elements */
		nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);
		cache->head = elem->lt_id.id;
		cache->avail += nelem;
		enable_preemption();
		return;
	}
	enable_preemption();

	/* release these elements back to the main table */
	nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);

#if CONFIG_WAITQ_STATS
	g_prepost_table.nreserved_releases += 1;
	OSDecrementAtomic64(&g_prepost_table.nreservations);
#endif
}
1081
1082typedef int (*wqp_callback_func)(struct waitq_set *wqset,
1083 void *ctx,
1084 struct wq_prepost *wqp,
1085 struct waitq *waitq);
1086
1087/**
1088 * iterate over a chain of preposts associated with a waitq set.
1089 *
1090 * Conditions:
1091 * wqset is locked
1092 *
1093 * Notes:
1094 * This loop performs automatic prepost chain management / culling, and
1095 * may reset or adjust the waitq set's prepost ID pointer. If you don't
1096 * want this extra processing, you can use wq_prepost_iterate().
1097 */
static int
wq_prepost_foreach_locked(struct waitq_set *wqset,
    void *ctx, wqp_callback_func cb)
{
	int ret = WQ_ITERATE_SUCCESS;
	struct wq_prepost *wqp, *tmp_wqp;

	assert(cb != NULL);

	/* nothing to do if the set has never been preposted to */
	if (!wqset || !waitq_set_maybe_preposted(wqset)) {
		return WQ_ITERATE_SUCCESS;
	}

restart:
	wqp = wq_prepost_get(wqset->wqset_prepost_id);
	if (!wqp) {
		/*
		 * The prepost object is no longer valid, reset the waitq
		 * set's prepost id.
		 */
		wqset->wqset_prepost_id = 0;
		return WQ_ITERATE_SUCCESS;
	}

	if (wqp_type(wqp) == WQP_WQ) {
		/*
		 * Single-prepost case: the set's prepost ID points directly
		 * at one WQP_WQ object. Invoke the callback once and
		 * interpret its return value.
		 */
		uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;

		ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);

		switch (ret) {
		case WQ_ITERATE_INVALIDATE_CONTINUE:
			/* the caller wants to remove the only prepost here */
			assert(wqp_id == wqset->wqset_prepost_id);
			wqset->wqset_prepost_id = 0;
			OS_FALLTHROUGH;
		case WQ_ITERATE_CONTINUE:
			wq_prepost_put(wqp);
			ret = WQ_ITERATE_SUCCESS;
			break;
		case WQ_ITERATE_RESTART:
			wq_prepost_put(wqp);
			OS_FALLTHROUGH;
		case WQ_ITERATE_DROPPED:
			/* DROPPED: the callback already released our ref */
			goto restart;
		default:
			wq_prepost_put(wqp);
			break;
		}
		return ret;
	}

	assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
	assert(wqp_type(wqp) == WQP_POST);

	/*
	 * At this point we know we have a list of POST objects.
	 * Grab a handle to the last element in the list and start
	 * the iteration.
	 */
	tmp_wqp = wq_prepost_get_rnext(wqp);
	assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);

	uint64_t last_id = tmp_wqp->wqp_prepostid.id;
	wq_prepost_put(tmp_wqp);

	ret = WQ_ITERATE_SUCCESS;
	for (;;) {
		uint64_t wqp_id, first_id, next_id;

		/*
		 * Snapshot the list state so we can detect concurrent
		 * mutation after the callback (which may drop the set lock).
		 */
		wqp_id = wqp->wqp_prepostid.id;
		first_id = wqset->wqset_prepost_id;
		next_id = wqp->wqp_post.wqp_next_id;

		/* grab the WQP_WQ object this _POST points to */
		tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
		if (!tmp_wqp) {
			/*
			 * This WQP_POST object points to an invalid
			 * WQP_WQ object - remove the POST object from
			 * the list.
			 */
			if (wq_prepost_remove(wqset, wqp) == 0) {
				/* removal emptied/restructured the list */
				wq_prepost_put(wqp);
				goto restart;
			}
			goto next_prepost;
		}
		assert(wqp_type(tmp_wqp) == WQP_WQ);
		/*
		 * make the callback: note that this could remove 'wqp' or
		 * drop the lock on our waitq set. We need to re-validate
		 * our state when this function returns.
		 */
		ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);
		wq_prepost_put(tmp_wqp);

		switch (ret) {
		case WQ_ITERATE_CONTINUE:
			/* continue iteration */
			break;
		case WQ_ITERATE_INVALIDATE_CONTINUE:
			assert(next_id == wqp->wqp_post.wqp_next_id);
			if (wq_prepost_remove(wqset, wqp) == 0) {
				wq_prepost_put(wqp);
				goto restart;
			}
			goto next_prepost;
		case WQ_ITERATE_RESTART:
			wq_prepost_put(wqp);
			OS_FALLTHROUGH;
		case WQ_ITERATE_DROPPED:
			/* the callback dropped the ref to wqp: just restart */
			goto restart;
		default:
			/* break out of the iteration for some other reason */
			goto finish_prepost_foreach;
		}

		/*
		 * the set lock may have been dropped during callback,
		 * if something looks different, restart the prepost iteration
		 */
		if (!wqp_is_valid(wqp) ||
		    (wqp->wqp_post.wqp_next_id != next_id) ||
		    wqset->wqset_prepost_id != first_id) {
			wq_prepost_put(wqp);
			goto restart;
		}

next_prepost:
		/* this was the last object in the list */
		if (wqp_id == last_id) {
			break;
		}

		/* get the next object */
		tmp_wqp = wq_prepost_get(next_id);
		if (!tmp_wqp) {
			/*
			 * At this point we've already checked our state
			 * after the callback (which may have dropped the set
			 * lock). If we find an invalid member of the list
			 * then something is wrong.
			 */
			panic("Invalid WQP_POST member 0x%llx in waitq set "
			    "0x%llx prepost list (first:%llx, "
			    "wqp:%p)",
			    next_id, wqset->wqset_id, first_id, wqp);
		}
		wq_prepost_put(wqp);
		wqp = tmp_wqp;

		assert(wqp_type(wqp) == WQP_POST);
	}

finish_prepost_foreach:
	wq_prepost_put(wqp);
	if (ret == WQ_ITERATE_CONTINUE) {
		ret = WQ_ITERATE_SUCCESS;
	}

	return ret;
}
1261
1262/**
1263 * Perform a simple loop over a chain of prepost objects
1264 *
1265 * Conditions:
1266 * If 'prepost_id' is associated with a waitq (set) then that object must
1267 * be locked before calling this function.
1268 * Callback function, 'cb', must be able to handle a NULL wqset pointer
1269 * and a NULL waitq pointer!
1270 *
1271 * Notes:
1272 * This prepost chain iteration will _not_ automatically adjust any chain
1273 * element or linkage. This is the responsibility of the caller! If you
1274 * want automatic prepost chain management (at a cost of extra CPU time),
1275 * you can use: wq_prepost_foreach_locked().
1276 */
static int
wq_prepost_iterate(uint64_t prepost_id,
    void *ctx, wqp_callback_func cb)
{
	int ret;
	struct wq_prepost *wqp;

	if (!prepost_id) {
		return WQ_ITERATE_SUCCESS;
	}

	wqp = wq_prepost_get(prepost_id);
	if (!wqp) {
		/* stale/invalid prepost ID: nothing to iterate */
		return WQ_ITERATE_SUCCESS;
	}

	if (wqp_type(wqp) == WQP_WQ) {
		/* single WQP_WQ prepost: invoke the callback at most once */
		ret = WQ_ITERATE_SUCCESS;
		if (cb) {
			ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
		}

		/* WQ_ITERATE_DROPPED means the callback consumed our ref */
		if (ret != WQ_ITERATE_DROPPED) {
			wq_prepost_put(wqp);
		}
		return ret;
	}

	assert(wqp->wqp_prepostid.id == prepost_id);
	assert(wqp_type(wqp) == WQP_POST);

	/* at this point we know we have a list of POST objects */
	uint64_t next_id;

	ret = WQ_ITERATE_CONTINUE;
	do {
		struct wq_prepost *tmp_wqp;
		struct waitq *wq = NULL;

		next_id = wqp->wqp_post.wqp_next_id;

		/* grab the WQP_WQ object this _POST points to */
		tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
		if (tmp_wqp) {
			assert(wqp_type(tmp_wqp) == WQP_WQ);
			wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
		}

		/* 'wq' stays NULL if the WQP_WQ was invalid: cb must cope */
		if (cb) {
			ret = cb(NULL, ctx, wqp, wq);
		}
		if (tmp_wqp) {
			wq_prepost_put(tmp_wqp);
		}

		if (ret != WQ_ITERATE_CONTINUE) {
			break;
		}

		tmp_wqp = wq_prepost_get(next_id);
		if (!tmp_wqp) {
			/*
			 * the chain is broken: nothing we can do here besides
			 * bail from the iteration.
			 */
			ret = WQ_ITERATE_ABORTED;
			break;
		}

		wq_prepost_put(wqp);
		wqp = tmp_wqp;

		assert(wqp_type(wqp) == WQP_POST);
	} while (next_id != prepost_id);

	/* on WQ_ITERATE_DROPPED the callback already released the ref */
	if (ret != WQ_ITERATE_DROPPED) {
		wq_prepost_put(wqp);
	}

	if (ret == WQ_ITERATE_CONTINUE) {
		ret = WQ_ITERATE_SUCCESS;
	}
	return ret;
}
1361
1362
/*
 * Context for wq_is_preposted_on_set_cb(): records whether 'posting_wq'
 * appears anywhere in a waitq set's prepost chain.
 */
struct _is_posted_ctx {
	struct waitq *posting_wq;       /* the waitq we are searching for */
	int did_prepost;                /* set to 1 when posting_wq is found */
};
1367
1368static int
1369wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
1370 struct wq_prepost *wqp, struct waitq *waitq)
1371{
1372 struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
1373
1374 (void)wqset;
1375 (void)wqp;
1376
1377 /*
1378 * Don't early-out, run through the _entire_ list:
1379 * This ensures that we retain a minimum number of invalid elements.
1380 */
1381 if (pctx->posting_wq == waitq) {
1382 pctx->did_prepost = 1;
1383 }
1384
1385 return WQ_ITERATE_CONTINUE;
1386}
1387
1388
1389/**
1390 * checks if 'waitq' has already preposted on 'wqset'
1391 *
1392 * Parameters:
1393 * waitq The waitq that's preposting
1394 * wqset The set onto which waitq may be preposted
1395 *
1396 * Conditions:
1397 * both waitq and wqset are locked
1398 *
1399 * Returns non-zero if 'waitq' has already preposted to 'wqset'
1400 */
1401static int
1402wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
1403{
1404 int ret;
1405 struct _is_posted_ctx pctx;
1406
1407 /*
1408 * If the set's only prepost matches the waitq's prepost ID,
1409 * then it obviously already preposted to the set.
1410 */
1411 if (waitq->waitq_prepost_id != 0 &&
1412 wqset->wqset_prepost_id == waitq->waitq_prepost_id) {
1413 return 1;
1414 }
1415
1416 /* use full prepost iteration: always trim the list */
1417 pctx.posting_wq = waitq;
1418 pctx.did_prepost = 0;
1419 ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
1420 wq_is_preposted_on_set_cb);
1421 return pctx.did_prepost;
1422}
1423
1424static struct wq_prepost *
1425wq_get_prepost_obj(uint64_t *reserved, int type)
1426{
1427 struct wq_prepost *wqp = NULL;
1428 /*
1429 * don't fail just because the caller doesn't have enough
1430 * reservations, we've kept a low-water mark on the prepost table,
1431 * so there should be some available for us.
1432 */
1433 if (reserved && *reserved) {
1434 wqp = wq_prepost_rpop(reserved, type);
1435 assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);
1436 } else {
1437 /*
1438 * TODO: if in interrupt context, grab from a special
1439 * region / reserved list!
1440 */
1441 wqp = wq_prepost_alloc(type, 1);
1442 }
1443
1444 if (wqp == NULL) {
1445 panic("Couldn't allocate prepost object!");
1446 }
1447 return wqp;
1448}
1449
1450
1451/**
1452 * prepost a waitq onto a waitq set
1453 *
1454 * Parameters:
1455 * wqset The set onto which waitq will be preposted
1456 * waitq The waitq that's preposting
1457 * reserved List (lt_elem_list_ style) of pre-allocated prepost elements
1458 * Could be NULL
1459 *
1460 * Conditions:
1461 * both wqset and waitq are locked
1462 *
1463 * Notes:
1464 * If reserved is NULL, this may block on prepost table growth.
1465 */
static void
wq_prepost_do_post_locked(struct waitq_set *wqset,
    struct waitq *waitq,
    uint64_t *reserved)
{
	struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;

	assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));

	/*
	 * nothing to do if it's already preposted:
	 * note that this also culls any invalid prepost objects
	 */
	if (wq_is_preposted_on_set(waitq, wqset)) {
		return;
	}

	assert(waitqs_is_linked(wqset));

	/*
	 * This function is called because an event is being posted to 'waitq'.
	 * We need a prepost object associated with this queue. Allocate one
	 * now if the waitq isn't already associated with one.
	 */
	if (waitq->waitq_prepost_id == 0) {
		struct wq_prepost *wqp;
		wqp = wq_get_prepost_obj(reserved, WQP_WQ);
		wqp->wqp_wq.wqp_wq_ptr = waitq;
		wqp_set_valid(wqp);
		/* the WQP_WQ is shared by every set this waitq preposts to */
		waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
		wq_prepost_put(wqp);
	}

#if CONFIG_LTABLE_STATS
	g_prepost_table.npreposts += 1;
#endif

	wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
	    waitq->waitq_prepost_id, wqset->wqset_id);

	if (wqset->wqset_prepost_id == 0) {
		/* the set has no previous preposts */
		wqset->wqset_prepost_id = waitq->waitq_prepost_id;
		return;
	}

	wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
	if (!wqp_head) {
		/* the previous prepost has become invalid */
		wqset->wqset_prepost_id = waitq->waitq_prepost_id;
		return;
	}

	assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);

	/*
	 * If we get here, we're going to need at least one new wq_prepost
	 * object. If the previous wqset_prepost_id points to a WQP_WQ, we
	 * actually need to allocate 2 wq_prepost objects because the WQP_WQ
	 * is tied to the waitq and shared across all sets.
	 */
	wqp_post = wq_get_prepost_obj(reserved, WQP_POST);

	wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
	wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
	    waitq->waitq_prepost_id);

	if (wqp_type(wqp_head) == WQP_WQ) {
		/*
		 * We must replace the wqset_prepost_id with a pointer
		 * to two new WQP_POST objects
		 */
		uint64_t wqp_id = wqp_head->wqp_prepostid.id;
		wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
		    "replacing with two POST preposts",
		    wqset->wqset_id, wqp_id);

		/* drop the old reference */
		wq_prepost_put(wqp_head);

		/* grab another new object (the 2nd of two) */
		wqp_head = wq_get_prepost_obj(reserved, WQP_POST);

		/* point this one to the original WQP_WQ object */
		wqp_head->wqp_post.wqp_wq_id = wqp_id;
		wqdbg_v("POST 0x%llx :: WQ 0x%llx",
		    wqp_head->wqp_prepostid.id, wqp_id);

		/* link it to the new wqp_post object allocated earlier */
		wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
		/* make the list a double-linked and circular */
		wq_prepost_rlink(wqp_head, wqp_post);

		/*
		 * Finish setting up the new prepost: point it back to the
		 * POST object we allocated to replace the original wqset
		 * WQ prepost object
		 */
		wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
		wq_prepost_rlink(wqp_post, wqp_head);

		/* mark objects valid, and reset the wqset prepost list head */
		wqp_set_valid(wqp_head);
		wqp_set_valid(wqp_post);
		wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;

		/* release both references */
		wq_prepost_put(wqp_head);
		wq_prepost_put(wqp_post);

		/*
		 * NOTE(review): this debug print reads wqp_head/wqp_post
		 * fields after their references were dropped - presumably
		 * safe because the set lock is still held and prevents
		 * reuse, but confirm before relying on it.
		 */
		wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
		    wqset->wqset_id, wqset->wqset_prepost_id,
		    wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
		    wqp_post->wqp_prepostid.id,
		    wqp_post->wqp_post.wqp_next_id);
		return;
	}

	assert(wqp_type(wqp_head) == WQP_POST);

	/*
	 * Add the new prepost to the end of the prepost list
	 */
	wqp_tail = wq_prepost_get_rnext(wqp_head);
	assert(wqp_tail != NULL);
	assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);

	/*
	 * link the head to the new tail
	 * NOTE: this needs to happen first in case wqp_tail == wqp_head
	 */
	wq_prepost_reset_rnext(wqp_head);
	wq_prepost_rlink(wqp_head, wqp_post);

	/* point the new object to the list head, and list tail */
	wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
	wq_prepost_rlink(wqp_post, wqp_tail);

	/* point the last item in the waitq set's list to the new object */
	wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;

	wqp_set_valid(wqp_post);

	wq_prepost_put(wqp_head);
	wq_prepost_put(wqp_tail);
	wq_prepost_put(wqp_post);

	wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
	    "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
	    wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
	    wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);

	return;
}
1621
1622
1623/* ----------------------------------------------------------------------
1624 *
1625 * Stats collection / reporting
1626 *
1627 * ---------------------------------------------------------------------- */
1628#if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS
/*
 * Snapshot the sizing and usage counters of 'table' into 'stats'.
 * Caller must pass a non-NULL 'stats'.
 */
static void
wq_table_stats(struct link_table *table, struct wq_table_stats *stats)
{
	stats->version = WAITQ_STATS_VERSION;
	/* table geometry */
	stats->table_elements = table->nelem;
	stats->table_used_elems = table->used_elem;
	stats->table_elem_sz = table->elem_sz;
	stats->table_slabs = table->nslabs;
	stats->table_slab_sz = table->slab_sz;

	/* lifetime counters */
	stats->table_num_allocs = table->nallocs;
	stats->table_num_preposts = table->npreposts;
	stats->table_num_reservations = table->nreservations;

	/* high-water marks and averages */
	stats->table_max_used = table->max_used;
	stats->table_avg_used = table->avg_used;
	stats->table_max_reservations = table->max_reservations;
	stats->table_avg_reservations = table->avg_reservations;
}
1648
1649void
1650waitq_link_stats(struct wq_table_stats *stats)
1651{
1652 if (!stats) {
1653 return;
1654 }
1655 wq_table_stats(&g_wqlinktable, stats);
1656}
1657
1658void
1659waitq_prepost_stats(struct wq_table_stats *stats)
1660{
1661 wq_table_stats(&g_prepost_table, stats);
1662}
1663#endif
1664
1665
1666/* ----------------------------------------------------------------------
1667 *
1668 * Global Wait Queues
1669 *
1670 * ---------------------------------------------------------------------- */
1671
/*
 * Boot-time global waitq state: a single static queue is used until
 * waitq_bootstrap() allocates the real hash table of global waitqs.
 */
static struct waitq g_boot_waitq;
static struct waitq *global_waitqs = &g_boot_waitq;
static uint32_t g_num_waitqs = 1;       /* always a power of 2 after bootstrap */

/*
 * Zero out the used MSBs of the event.
 */
#define _CAST_TO_EVENT_MASK(event) ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
1680
1681static __inline__ uint32_t
1682waitq_hash(char *key, size_t length)
1683{
1684 uint32_t hash = os_hash_jenkins(key, length);
1685
1686 hash &= (g_num_waitqs - 1);
1687 return hash;
1688}
1689
1690/* return a global waitq pointer corresponding to the given event */
1691struct waitq *
1692_global_eventq(char *event, size_t event_length)
1693{
1694 return &global_waitqs[waitq_hash(event, event_length)];
1695}
1696
1697/* return an indexed global waitq pointer */
1698struct waitq *
1699global_waitq(int index)
1700{
1701 return &global_waitqs[index % g_num_waitqs];
1702}
1703
1704
1705#if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
1706/* this global is for lldb */
1707const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
1708
1709static __inline__ void
1710waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
1711{
1712 uintptr_t buf[NWAITQ_BTFRAMES + skip];
1713 if (skip < 0) {
1714 skip = 0;
1715 }
1716 memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
1717 backtrace(buf, g_nwaitq_btframes + skip, NULL);
1718 memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
1719}
1720#else /* no stats */
1721#define waitq_grab_backtrace(...)
1722#endif
1723
1724#if CONFIG_WAITQ_STATS
1725
1726struct wq_stats g_boot_stats;
1727struct wq_stats *g_waitq_stats = &g_boot_stats;
1728
1729static __inline__ struct wq_stats *
1730waitq_global_stats(struct waitq *waitq)
1731{
1732 struct wq_stats *wqs;
1733 uint32_t idx;
1734
1735 if (!waitq_is_global(waitq)) {
1736 return NULL;
1737 }
1738
1739 idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
1740 assert(idx < g_num_waitqs);
1741 wqs = &g_waitq_stats[idx];
1742 return wqs;
1743}
1744
1745static __inline__ void
1746waitq_stats_count_wait(struct waitq *waitq)
1747{
1748 struct wq_stats *wqs = waitq_global_stats(waitq);
1749 if (wqs != NULL) {
1750 wqs->waits++;
1751 waitq_grab_backtrace(wqs->last_wait, 2);
1752 }
1753}
1754
1755static __inline__ void
1756waitq_stats_count_wakeup(struct waitq *waitq)
1757{
1758 struct wq_stats *wqs = waitq_global_stats(waitq);
1759 if (wqs != NULL) {
1760 wqs->wakeups++;
1761 waitq_grab_backtrace(wqs->last_wakeup, 2);
1762 }
1763}
1764
1765static __inline__ void
1766waitq_stats_count_clear_wakeup(struct waitq *waitq)
1767{
1768 struct wq_stats *wqs = waitq_global_stats(waitq);
1769 if (wqs != NULL) {
1770 wqs->wakeups++;
1771 wqs->clears++;
1772 waitq_grab_backtrace(wqs->last_wakeup, 2);
1773 }
1774}
1775
1776static __inline__ void
1777waitq_stats_count_fail(struct waitq *waitq)
1778{
1779 struct wq_stats *wqs = waitq_global_stats(waitq);
1780 if (wqs != NULL) {
1781 wqs->failed_wakeups++;
1782 waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
1783 }
1784}
1785#else /* !CONFIG_WAITQ_STATS */
1786#define waitq_stats_count_wait(q) do { } while (0)
1787#define waitq_stats_count_wakeup(q) do { } while (0)
1788#define waitq_stats_count_clear_wakeup(q) do { } while (0)
1789#define waitq_stats_count_fail(q) do { } while (0)
1790#endif
1791
1792int
1793waitq_is_valid(struct waitq *waitq)
1794{
1795 return (waitq != NULL) && waitq->waitq_isvalid;
1796}
1797
1798int
1799waitq_set_is_valid(struct waitq_set *wqset)
1800{
1801 return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);
1802}
1803
1804int
1805waitq_is_global(struct waitq *waitq)
1806{
1807 if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs) {
1808 return 1;
1809 }
1810 return 0;
1811}
1812
/* returns the waitq's irq-safe flag */
int
waitq_irq_safe(struct waitq *waitq)
{
	/* global wait queues have this bit set on initialization */
	return waitq->waitq_irq;
}
1819
1820static inline bool
1821waitq_empty(struct waitq *wq)
1822{
1823 if (waitq_is_turnstile_queue(wq)) {
1824 return priority_queue_empty(&wq->waitq_prio_queue);
1825 } else if (waitq_is_turnstile_proxy(wq)) {
1826 struct turnstile *ts = wq->waitq_ts;
1827 return ts == TURNSTILE_NULL ||
1828 priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
1829 } else {
1830 return queue_empty(&wq->waitq_queue);
1831 }
1832}
1833
1834static struct waitq *
1835waitq_get_safeq(struct waitq *waitq)
1836{
1837 /* Check if it's a port waitq */
1838 if (waitq_is_turnstile_proxy(waitq)) {
1839 struct turnstile *ts = waitq->waitq_ts;
1840 return ts ? &ts->ts_waitq : NULL;
1841 }
1842 return global_eventq(waitq);
1843}
1844
1845static uint32_t
1846waitq_hash_size(void)
1847{
1848 uint32_t hsize, queues;
1849
1850 if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) {
1851 return hsize;
1852 }
1853
1854 queues = thread_max / 5;
1855 hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
1856
1857 return hsize;
1858}
1859
1860/*
1861 * Since the priority ordered waitq uses basepri as the
1862 * ordering key assert that this value fits in a uint8_t.
1863 */
1864static_assert(MAXPRI <= UINT8_MAX);
1865
1866static inline void
1867waitq_thread_insert(struct waitq *wq,
1868 thread_t thread, boolean_t fifo)
1869{
1870 if (waitq_is_turnstile_queue(wq)) {
1871 turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
1872 turnstile_waitq_add_thread_priority_queue(wq, thread);
1873 } else {
1874 turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
1875 if (fifo) {
1876 enqueue_tail(&wq->waitq_queue, &thread->wait_links);
1877 } else {
1878 enqueue_head(&wq->waitq_queue, &thread->wait_links);
1879 }
1880 }
1881}
1882
/* dequeue 'thread' from 'wq'; counterpart of waitq_thread_insert() */
static inline void
waitq_thread_remove(struct waitq *wq,
    thread_t thread)
{
	if (waitq_is_turnstile_queue(wq)) {
		/* trace the removal from the turnstile priority queue */
		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
		    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
		    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
		    thread_tid(thread),
		    0, 0, 0);
		priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links);
	} else {
		/* regular waitq: unlink from the doubly-linked queue */
		remqueue(&(thread->wait_links));
	}
}
1898
1899void
1900waitq_bootstrap(void)
1901{
1902 kern_return_t kret;
1903 uint32_t whsize, qsz, tmp32;
1904
1905 g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
1906 if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) {
1907 g_min_free_table_elem = tmp32;
1908 }
1909 wqdbg("Minimum free table elements: %d", tmp32);
1910
1911 /*
1912 * Determine the amount of memory we're willing to reserve for
1913 * the waitqueue hash table
1914 */
1915 whsize = waitq_hash_size();
1916
1917 /* Determine the number of waitqueues we can fit. */
1918 qsz = sizeof(struct waitq);
1919 whsize = ROUNDDOWN(whsize, qsz);
1920 g_num_waitqs = whsize / qsz;
1921
1922 /*
1923 * The hash algorithm requires that this be a power of 2, so we
1924 * just mask off all the low-order bits.
1925 */
1926 for (uint32_t i = 0; i < 31; i++) {
1927 uint32_t bit = (1 << i);
1928 if ((g_num_waitqs & bit) == g_num_waitqs) {
1929 break;
1930 }
1931 g_num_waitqs &= ~bit;
1932 }
1933 assert(g_num_waitqs > 0);
1934
1935 /* Now determine how much memory we really need. */
1936 whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
1937
1938 wqdbg("allocating %d global queues (%d bytes)", g_num_waitqs, whsize);
1939 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
1940 whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1941 if (kret != KERN_SUCCESS || global_waitqs == NULL) {
1942 panic("kernel_memory_allocate() failed to alloc global_waitqs"
1943 ", error: %d, whsize: 0x%x", kret, whsize);
1944 }
1945
1946#if CONFIG_WAITQ_STATS
1947 whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
1948 kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
1949 whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1950 if (kret != KERN_SUCCESS || global_waitqs == NULL) {
1951 panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
1952 ", error: %d, whsize: 0x%x", kret, whsize);
1953 }
1954 memset(g_waitq_stats, 0, whsize);
1955#endif
1956
1957 for (uint32_t i = 0; i < g_num_waitqs; i++) {
1958 waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
1959 }
1960
1961 /* initialize the global waitq link table */
1962 wql_init();
1963
1964 /* initialize the global waitq prepost table */
1965 wqp_init();
1966}
1967
1968
1969/* ----------------------------------------------------------------------
1970 *
1971 * Wait Queue Implementation
1972 *
1973 * ---------------------------------------------------------------------- */
1974
1975/*
1976 * Double the standard lock timeout, because wait queues tend
1977 * to iterate over a number of threads - locking each. If there is
1978 * a problem with a thread lock, it normally times out at the wait
1979 * queue level first, hiding the real problem.
1980 */
1981/* For x86, the hardware timeout is in TSC units. */
1982#if defined(__i386__) || defined(__x86_64__)
1983#define hwLockTimeOut LockTimeOutTSC
1984#else
1985#define hwLockTimeOut LockTimeOut
1986#endif
1987
/*
 * Acquire the waitq lock, tolerating long stalls while the machine's
 * timeout mechanism is suspended (e.g. debugger activity); panics if
 * the lock cannot be taken once timeouts are active again.
 */
void
waitq_lock(struct waitq *wq)
{
	/* 2x the standard timeout - see the hwLockTimeOut comment above */
	if (__improbable(waitq_lock_to(wq,
	    hwLockTimeOut * 2) == 0)) {
		boolean_t wql_acquired = FALSE;

		while (machine_timeout_suspended()) {
			mp_enable_preemption();
			wql_acquired = waitq_lock_to(wq,
			    hwLockTimeOut * 2);
			if (wql_acquired) {
				break;
			}
		}
		if (wql_acquired == FALSE) {
			panic("waitq deadlock - waitq=%p, cpu=%d\n",
			    wq, cpu_number());
		}
	}
#if defined(__x86_64__)
	/* NOTE(review): pltrace(FALSE/TRUE) appears to bracket the
	 * lock-held region for tracing - confirm against pltrace docs */
	pltrace(FALSE);
#endif
	assert(waitq_held(wq));
}
2013
/* release the waitq lock; caller must hold it */
void
waitq_unlock(struct waitq *wq)
{
	assert(waitq_held(wq));
#if defined(__x86_64__)
	/* NOTE(review): pltrace(TRUE) appears to mark the end of the
	 * lock-held region for tracing - confirm */
	pltrace(TRUE);
#endif
	waitq_lock_unlock(wq);
}
2023
2024
2025/**
2026 * clear the thread-related waitq state
2027 *
2028 * Conditions:
2029 * 'thread' is locked
2030 */
static inline void
thread_clear_waitq_state(thread_t thread)
{
	/* detach the thread from whatever it was waiting on */
	thread->waitq = NULL;
	thread->wait_event = NO_EVENT64;
	thread->at_safe_point = FALSE;
}
2038
2039
/*
 * Optional per-thread filter used during selection; returns the chosen
 * thread or THREAD_NULL to skip it.
 */
typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
    int is_global, thread_t thread);

/* argument bundle threaded through the waitq selection routines */
struct waitq_select_args {
	/* input parameters */
	struct waitq *posted_waitq;             /* waitq that was posted */
	struct waitq *waitq;                    /* waitq being searched */
	event64_t event;                        /* wait event to match */
	waitq_select_cb select_cb;              /* optional thread filter */
	void *select_ctx;                       /* opaque ctx for select_cb */
	int priority;

	uint64_t *reserved_preposts;            /* pre-allocated prepost objs */

	/* output parameters */
	queue_t threadq;                        /* selected threads land here */
	int max_threads;                        /* limit; <= 0 means no limit */
	int *nthreads;                          /* running count of selected */
	spl_t *spl;                             /* saved interrupt state */
};
2060
2061static void do_waitq_select_n_locked(struct waitq_select_args *args);
2062
2063/**
2064 * callback invoked once for every waitq set to which a waitq belongs
2065 *
2066 * Conditions:
2067 * ctx->posted_waitq is locked
2068 * 'link' points to a valid waitq set
2069 *
2070 * Notes:
2071 * Takes the waitq set lock on the set pointed to by 'link'
2072 * Calls do_waitq_select_n_locked() which could recurse back into
2073 * this function if the waitq set is a member of other sets.
2074 * If no threads were selected, it preposts the input waitq
2075 * onto the waitq set pointed to by 'link'.
2076 */
static int
waitq_select_walk_cb(struct waitq *waitq, void *ctx,
    struct waitq_link *link)
{
	int ret = WQ_ITERATE_CONTINUE;
	/* work on a local copy so each set sees its own 'waitq' field */
	struct waitq_select_args args = *((struct waitq_select_args *)ctx);
	struct waitq_set *wqset;

	(void)waitq;
	assert(wql_type(link) == WQL_WQS);

	wqset = link->wql_wqs.wql_set;
	args.waitq = &wqset->wqset_q;

	assert(!waitq_irq_safe(waitq));
	assert(!waitq_irq_safe(&wqset->wqset_q));

	waitq_set_lock(wqset);
	/*
	 * verify that the link wasn't invalidated just before
	 * we were able to take the lock.
	 */
	if (wqset->wqset_id != link->wql_setid.id) {
		goto out_unlock;
	}

	assert(waitqs_is_linked(wqset));

	/*
	 * Find any threads waiting on this wait queue set,
	 * and recurse into any waitq set to which this set belongs.
	 */
	do_waitq_select_n_locked(&args);

	if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
		/* at least 1 thread was selected and returned: don't prepost */
		if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
			/* break out of the setid walk */
			ret = WQ_ITERATE_FOUND;
		}
	} else if (args.event == NO_EVENT64) {
		/*
		 * No thread selected: prepost 'waitq' to 'wqset'
		 * if wqset can handle preposts and the event is set to 0.
		 * We also make sure to not post waitq sets to other sets.
		 *
		 * If the set doesn't support preposts, but does support
		 * prepost callout/hook interaction, invoke the predefined
		 * callout function and pass the set's 'prepost_hook.' This
		 * could potentially release another thread to handle events.
		 */
		if (waitq_set_can_prepost(wqset)) {
			wq_prepost_do_post_locked(
				wqset, waitq, args.reserved_preposts);
		} else if (waitq_set_has_prepost_hook(wqset)) {
			waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;

			/*
			 * When calling out to the prepost hook,
			 * we drop the waitq lock, to allow for the kevent
			 * subsystem to call into the waitq subsystem again,
			 * without risking a deadlock.
			 *
			 * However, we need to guard against wqset going away,
			 * so we increment the prepost hook use count
			 * while the lock is dropped.
			 *
			 * This lets waitq_set_deinit() know to wait for the
			 * prepost hook call to be done before it can proceed.
			 *
			 * Note: we need to keep preemption disabled the whole
			 * time as waitq_set_deinit will spin on this.
			 */

			disable_preemption();
			os_atomic_add(hook, (uint16_t)1, relaxed);
			waitq_set_unlock(wqset);

			waitq_set__CALLING_PREPOST_HOOK__(hook);

			/* Note: after this decrement, the wqset may be deallocated */
			os_atomic_add(hook, (uint16_t)-1, relaxed);
			enable_preemption();
			return ret;
		}
	}

out_unlock:
	waitq_set_unlock(wqset);
	return ret;
}
2168
2169/**
2170 * Routine to iterate over the waitq for non-priority ordered waitqs
2171 *
2172 * Conditions:
2173 * args->waitq (and args->posted_waitq) is locked
2174 *
2175 * Notes:
2176 * Uses the optional select callback function to refine the selection
2177 * of one or more threads from a waitq. The select callback is invoked
2178 * once for every thread that is found to be waiting on the input args->waitq.
2179 *
2180 * If one or more threads are selected, this may disable interrupts.
2181 * The previous interrupt state is returned in args->spl and should
2182 * be used in a call to splx() if threads are returned to the caller.
2183 */
static thread_t
waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    spl_t spl, struct waitq_select_args *args,
    uint32_t *remaining_eventmask)
{
	int max_threads = args->max_threads;
	int *nthreads = args->nthreads;
	thread_t thread = THREAD_NULL;
	/* first thread that matched waitq+event, even if not selected */
	thread_t first_thread = THREAD_NULL;

	qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
		thread_t t = THREAD_NULL;
		assert_thread_magic(thread);

		/*
		 * For non-priority ordered waitqs, we allow multiple events to be
		 * mux'ed into the same waitq. Also safeqs may contain threads from
		 * multiple waitqs. Only pick threads that match the
		 * requested wait event.
		 */
		if (thread->waitq == waitq && thread->wait_event == args->event) {
			t = thread;
			if (first_thread == THREAD_NULL) {
				first_thread = thread;
			}

			/* allow the caller to further refine the selection */
			if (args->select_cb) {
				t = args->select_cb(args->select_ctx, waitq,
				    waitq_is_global(waitq), thread);
			}
			if (t != THREAD_NULL) {
				*nthreads += 1;
				if (args->threadq) {
					/* if output queue, add locked thread to it */
					if (*nthreads == 1) {
						/* disable interrupts once, on first selection */
						*(args->spl) = (safeq != waitq) ? spl : splsched();
					}
					thread_lock(t);
					thread_clear_waitq_state(t);
					re_queue_tail(args->threadq, &t->wait_links);
				}
				/* only enqueue up to 'max' threads */
				if (*nthreads >= max_threads && max_threads > 0) {
					break;
				}
			}
		}
		/* thread wasn't selected so track its event */
		if (t == THREAD_NULL) {
			*remaining_eventmask |= (thread->waitq != safeq) ?
			    _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);
		}
	}

	return first_thread;
}
2241
2242/**
2243 * Routine to iterate and remove threads from priority ordered waitqs
2244 *
2245 * Conditions:
2246 * args->waitq (and args->posted_waitq) is locked
2247 *
2248 * Notes:
2249 * The priority ordered waitqs only support maximum priority element removal.
2250 *
2251 * Also, the implementation makes sure that all threads in a priority ordered
2252 * waitq are waiting on the same wait event. This is not necessarily true for
2253 * non-priority ordered waitqs. If one or more threads are selected, this may
2254 * disable interrupts. The previous interrupt state is returned in args->spl
2255 * and should be used in a call to splx() if threads are returned to the caller.
2256 *
2257 * In the future, we could support priority ordered waitqs with multiple wait
2258 * events in the same queue. The way to implement that would be to keep removing
2259 * elements from the waitq and if the event does not match the requested one,
2260 * add it to a local list. This local list of elements needs to be re-inserted
2261 * into the priority queue at the end and the select_cb return value &
2262 * remaining_eventmask would need to be handled appropriately. The implementation
2263 * is not very efficient but would work functionally.
2264 */
static thread_t
waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
    spl_t spl, struct waitq_select_args *args,
    uint32_t *remaining_eventmask)
{
	int max_threads = args->max_threads;
	int *nthreads = args->nthreads;
	thread_t first_thread = THREAD_NULL;
	thread_t thread = THREAD_NULL;

	/*
	 * The waitq select routines need to handle two cases:
	 * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)
	 *         Get the maximum priority thread from the waitq without removing it.
	 *         In that case args->threadq == NULL and max_threads == 1.
	 * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)
	 *         Get max_threads (if available) while removing them from the waitq.
	 *         In that case args->threadq != NULL and max_threads is one of {-1, 1}.
	 *
	 * The only possible values for remaining_eventmask for the priority queue
	 * waitq are either 0 (for the remove all threads case) or the original
	 * safeq->waitq_eventmask (for the lookup/remove one thread cases).
	 */
	*remaining_eventmask = safeq->waitq_eventmask;
	boolean_t remove_op = !!(args->threadq);

	/* max_threads <= 0 means "no limit" */
	while ((max_threads <= 0) || (*nthreads < max_threads)) {
		if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
			*remaining_eventmask = 0;
			break;
		}

		if (remove_op) {
			thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
			    struct thread, wait_prioq_links);
		} else {
			/* For the peek operation, the only valid value for max_threads is 1 */
			assert(max_threads == 1);
			thread = priority_queue_max(&safeq->waitq_prio_queue,
			    struct thread, wait_prioq_links);
		}
		/*
		 * Ensure the wait event matches since priority ordered waitqs do not
		 * support multiple events in the same waitq.
		 */
		assert((thread->waitq == waitq) && (thread->wait_event == args->event));

		if (args->select_cb) {
			/*
			 * Call the select_cb passed into the waitq_select args. The callback
			 * updates the select_ctx with information about the highest priority
			 * thread which is eventually used by the caller.
			 */
			thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,
			    waitq_is_global(waitq), thread);
			if (!remove_op) {
				/* For the peek operation, the thread should not be selected for addition */
				assert(ret_thread == THREAD_NULL);
			} else {
				/*
				 * For the remove operation, the select routine should always return a valid
				 * thread for priority waitqs. Since all threads in a prioq are equally
				 * eligible, it should match the thread removed from the prioq. If this
				 * invariant changes, the implementation would need to handle the
				 * remaining_eventmask here correctly.
				 */
				assert(ret_thread == thread);
			}
		}

		if (first_thread == THREAD_NULL) {
			first_thread = thread;
			/*
			 * turnstile_kernel_update_inheritor_on_wake_locked will lock
			 * first_thread, so call it before locking it.
			 */
			if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
				turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
			}
		}

		/* For the peek operation, break out early */
		if (!remove_op) {
			break;
		}

		/* Add the thread to the result thread list */
		*nthreads += 1;
		if (*nthreads == 1) {
			/* first selection: capture the interrupt state the caller must splx() */
			*(args->spl) = (safeq != waitq) ? spl : splsched();
		}
		thread_lock(thread);
		thread_clear_waitq_state(thread);
		enqueue_tail(args->threadq, &(thread->wait_links));
	}

	return first_thread;
}
2363
2364/**
2365 * generic thread selection from a waitq (and sets to which the waitq belongs)
2366 *
2367 * Conditions:
2368 * args->waitq (and args->posted_waitq) is locked
2369 *
2370 * Notes:
2371 * Uses the optional select callback function to refine the selection
2372 * of one or more threads from a waitq and any set to which the waitq
2373 * belongs. The select callback is invoked once for every thread that
2374 * is found to be waiting on the input args->waitq.
2375 *
2376 * If one or more threads are selected, this may disable interrupts.
2377 * The previous interrupt state is returned in args->spl and should
2378 * be used in a call to splx() if threads are returned to the caller.
2379 */
static void
do_waitq_select_n_locked(struct waitq_select_args *args)
{
	struct waitq *waitq = args->waitq;
	int max_threads = args->max_threads;
	thread_t first_thread = THREAD_NULL;
	struct waitq *safeq;
	uint32_t remaining_eventmask = 0;
	uint32_t eventmask;
	int *nthreads = args->nthreads;
	spl_t spl = 0;

	assert(max_threads != 0);

	if (!waitq_irq_safe(waitq)) {
		/* JMM - add flag to waitq to avoid global lookup if no waiters */
		eventmask = _CAST_TO_EVENT_MASK(waitq);
		safeq = waitq_get_safeq(waitq);
		if (safeq == NULL) {
			/*
			 * in the WQT_TSPROXY case, if there's no turnstile,
			 * there's no queue and no waiters, so we can move straight
			 * to the waitq set recursion
			 */
			goto handle_waitq_set;
		}

		/*
		 * *nthreads == 0 means no thread has been selected yet in this
		 * (possibly recursive) selection, so interrupts have not yet
		 * been disabled and recorded in args->spl.
		 */
		if (*nthreads == 0) {
			spl = splsched();
		}
		waitq_lock(safeq);
	} else {
		eventmask = _CAST_TO_EVENT_MASK(args->event);
		safeq = waitq;
	}

	/*
	 * If the safeq doesn't have an eventmask (not global) or the event
	 * we're looking for IS set in its eventmask, then scan the threads
	 * in that queue for ones that match the original <waitq,event> pair.
	 */
	if (!waitq_is_global(safeq) ||
	    (safeq->waitq_eventmask & eventmask) == eventmask) {
		if (waitq_is_turnstile_queue(safeq)) {
			first_thread = waitq_prioq_iterate_locked(safeq, waitq,
			    spl, args,
			    &remaining_eventmask);
		} else {
			first_thread = waitq_queue_iterate_locked(safeq, waitq,
			    spl, args,
			    &remaining_eventmask);
		}

		/*
		 * Update the eventmask of global queues we just scanned:
		 * - If we selected all the threads in the queue, we can clear its
		 *   eventmask.
		 *
		 * - If we didn't find enough threads to fill our needs, then we can
		 *   assume we looked at every thread in the queue and the mask we
		 *   computed is complete - so reset it.
		 */
		if (waitq_is_global(safeq)) {
			if (waitq_empty(safeq)) {
				safeq->waitq_eventmask = 0;
			} else if (max_threads < 0 || *nthreads < max_threads) {
				safeq->waitq_eventmask = remaining_eventmask;
			}
		}
	}

	/*
	 * Grab the first thread in the queue if no other thread was selected.
	 * We can guarantee that no one has manipulated this thread because
	 * it's waiting on the given waitq, and we have that waitq locked.
	 */
	if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
		/* we know this is the first (and only) thread */
		++(*nthreads);
		*(args->spl) = (safeq != waitq) ? spl : splsched();

		thread_lock(first_thread);
		thread_clear_waitq_state(first_thread);
		waitq_thread_remove(safeq, first_thread);
		enqueue_tail(args->threadq, &(first_thread->wait_links));

		/* update the eventmask on [now] empty global queues */
		if (waitq_is_global(safeq) && waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		}
	}

	/* unlock the safe queue if we locked one above */
	if (safeq != waitq) {
		waitq_unlock(safeq);
		if (*nthreads == 0) {
			/* nothing selected: args->spl was never set, undo our splsched() */
			splx(spl);
		}
	}

	if (max_threads > 0 && *nthreads >= max_threads) {
		return;
	}

handle_waitq_set:
	/*
	 * wait queues that are not in any sets
	 * are the bottom of the recursion
	 */
	if (!waitq->waitq_set_id) {
		return;
	}

	/* check to see if the set ID for this wait queue is valid */
	struct waitq_link *link = wql_get_link(waitq->waitq_set_id);
	if (!link) {
		/* the waitq set to which this waitq belonged, has been invalidated */
		waitq->waitq_set_id = 0;
		return;
	}

	wql_put_link(link);

	/*
	 * If this waitq is a member of any wait queue sets, we need to look
	 * for waiting thread(s) in any of those sets, and prepost all sets that
	 * don't have active waiters.
	 *
	 * Note that we do a local walk of this waitq's links - we manually
	 * recurse down wait queue set's with non-zero wqset_q.waitq_set_id
	 */
	(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
	    WQL_WQS, (void *)args, waitq_select_walk_cb);
}
2514
2515/**
2516 * main entry point for thread selection from a waitq
2517 *
2518 * Conditions:
2519 * waitq is locked
2520 *
2521 * Returns:
2522 * The number of threads waiting on 'waitq' for 'event' which have
2523 * been placed onto the input 'threadq'
2524 *
2525 * Notes:
2526 * The 'select_cb' function is invoked for every thread found waiting on
2527 * 'waitq' for 'event'. The thread is _not_ locked upon callback
2528 * invocation. This parameter may be NULL.
2529 *
2530 * If one or more threads are returned in 'threadq' then the caller is
2531 * responsible to call splx() using the returned 'spl' value. Each
2532 * returned thread is locked.
2533 */
2534static __inline__ int
2535waitq_select_n_locked(struct waitq *waitq,
2536 event64_t event,
2537 waitq_select_cb select_cb,
2538 void *select_ctx,
2539 uint64_t *reserved_preposts,
2540 queue_t threadq,
2541 int max_threads, spl_t *spl,
2542 int priority)
2543{
2544 int nthreads = 0;
2545
2546 struct waitq_select_args args = {
2547 .posted_waitq = waitq,
2548 .waitq = waitq,
2549 .event = event,
2550 .select_cb = select_cb,
2551 .select_ctx = select_ctx,
2552 .priority = priority,
2553 .reserved_preposts = reserved_preposts,
2554 .threadq = threadq,
2555 .max_threads = max_threads,
2556 .nthreads = &nthreads,
2557 .spl = spl,
2558 };
2559
2560 do_waitq_select_n_locked(&args);
2561 return nthreads;
2562}
2563
2564/**
2565 * select from a waitq a single thread waiting for a given event
2566 *
2567 * Conditions:
2568 * 'waitq' is locked
2569 *
2570 * Returns:
2571 * A locked thread that's been removed from the waitq, but has not
2572 * yet been put on a run queue. Caller is responsible to call splx
2573 * with the '*spl' value.
2574 */
2575static thread_t
2576waitq_select_one_locked(struct waitq *waitq, event64_t event,
2577 uint64_t *reserved_preposts,
2578 int priority, spl_t *spl)
2579{
2580 int nthreads;
2581 queue_head_t threadq;
2582
2583 queue_init(&threadq);
2584
2585 nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
2586 reserved_preposts, &threadq, 1, spl, priority);
2587
2588 /* if we selected a thread, return it (still locked) */
2589 if (!queue_empty(&threadq)) {
2590 thread_t t;
2591 queue_entry_t qe = dequeue_head(&threadq);
2592 t = qe_element(qe, struct thread, wait_links);
2593 assert(queue_empty(&threadq)); /* there should be 1 entry */
2594 /* t has been locked and removed from all queues */
2595 return t;
2596 }
2597
2598 return THREAD_NULL;
2599}
2600
/* context handed to waitq_select_thread_cb() during a wait queue set walk */
struct select_thread_ctx {
	thread_t thread;  /* the specific thread being searched for */
	event64_t event;  /* the event that thread must be waiting on */
	spl_t *spl;       /* out: interrupt state when the thread is found (and left locked) */
};
2606
2607/**
2608 * link walk callback invoked once for each set to which a waitq belongs
2609 *
2610 * Conditions:
2611 * initial waitq is locked
2612 * ctx->thread is unlocked
2613 *
2614 * Notes:
2615 * This may disable interrupts and early-out of the full DAG link walk by
2616 * returning KERN_ALREADY_IN_SET. In this case, the returned thread has
2617 * been removed from the waitq, it's waitq state has been reset, and the
2618 * caller is responsible to call splx() with the returned interrupt state
2619 * in ctx->spl.
2620 */
static int
waitq_select_thread_cb(struct waitq *waitq, void *ctx,
    struct waitq_link *link)
{
	struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
	struct waitq_set *wqset;
	struct waitq *wqsetq;
	struct waitq *safeq;
	spl_t s;

	(void)waitq;

	thread_t thread = stctx->thread;
	event64_t event = stctx->event;

	/* only wait queue set links are of interest in this walk */
	if (wql_type(link) != WQL_WQS) {
		return WQ_ITERATE_CONTINUE;
	}

	wqset = link->wql_wqs.wql_set;
	wqsetq = &wqset->wqset_q;

	assert(!waitq_irq_safe(waitq));
	assert(!waitq_irq_safe(wqsetq));

	/* lock order: set lock, then splsched + safeq lock, then thread lock */
	waitq_set_lock(wqset);

	s = splsched();

	/* find and lock the interrupt-safe waitq the thread is thought to be on */
	safeq = waitq_get_safeq(wqsetq);
	waitq_lock(safeq);

	thread_lock(thread);

	if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
		/* found it: pull the thread off the set's queue */
		waitq_thread_remove(wqsetq, thread);
		if (waitq_empty(safeq)) {
			safeq->waitq_eventmask = 0;
		}
		thread_clear_waitq_state(thread);
		waitq_unlock(safeq);
		waitq_set_unlock(wqset);
		/*
		 * thread still locked,
		 * return non-zero to break out of WQS walk
		 */
		*(stctx->spl) = s;
		return WQ_ITERATE_FOUND;
	}

	/* not waiting here: drop everything and keep walking */
	thread_unlock(thread);
	waitq_set_unlock(wqset);
	waitq_unlock(safeq);
	splx(s);

	return WQ_ITERATE_CONTINUE;
}
2679
2680/**
2681 * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
2682 * on 'waitq' (or any set to which waitq belongs) for 'event'
2683 *
2684 * Conditions:
2685 * 'waitq' is locked
2686 * 'thread' is unlocked
2687 */
2688static kern_return_t
2689waitq_select_thread_locked(struct waitq *waitq,
2690 event64_t event,
2691 thread_t thread, spl_t *spl)
2692{
2693 struct waitq *safeq;
2694 struct waitq_link *link;
2695 struct select_thread_ctx ctx;
2696 kern_return_t kr;
2697 spl_t s;
2698
2699 /* Find and lock the interrupts disabled queue the thread is actually on */
2700 if (!waitq_irq_safe(waitq)) {
2701 safeq = waitq_get_safeq(waitq);
2702 if (safeq == NULL) {
2703 /*
2704 * in the WQT_TSPROXY case, if there's no turnstile,
2705 * there's no queue and no waiters, so we can move straight
2706 * to the waitq set recursion
2707 */
2708 goto handle_waitq_set;
2709 }
2710
2711 s = splsched();
2712 waitq_lock(safeq);
2713 } else {
2714 s = splsched();
2715 safeq = waitq;
2716 }
2717
2718 thread_lock(thread);
2719
2720 if ((thread->waitq == waitq) && (thread->wait_event == event)) {
2721 waitq_thread_remove(safeq, thread);
2722 if (waitq_empty(safeq)) {
2723 safeq->waitq_eventmask = 0;
2724 }
2725 thread_clear_waitq_state(thread);
2726 *spl = s;
2727 /* thread still locked */
2728 return KERN_SUCCESS;
2729 }
2730
2731 thread_unlock(thread);
2732
2733 if (safeq != waitq) {
2734 waitq_unlock(safeq);
2735 }
2736
2737 splx(s);
2738
2739handle_waitq_set:
2740 if (!waitq->waitq_set_id) {
2741 return KERN_NOT_WAITING;
2742 }
2743
2744 /* check to see if the set ID for this wait queue is valid */
2745 link = wql_get_link(waitq->waitq_set_id);
2746 if (!link) {
2747 /* the waitq to which this set belonged, has been invalidated */
2748 waitq->waitq_set_id = 0;
2749 return KERN_NOT_WAITING;
2750 }
2751
2752 /*
2753 * The thread may be waiting on a wait queue set to which
2754 * the input 'waitq' belongs. Go look for the thread in
2755 * all wait queue sets. If it's there, we'll remove it
2756 * because it's equivalent to waiting directly on the input waitq.
2757 */
2758 ctx.thread = thread;
2759 ctx.event = event;
2760 ctx.spl = spl;
2761 kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
2762 WQL_WQS, (void *)&ctx, waitq_select_thread_cb);
2763
2764 wql_put_link(link);
2765
2766 /* we found a thread, return success */
2767 if (kr == WQ_ITERATE_FOUND) {
2768 return KERN_SUCCESS;
2769 }
2770
2771 return KERN_NOT_WAITING;
2772}
2773
/*
 * wq_prepost_foreach_locked() callback: being invoked at all proves a
 * valid prepost object exists, so short-circuit the iteration.
 */
static int
prepost_exists_cb(struct waitq_set __unused *wqset,
    void __unused *ctx,
    struct wq_prepost __unused *wqp,
    struct waitq __unused *waitq)
{
	/* if we get here, then we know that there is a valid prepost object! */
	return WQ_ITERATE_FOUND;
}
2783
2784/**
2785 * declare a thread's intent to wait on 'waitq' for 'wait_event'
2786 *
2787 * Conditions:
2788 * 'waitq' is locked
2789 */
wait_result_t
waitq_assert_wait64_locked(struct waitq *waitq,
    event64_t wait_event,
    wait_interrupt_t interruptible,
    wait_timeout_urgency_t urgency,
    uint64_t deadline,
    uint64_t leeway,
    thread_t thread)
{
	wait_result_t wait_result;
	int realtime = 0;
	struct waitq *safeq;
	uintptr_t eventmask;
	spl_t s;


	/*
	 * Warning: Do _not_ place debugging print statements here.
	 * The waitq is locked!
	 */
	assert(!thread->started || thread == current_thread());

	if (thread->waitq != NULL) {
		panic("thread already waiting on %p", thread->waitq);
	}

	if (waitq_is_set(waitq)) {
		struct waitq_set *wqset = (struct waitq_set *)waitq;
		/*
		 * early-out if the thread is waiting on a wait queue set
		 * that has already been pre-posted.
		 */
		if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
			int ret;
			/*
			 * Run through the list of potential preposts. Because
			 * this is a hot path, we short-circuit the iteration
			 * if we find just one prepost object.
			 */
			ret = wq_prepost_foreach_locked(wqset, NULL,
			    prepost_exists_cb);
			if (ret == WQ_ITERATE_FOUND) {
				/* already preposted: don't block, report an immediate wake */
				s = splsched();
				thread_lock(thread);
				thread->wait_result = THREAD_AWAKENED;
				thread_unlock(thread);
				splx(s);
				return THREAD_AWAKENED;
			}
		}
	}

	s = splsched();

	/*
	 * If already dealing with an irq safe wait queue, we are all set.
	 * Otherwise, determine a global queue to use and lock it.
	 */
	if (!waitq_irq_safe(waitq)) {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to assert_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq);
		}
		eventmask = _CAST_TO_EVENT_MASK(waitq);
		waitq_lock(safeq);
	} else {
		safeq = waitq;
		eventmask = _CAST_TO_EVENT_MASK(wait_event);
	}

	/* lock the thread now that we have the irq-safe waitq locked */
	thread_lock(thread);

	/*
	 * Realtime threads get priority for wait queue placements.
	 * This allows wait_queue_wakeup_one to prefer a waiting
	 * realtime thread, similar in principle to performing
	 * a wait_queue_wakeup_all and allowing scheduler prioritization
	 * to run the realtime thread, but without causing the
	 * lock contention of that scenario.
	 */
	if (thread->sched_pri >= BASEPRI_REALTIME) {
		realtime = 1;
	}

	/*
	 * This is the extent to which we currently take scheduling attributes
	 * into account. If the thread is vm privileged, we stick it at
	 * the front of the queue. Later, these queues will honor the policy
	 * value set at waitq_init time.
	 */
	wait_result = thread_mark_wait_locked(thread, interruptible);
	/* thread->wait_result has been set */
	if (wait_result == THREAD_WAITING) {
		/* LIFO insert (false) for non-FIFO queues, VM-privileged, or realtime threads */
		if (!safeq->waitq_fifo
		    || (thread->options & TH_OPT_VMPRIV) || realtime) {
			waitq_thread_insert(safeq, thread, false);
		} else {
			waitq_thread_insert(safeq, thread, true);
		}

		/* mark the event and real waitq, even if enqueued on a global safeq */
		thread->wait_event = wait_event;
		thread->waitq = waitq;

		if (deadline != 0) {
			boolean_t act;

			act = timer_call_enter_with_leeway(&thread->wait_timer,
			    NULL,
			    deadline, leeway,
			    urgency, FALSE);
			if (!act) {
				thread->wait_timer_active++;
			}
			thread->wait_timer_is_set = TRUE;
		}

		if (waitq_is_global(safeq)) {
			safeq->waitq_eventmask |= eventmask;
		}

		waitq_stats_count_wait(waitq);
	}

	/* unlock the thread */
	thread_unlock(thread);

	/* update the inheritor's thread priority if the waitq is embedded in turnstile */
	if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {
		turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
		turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
	}

	/* unlock the safeq if we locked it here */
	if (safeq != waitq) {
		waitq_unlock(safeq);
	}

	splx(s);

	return wait_result;
}
2934
2935/**
2936 * remove 'thread' from its current blocking state on 'waitq'
2937 *
2938 * Conditions:
2939 * 'thread' is locked
2940 *
2941 * Notes:
2942 * This function is primarily used by clear_wait_internal in
2943 * sched_prim.c from the thread timer wakeup path
2944 * (i.e. the thread was waiting on 'waitq' with a timeout that expired)
2945 */
int
waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
{
	struct waitq *safeq;

	assert_thread_magic(thread);
	assert(thread->waitq == waitq);

	/* Find the interrupts disabled queue thread is waiting on */
	if (!waitq_irq_safe(waitq)) {
		safeq = waitq_get_safeq(waitq);
		if (__improbable(safeq == NULL)) {
			panic("Trying to clear_wait on a turnstile proxy "
			    "that hasn't been donated one (waitq: %p)", waitq);
		}
	} else {
		safeq = waitq;
	}

	/*
	 * thread is already locked so have to try for the waitq lock
	 * (inverted lock order - a failure here lets the caller retry)
	 */
	if (!waitq_lock_try(safeq)) {
		return 0;
	}

	waitq_thread_remove(safeq, thread);
	thread_clear_waitq_state(thread);
	waitq_stats_count_clear_wakeup(waitq);

	/* clear the global event mask if this was the last thread there! */
	if (waitq_is_global(safeq) && waitq_empty(safeq)) {
		safeq->waitq_eventmask = 0;
		/* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
	}

	waitq_unlock(safeq);

	return 1;
}
2984
2985
2986static __inline__
2987void
2988maybe_adjust_thread_pri(thread_t thread,
2989 int priority,
2990 __kdebug_only struct waitq *waitq)
2991{
2992 /*
2993 * If the caller is requesting the waitq subsystem to promote the
2994 * priority of the awoken thread, then boost the thread's priority to
2995 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
2996 * higher priority). This boost must be removed via a call to
2997 * waitq_clear_promotion_locked before the thread waits again.
2998 *
2999 * WAITQ_PROMOTE_PRIORITY is -2.
3000 * Anything above 0 represents a mutex promotion.
3001 * The default 'no action' value is -1.
3002 * TODO: define this in a header
3003 */
3004 if (priority == WAITQ_PROMOTE_PRIORITY) {
3005 uintptr_t trace_waitq = 0;
3006 if (__improbable(kdebug_enable)) {
3007 trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);
3008 }
3009
3010 sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
3011 }
3012}
3013
3014/*
3015 * Clear a potential thread priority promotion from a waitq wakeup
3016 * with WAITQ_PROMOTE_PRIORITY.
3017 *
3018 * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.
3019 */
void
waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
{
	spl_t s;

	assert(waitq_held(waitq));
	assert(thread != THREAD_NULL);
	assert(thread == current_thread());

	/* This flag is only cleared by the thread itself, so safe to check outside lock */
	if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
		return;
	}

	/* 's' is only initialized and restored on the !irq-safe path */
	if (!waitq_irq_safe(waitq)) {
		s = splsched();
	}
	thread_lock(thread);

	sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);

	thread_unlock(thread);
	if (!waitq_irq_safe(waitq)) {
		splx(s);
	}
}
3046
3047/**
3048 * wakeup all threads waiting on 'waitq' for 'wake_event'
3049 *
3050 * Conditions:
3051 * 'waitq' is locked
3052 *
3053 * Notes:
3054 * May temporarily disable and re-enable interrupts
3055 * and re-adjust thread priority of each awoken thread.
3056 *
3057 * If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
3058 * been unlocked before calling thread_go() on any returned threads, and
3059 * is guaranteed to be unlocked upon function return.
3060 */
kern_return_t
waitq_wakeup64_all_locked(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    uint64_t *reserved_preposts,
    int priority,
    waitq_lock_state_t lock_state)
{
	kern_return_t ret;
	thread_t thread;
	spl_t th_spl;
	int nthreads;
	queue_head_t wakeup_queue;

	assert(waitq_held(waitq));
	queue_init(&wakeup_queue);

	/* max_threads == -1 means no limit: select every matching waiter */
	nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
	    reserved_preposts,
	    &wakeup_queue, -1, &th_spl, priority);

	/* set each thread running */
	ret = KERN_NOT_WAITING;

#if CONFIG_WAITQ_STATS
	qe_foreach_element(thread, &wakeup_queue, wait_links)
	waitq_stats_count_wakeup(waitq);
#endif
	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	/* each selected thread was returned locked */
	qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
		assert_thread_magic(thread);
		remqueue(&thread->wait_links);
		maybe_adjust_thread_pri(thread, priority, waitq);
		ret = thread_go(thread, result, WQ_OPTION_NONE);
		assert(ret == KERN_SUCCESS);
		thread_unlock(thread);
	}
	if (nthreads > 0) {
		/* restore the interrupt state captured at selection time */
		splx(th_spl);
	} else {
		waitq_stats_count_fail(waitq);
	}

	return ret;
}
3109
3110/**
3111 * wakeup one thread waiting on 'waitq' for 'wake_event'
3112 *
3113 * Conditions:
3114 * 'waitq' is locked
3115 *
3116 * Notes:
3117 * May temporarily disable and re-enable interrupts.
3118 */
kern_return_t
waitq_wakeup64_one_locked(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    uint64_t *reserved_preposts,
    int priority,
    waitq_lock_state_t lock_state,
    waitq_options_t option)
{
	thread_t thread;
	spl_t th_spl;

	assert(waitq_held(waitq));

	/* returns a locked thread (or THREAD_NULL); th_spl set iff non-NULL */
	thread = waitq_select_one_locked(waitq, wake_event,
	    reserved_preposts,
	    priority, &th_spl);

	if (thread != THREAD_NULL) {
		waitq_stats_count_wakeup(waitq);
	} else {
		waitq_stats_count_fail(waitq);
	}

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	if (thread != THREAD_NULL) {
		maybe_adjust_thread_pri(thread, priority, waitq);
		kern_return_t ret = thread_go(thread, result, option);
		assert(ret == KERN_SUCCESS);
		thread_unlock(thread);
		splx(th_spl);
		return ret;
	}

	return KERN_NOT_WAITING;
}
3158
3159/**
3160 * wakeup one thread waiting on 'waitq' for 'wake_event'
3161 *
3162 * Conditions:
3163 * 'waitq' is locked
3164 *
3165 * Returns:
3166 * A locked, runnable thread.
3167 * If return value is non-NULL, interrupts have also
3168 * been disabled, and the caller is responsible to call
3169 * splx() with the returned '*spl' value.
3170 */
thread_t
waitq_wakeup64_identify_locked(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    spl_t *spl,
    uint64_t *reserved_preposts,
    int priority,
    waitq_lock_state_t lock_state)
{
	thread_t thread;

	assert(waitq_held(waitq));

	/* returns a locked thread (or THREAD_NULL); *spl set iff non-NULL */
	thread = waitq_select_one_locked(waitq, wake_event,
	    reserved_preposts,
	    priority, spl);

	if (thread != THREAD_NULL) {
		waitq_stats_count_wakeup(waitq);
	} else {
		waitq_stats_count_fail(waitq);
	}

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	if (thread != THREAD_NULL) {
		kern_return_t __assert_only ret;
		ret = thread_go(thread, result, WQ_OPTION_NONE);
		assert(ret == KERN_SUCCESS);
	}

	return thread; /* locked if not NULL (caller responsible for spl) */
}
3206
3207/**
3208 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
3209 *
3210 * Conditions:
3211 * 'waitq' is locked
3212 * 'thread' is unlocked
3213 *
3214 * Notes:
3215 * May temporarily disable and re-enable interrupts
3216 *
3217 * If the input lock_state == WAITQ_UNLOCK then the waitq will have been
3218 * unlocked before calling thread_go() if 'thread' is to be awoken, and
3219 * is guaranteed to be unlocked upon function return.
3220 */
kern_return_t
waitq_wakeup64_thread_locked(struct waitq *waitq,
    event64_t wake_event,
    thread_t thread,
    wait_result_t result,
    waitq_lock_state_t lock_state)
{
	kern_return_t ret;
	spl_t th_spl;

	assert(waitq_held(waitq));
	assert_thread_magic(thread);

	/*
	 * See if the thread was still waiting there. If so, it got
	 * dequeued and returned locked.
	 */
	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);

	if (ret == KERN_SUCCESS) {
		waitq_stats_count_wakeup(waitq);
	} else {
		waitq_stats_count_fail(waitq);
	}

	if (lock_state == WAITQ_UNLOCK) {
		waitq_unlock(waitq);
	}

	if (ret != KERN_SUCCESS) {
		return KERN_NOT_WAITING;
	}

	ret = thread_go(thread, result, WQ_OPTION_NONE);
	assert(ret == KERN_SUCCESS);
	thread_unlock(thread);
	splx(th_spl);

	return ret;
}
3261
3262
3263
3264/* ----------------------------------------------------------------------
3265 *
3266 * In-Kernel API
3267 *
3268 * ---------------------------------------------------------------------- */
3269
3270/**
3271 * initialize a waitq object
3272 */
3273kern_return_t
3274waitq_init(struct waitq *waitq, int policy)
3275{
3276 assert(waitq != NULL);
3277
3278 /* only FIFO and LIFO for now */
3279 if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0) {
3280 return KERN_INVALID_ARGUMENT;
3281 }
3282
3283 waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
3284 waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
3285 waitq->waitq_prepost = 0;
3286 if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3287 waitq->waitq_type = WQT_TSPROXY;
3288 } else {
3289 waitq->waitq_type = WQT_QUEUE;
3290 }
3291 waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);
3292 waitq->waitq_eventmask = 0;
3293
3294 waitq->waitq_set_id = 0;
3295 waitq->waitq_prepost_id = 0;
3296
3297 waitq_lock_init(waitq);
3298 if (waitq_is_turnstile_queue(waitq)) {
3299 /* For turnstile, initialize it as a priority queue */
3300 priority_queue_init(&waitq->waitq_prio_queue);
3301 assert(waitq->waitq_fifo == 0);
3302 } else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3303 waitq->waitq_ts = TURNSTILE_NULL;
3304 waitq->waitq_tspriv = NULL;
3305 } else {
3306 queue_init(&waitq->waitq_queue);
3307 }
3308
3309 waitq->waitq_isvalid = 1;
3310 return KERN_SUCCESS;
3311}
3312
/* context passed through prepost iteration while unlinking a waitq */
struct wq_unlink_ctx {
	struct waitq *unlink_wq;        /* the waitq whose preposts are being removed */
	struct waitq_set *unlink_wqset; /* the (locked) set to remove them from */
};

static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
    struct wq_prepost *wqp, struct waitq *waitq);
3320
3321/**
3322 * walk_waitq_links callback to invalidate 'link' parameter
3323 *
3324 * Conditions:
3325 * Called from walk_waitq_links.
3326 * Note that unlink other callbacks, this one make no assumptions about
3327 * the 'waitq' parameter, specifically it does not have to be locked or
3328 * even valid.
3329 */
3330static int
3331waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
3332 struct waitq_link *link)
3333{
3334 (void)waitq;
3335 (void)ctx;
3336 if (wql_type(link) == WQL_LINK && wql_is_valid(link)) {
3337 wql_invalidate(link);
3338 }
3339
3340 if (wql_type(link) == WQL_WQS) {
3341 struct waitq_set *wqset;
3342 struct wq_unlink_ctx ulctx;
3343
3344 /*
3345 * When destroying the waitq, take the time to clear out any
3346 * preposts it may have made. This could potentially save time
3347 * on the IPC send path which would otherwise have to iterate
3348 * over lots of dead port preposts.
3349 */
3350 if (waitq->waitq_prepost_id == 0) {
3351 goto out;
3352 }
3353
3354 wqset = link->wql_wqs.wql_set;
3355 assert(wqset != NULL);
3356 assert(!waitq_irq_safe(&wqset->wqset_q));
3357
3358 waitq_set_lock(wqset);
3359
3360 if (!waitq_set_is_valid(wqset)) {
3361 /* someone raced us to teardown */
3362 goto out_unlock;
3363 }
3364 if (!waitq_set_maybe_preposted(wqset)) {
3365 goto out_unlock;
3366 }
3367
3368 ulctx.unlink_wq = waitq;
3369 ulctx.unlink_wqset = wqset;
3370 (void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3371 waitq_unlink_prepost_cb);
3372out_unlock:
3373 waitq_set_unlock(wqset);
3374 }
3375
3376out:
3377 return WQ_ITERATE_CONTINUE;
3378}
3379
3380
3381/**
3382 * cleanup any link/prepost table resources associated with a waitq
3383 */
3384void
3385waitq_deinit(struct waitq *waitq)
3386{
3387 spl_t s;
3388
3389 assert(waitq);
3390 if (!waitq_is_valid(waitq)) {
3391 return;
3392 }
3393
3394 if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {
3395 return;
3396 }
3397
3398 if (waitq_irq_safe(waitq)) {
3399 s = splsched();
3400 }
3401 waitq_lock(waitq);
3402
3403 if (waitq_valid(waitq)) {
3404 waitq->waitq_isvalid = 0;
3405 if (!waitq_irq_safe(waitq)) {
3406 waitq_unlink_all_unlock(waitq);
3407 /* waitq unlocked and set links deallocated */
3408 goto out;
3409 }
3410 }
3411
3412 waitq_unlock(waitq);
3413 if (waitq_irq_safe(waitq)) {
3414 splx(s);
3415 }
3416
3417out:
3418#if MACH_ASSERT
3419 if (waitq_is_turnstile_queue(waitq)) {
3420 assert(priority_queue_empty(&waitq->waitq_prio_queue));
3421 } else if (waitq_is_turnstile_proxy(waitq)) {
3422 assert(waitq->waitq_ts == TURNSTILE_NULL);
3423 } else {
3424 assert(queue_empty(&waitq->waitq_queue));
3425 }
3426#else
3427 (void)0;
3428#endif // MACH_ASSERT
3429}
3430
/*
 * Mark a held waitq invalid so no new waiters or links can be added.
 * Caller must hold the waitq lock; the waitq must currently be valid.
 */
void
waitq_invalidate_locked(struct waitq *waitq)
{
	assert(waitq_held(waitq));
	assert(waitq_is_valid(waitq));
	waitq->waitq_isvalid = 0;
}
3438
3439/**
3440 * invalidate the given wq_prepost object
3441 *
3442 * Conditions:
3443 * Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3444 */
3445static int
3446wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3447 void __unused *ctx,
3448 struct wq_prepost *wqp,
3449 struct waitq __unused *waitq)
3450{
3451 if (wqp_type(wqp) == WQP_POST) {
3452 wq_prepost_invalidate(wqp);
3453 }
3454 return WQ_ITERATE_CONTINUE;
3455}
3456
3457
3458/**
3459 * allocate and initialize a waitq set object
3460 *
3461 * Conditions:
3462 * may block
3463 *
3464 * Returns:
3465 * allocated / initialized waitq_set object.
3466 * the waits_set object returned does not have
3467 * a waitq_link associated.
3468 *
3469 * NULL on failure
3470 */
3471struct waitq_set *
3472waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
3473{
3474 struct waitq_set *wqset;
3475
3476 wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3477 if (!wqset) {
3478 panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3479 }
3480
3481 kern_return_t ret;
3482 ret = waitq_set_init(wqset, policy, NULL, prepost_hook);
3483 if (ret != KERN_SUCCESS) {
3484 zfree(waitq_set_zone, wqset);
3485 wqset = NULL;
3486 }
3487
3488 return wqset;
3489}
3490
3491/**
3492 * initialize a waitq set object
3493 *
3494 * if no 'reserved_link' object is passed
3495 * the waitq_link will be lazily allocated
3496 * on demand through waitq_set_lazy_init_link.
3497 */
3498kern_return_t
3499waitq_set_init(struct waitq_set *wqset,
3500 int policy, uint64_t *reserved_link,
3501 waitq_set_prepost_hook_t *prepost_hook)
3502{
3503 struct waitq_link *link;
3504 kern_return_t ret;
3505
3506 memset(wqset, 0, sizeof(*wqset));
3507
3508 ret = waitq_init(&wqset->wqset_q, policy);
3509 if (ret != KERN_SUCCESS) {
3510 return ret;
3511 }
3512
3513 wqset->wqset_q.waitq_type = WQT_SET;
3514 if (policy & SYNC_POLICY_PREPOST) {
3515 wqset->wqset_q.waitq_prepost = 1;
3516 wqset->wqset_prepost_id = 0;
3517 assert(prepost_hook == NULL);
3518 } else {
3519 wqset->wqset_q.waitq_prepost = 0;
3520 wqset->wqset_prepost_hook = prepost_hook;
3521 }
3522
3523 if (reserved_link && *reserved_link != 0) {
3524 link = wql_get_reserved(*reserved_link, WQL_WQS);
3525
3526 if (!link) {
3527 panic("Can't allocate link object for waitq set: %p", wqset);
3528 }
3529
3530 /* always consume the caller's reference */
3531 *reserved_link = 0;
3532
3533 link->wql_wqs.wql_set = wqset;
3534 wql_mkvalid(link);
3535
3536 wqset->wqset_id = link->wql_setid.id;
3537 wql_put_link(link);
3538 } else {
3539 /*
3540 * Lazy allocate the link only when an actual id is needed.
3541 */
3542 wqset->wqset_id = WQSET_NOT_LINKED;
3543 }
3544
3545 return KERN_SUCCESS;
3546}
3547
#if DEVELOPMENT || DEBUG

/* sysctl helper: number of elements currently in the global link table */
int
sysctl_helper_waitq_set_nelem(void)
{
	return ltable_nelem(&g_wqlinktable);
}

#endif
3557
3558/**
3559 * initialize a waitq set link.
3560 *
3561 * Conditions:
3562 * may block
3563 * locks and unlocks the waiq set lock
3564 *
3565 */
3566void
3567waitq_set_lazy_init_link(struct waitq_set *wqset)
3568{
3569 struct waitq_link *link;
3570
3571 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3572
3573 waitq_set_lock(wqset);
3574 if (!waitq_set_should_lazy_init_link(wqset)) {
3575 waitq_set_unlock(wqset);
3576 return;
3577 }
3578
3579 assert(wqset->wqset_id == WQSET_NOT_LINKED);
3580 waitq_set_unlock(wqset);
3581
3582 link = wql_alloc_link(WQL_WQS);
3583 if (!link) {
3584 panic("Can't allocate link object for waitq set: %p", wqset);
3585 }
3586
3587 link->wql_wqs.wql_set = wqset;
3588
3589 waitq_set_lock(wqset);
3590 if (waitq_set_should_lazy_init_link(wqset)) {
3591 wql_mkvalid(link);
3592 wqset->wqset_id = link->wql_setid.id;
3593 }
3594
3595 assert(wqset->wqset_id != 0);
3596 assert(wqset->wqset_id != WQSET_NOT_LINKED);
3597
3598 waitq_set_unlock(wqset);
3599
3600 wql_put_link(link);
3601
3602 return;
3603}
3604
3605/**
3606 * checks if a waitq set needs to be linked.
3607 *
3608 */
3609boolean_t
3610waitq_set_should_lazy_init_link(struct waitq_set *wqset)
3611{
3612 if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {
3613 return FALSE;
3614 }
3615 return TRUE;
3616}
3617
3618/**
3619 * clear out / release any resources associated with a waitq set
3620 *
3621 * Conditions:
3622 * may block
3623 * Note:
3624 * This will render the waitq set invalid, and it must
3625 * be re-initialized with waitq_set_init before it can be used again
3626 */
3627void
3628waitq_set_deinit(struct waitq_set *wqset)
3629{
3630 struct waitq_link *link = NULL;
3631 uint64_t set_id, prepost_id;
3632
3633 if (!waitqs_is_set(wqset)) {
3634 panic("trying to de-initialize an invalid wqset @%p", wqset);
3635 }
3636
3637 assert(!waitq_irq_safe(&wqset->wqset_q));
3638
3639 waitq_set_lock(wqset);
3640
3641 if (waitq_set_has_prepost_hook(wqset)) {
3642 waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
3643 /*
3644 * If the wqset_prepost_hook value is non 0,
3645 * then another core is currently posting to this waitq set
3646 * and we need for it to finish what it's doing.
3647 */
3648 while (os_atomic_load(hook, relaxed) != 0) {
3649 waitq_set_unlock(wqset);
3650 delay(1);
3651 waitq_set_lock(wqset);
3652 }
3653 }
3654
3655 set_id = wqset->wqset_id;
3656
3657 if (waitqs_is_linked(wqset) || set_id == 0) {
3658 /* grab the set's link object */
3659 link = wql_get_link(set_id);
3660 if (link) {
3661 wql_invalidate(link);
3662 }
3663 /* someone raced us to deinit */
3664 if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
3665 if (link) {
3666 wql_put_link(link);
3667 }
3668 waitq_set_unlock(wqset);
3669 return;
3670 }
3671
3672 /* the link should be a valid link object at this point */
3673 assert(link != NULL && wql_type(link) == WQL_WQS);
3674
3675 wqset->wqset_id = 0;
3676 }
3677
3678 /*
3679 * This set may have a lot of preposts, or may have been a member of
3680 * many other sets. To minimize spinlock hold times, we clear out the
3681 * waitq set data structure under the lock-hold, but don't clear any
3682 * table objects. We keep handles to the prepost and set linkage
3683 * objects and free those outside the critical section.
3684 */
3685 prepost_id = 0;
3686 if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
3687 assert(link != NULL);
3688 prepost_id = wqset->wqset_prepost_id;
3689 }
3690 /* else { TODO: notify kqueue subsystem? } */
3691 wqset->wqset_prepost_id = 0;
3692
3693 wqset->wqset_q.waitq_fifo = 0;
3694 wqset->wqset_q.waitq_prepost = 0;
3695 wqset->wqset_q.waitq_isvalid = 0;
3696
3697 /* don't clear the 'waitq_irq' bit: it's used in locking! */
3698 wqset->wqset_q.waitq_eventmask = 0;
3699
3700 waitq_unlink_all_unlock(&wqset->wqset_q);
3701 /* wqset->wqset_q unlocked and set links deallocated */
3702
3703
3704 if (link) {
3705 /*
3706 * walk_waitq_links may race with us for access to the waitq set.
3707 * If walk_waitq_links has a reference to the set, then we should wait
3708 * until the link's refcount goes to 1 (our reference) before we exit
3709 * this function. That way we ensure that the waitq set memory will
3710 * remain valid even though it's been cleared out.
3711 */
3712 while (wql_refcnt(link) > 1) {
3713 delay(1);
3714 }
3715 wql_put_link(link);
3716 }
3717
3718 /* drop / unlink all the prepost table objects */
3719 /* JMM - can this happen before the delay? */
3720 if (prepost_id) {
3721 (void)wq_prepost_iterate(prepost_id, NULL,
3722 wqset_clear_prepost_chain_cb);
3723 }
3724}
3725
3726/**
3727 * de-initialize and free an allocated waitq set object
3728 *
3729 * Conditions:
3730 * may block
3731 */
3732kern_return_t
3733waitq_set_free(struct waitq_set *wqset)
3734{
3735 waitq_set_deinit(wqset);
3736
3737 memset(wqset, 0, sizeof(*wqset));
3738 zfree(waitq_set_zone, wqset);
3739
3740 return KERN_SUCCESS;
3741}
3742
3743#if DEVELOPMENT || DEBUG
3744#if CONFIG_WAITQ_DEBUG
3745/**
3746 * return the set ID of 'wqset'
3747 */
3748uint64_t
3749wqset_id(struct waitq_set *wqset)
3750{
3751 if (!wqset) {
3752 return 0;
3753 }
3754
3755 assert(waitqs_is_set(wqset));
3756
3757 if (!waitqs_is_linked(wqset)) {
3758 waitq_set_lazy_init_link(wqset);
3759 }
3760
3761 return wqset->wqset_id;
3762}
3763
3764/**
3765 * returns a pointer to the waitq object embedded in 'wqset'
3766 */
3767struct waitq *
3768wqset_waitq(struct waitq_set *wqset)
3769{
3770 if (!wqset) {
3771 return NULL;
3772 }
3773
3774 assert(waitqs_is_set(wqset));
3775
3776 return &wqset->wqset_q;
3777}
3778#endif /* CONFIG_WAITQ_DEBUG */
3779#endif /* DEVELOPMENT || DEBUG */
3780
3781
3782/**
3783 * clear all preposts originating from 'waitq'
3784 *
3785 * Conditions:
3786 * 'waitq' locked
3787 * may (rarely) spin waiting for another on-core thread to
3788 * release the last reference to the waitq's prepost link object
3789 *
3790 * NOTE:
3791 * If this function needs to spin, it will drop the waitq lock!
3792 * The return value of the function indicates whether or not this
3793 * happened: 1 == lock was dropped, 0 == lock held
3794 */
3795int
3796waitq_clear_prepost_locked(struct waitq *waitq)
3797{
3798 struct wq_prepost *wqp;
3799 int dropped_lock = 0;
3800
3801 assert(!waitq_irq_safe(waitq));
3802
3803 if (waitq->waitq_prepost_id == 0) {
3804 return 0;
3805 }
3806
3807 wqp = wq_prepost_get(waitq->waitq_prepost_id);
3808 waitq->waitq_prepost_id = 0;
3809 if (wqp) {
3810 uint64_t wqp_id = wqp->wqp_prepostid.id;
3811 wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
3812 wqp->wqp_prepostid.id, wqp_refcnt(wqp));
3813 wq_prepost_invalidate(wqp);
3814 while (wqp_refcnt(wqp) > 1) {
3815 /*
3816 * Some other thread must have raced us to grab a link
3817 * object reference before we invalidated it. This
3818 * means that they are probably trying to access the
3819 * waitq to which the prepost object points. We need
3820 * to wait here until the other thread drops their
3821 * reference. We know that no one else can get a
3822 * reference (the object has been invalidated), and
3823 * that prepost references are short-lived (dropped on
3824 * a call to wq_prepost_put). We also know that no one
3825 * blocks while holding a reference therefore the
3826 * other reference holder must be on-core. We'll just
3827 * sit and wait for the other reference to be dropped.
3828 */
3829 disable_preemption();
3830
3831 waitq_unlock(waitq);
3832 dropped_lock = 1;
3833 /*
3834 * don't yield here, just spin and assume the other
3835 * consumer is already on core...
3836 */
3837 delay(1);
3838
3839 waitq_lock(waitq);
3840
3841 enable_preemption();
3842 }
3843 if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id) {
3844 wq_prepost_put(wqp);
3845 }
3846 }
3847
3848 return dropped_lock;
3849}
3850
3851/**
3852 * clear all preposts originating from 'waitq'
3853 *
3854 * Conditions:
3855 * 'waitq' is not locked
3856 * may disable and re-enable interrupts
3857 */
3858void
3859waitq_clear_prepost(struct waitq *waitq)
3860{
3861 assert(waitq_valid(waitq));
3862 assert(!waitq_irq_safe(waitq));
3863
3864 waitq_lock(waitq);
3865 /* it doesn't matter to us if the lock is dropped here */
3866 (void)waitq_clear_prepost_locked(waitq);
3867 waitq_unlock(waitq);
3868}
3869
3870/**
3871 * return a the waitq's prepost object ID (allocate if necessary)
3872 *
3873 * Conditions:
3874 * 'waitq' is unlocked
3875 */
3876uint64_t
3877waitq_get_prepost_id(struct waitq *waitq)
3878{
3879 struct wq_prepost *wqp;
3880 uint64_t wqp_id = 0;
3881
3882 if (!waitq_valid(waitq)) {
3883 return 0;
3884 }
3885
3886 assert(!waitq_irq_safe(waitq));
3887
3888 waitq_lock(waitq);
3889
3890 if (!waitq_valid(waitq)) {
3891 goto out_unlock;
3892 }
3893
3894 if (waitq->waitq_prepost_id) {
3895 wqp_id = waitq->waitq_prepost_id;
3896 goto out_unlock;
3897 }
3898
3899 /* don't hold a spinlock while allocating a prepost object */
3900 waitq_unlock(waitq);
3901
3902 wqp = wq_prepost_alloc(WQP_WQ, 1);
3903 if (!wqp) {
3904 return 0;
3905 }
3906
3907 /* re-acquire the waitq lock */
3908 waitq_lock(waitq);
3909
3910 if (!waitq_valid(waitq)) {
3911 wq_prepost_put(wqp);
3912 wqp_id = 0;
3913 goto out_unlock;
3914 }
3915
3916 if (waitq->waitq_prepost_id) {
3917 /* we were beat by someone else */
3918 wq_prepost_put(wqp);
3919 wqp_id = waitq->waitq_prepost_id;
3920 goto out_unlock;
3921 }
3922
3923 wqp->wqp_wq.wqp_wq_ptr = waitq;
3924
3925 wqp_set_valid(wqp);
3926 wqp_id = wqp->wqp_prepostid.id;
3927 waitq->waitq_prepost_id = wqp_id;
3928
3929 wq_prepost_put(wqp);
3930
3931out_unlock:
3932 waitq_unlock(waitq);
3933
3934 return wqp_id;
3935}
3936
3937
3938static int
3939waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
3940{
3941 uint64_t setid = *(uint64_t *)ctx;
3942 int wqltype = wql_type(link);
3943 (void)waitq;
3944 if (wqltype == WQL_WQS && link->wql_setid.id == setid) {
3945 wqdbg_v(" waitq already in set 0x%llx", setid);
3946 return WQ_ITERATE_FOUND;
3947 } else if (wqltype == WQL_LINK) {
3948 /*
3949 * break out early if we see a link that points to the setid
3950 * in question. This saves us a step in the
3951 * iteration/recursion
3952 */
3953 wqdbg_v(" waitq already in set 0x%llx (WQL_LINK)", setid);
3954 if (link->wql_link.left_setid == setid ||
3955 link->wql_link.right_setid == setid) {
3956 return WQ_ITERATE_FOUND;
3957 }
3958 }
3959
3960 return WQ_ITERATE_CONTINUE;
3961}
3962
3963/**
3964 * determine if 'waitq' is a member of 'wqset'
3965 *
3966 * Conditions:
3967 * neither 'waitq' nor 'wqset' is not locked
3968 * may disable and re-enable interrupts while locking 'waitq'
3969 */
3970boolean_t
3971waitq_member(struct waitq *waitq, struct waitq_set *wqset)
3972{
3973 kern_return_t kr = WQ_ITERATE_SUCCESS;
3974 uint64_t setid;
3975
3976 if (!waitq_valid(waitq)) {
3977 panic("Invalid waitq: %p", waitq);
3978 }
3979 assert(!waitq_irq_safe(waitq));
3980
3981 if (!waitqs_is_set(wqset)) {
3982 return FALSE;
3983 }
3984
3985 waitq_lock(waitq);
3986
3987 if (!waitqs_is_linked(wqset)) {
3988 goto out_unlock;
3989 }
3990
3991 setid = wqset->wqset_id;
3992
3993 /* fast path: most waitqs are members of only 1 set */
3994 if (waitq->waitq_set_id == setid) {
3995 waitq_unlock(waitq);
3996 return TRUE;
3997 }
3998
3999 /* walk the link table and look for the Set ID of wqset */
4000 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4001 WQL_ALL, (void *)&setid, waitq_inset_cb);
4002
4003out_unlock:
4004 waitq_unlock(waitq);
4005 return kr == WQ_ITERATE_FOUND;
4006}
4007
4008/**
4009 * Returns true is the given waitq is a member of at least 1 set
4010 */
4011boolean_t
4012waitq_in_set(struct waitq *waitq)
4013{
4014 struct waitq_link *link;
4015 boolean_t inset = FALSE;
4016
4017 if (waitq_irq_safe(waitq)) {
4018 return FALSE;
4019 }
4020
4021 waitq_lock(waitq);
4022
4023 if (!waitq->waitq_set_id) {
4024 goto out_unlock;
4025 }
4026
4027 link = wql_get_link(waitq->waitq_set_id);
4028 if (link) {
4029 /* if we get here, the waitq is in _at_least_one_ set */
4030 inset = TRUE;
4031 wql_put_link(link);
4032 } else {
4033 /* we can just optimize this for next time */
4034 waitq->waitq_set_id = 0;
4035 }
4036
4037out_unlock:
4038 waitq_unlock(waitq);
4039 return inset;
4040}
4041
4042
4043/**
4044 * pre-allocate a waitq link structure from the link table
4045 *
4046 * Conditions:
4047 * 'waitq' is not locked
4048 * may (rarely) block if link table needs to grow
4049 */
4050uint64_t
4051waitq_link_reserve(struct waitq *waitq)
4052{
4053 struct waitq_link *link;
4054 uint64_t reserved_id = 0;
4055
4056 assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
4057
4058 /*
4059 * We've asserted that the caller can block, so we enforce a
4060 * minimum-free table element policy here.
4061 */
4062 wql_ensure_free_space();
4063
4064 (void)waitq;
4065 link = wql_alloc_link(LT_RESERVED);
4066 if (!link) {
4067 return 0;
4068 }
4069
4070 reserved_id = link->wql_setid.id;
4071
4072 return reserved_id;
4073}
4074
4075/**
4076 * release a pre-allocated waitq link structure
4077 */
4078void
4079waitq_link_release(uint64_t id)
4080{
4081 struct waitq_link *link;
4082
4083 if (id == 0) {
4084 return;
4085 }
4086
4087 link = wql_get_reserved(id, WQL_LINK);
4088 if (!link) {
4089 return;
4090 }
4091
4092 /*
4093 * if we successfully got a link object, then we know
4094 * it's not been marked valid, and can be released with
4095 * a standard wql_put_link() which should free the element.
4096 */
4097 wql_put_link(link);
4098#if CONFIG_LTABLE_STATS
4099 g_wqlinktable.nreserved_releases += 1;
4100#endif
4101}
4102
4103/**
4104 * link 'waitq' to the set identified by 'setid' using the 'link' structure
4105 *
4106 * Conditions:
4107 * 'waitq' is locked
4108 * caller should have a reference to the 'link' object
4109 */
4110static kern_return_t
4111waitq_link_internal(struct waitq *waitq,
4112 uint64_t setid, struct waitq_link *link)
4113{
4114 struct waitq_link *qlink;
4115 kern_return_t kr;
4116
4117 assert(waitq_held(waitq));
4118 assert(setid != 0);
4119 assert(setid != WQSET_NOT_LINKED);
4120
4121 /*
4122 * If the waitq_set_id field is empty, then this waitq is not
4123 * a member of any other set. All we have to do is update the
4124 * field.
4125 */
4126 if (!waitq->waitq_set_id) {
4127 waitq->waitq_set_id = setid;
4128 return KERN_SUCCESS;
4129 }
4130
4131 qlink = wql_get_link(waitq->waitq_set_id);
4132 if (!qlink) {
4133 /*
4134 * The set to which this wait queue belonged has been
4135 * destroyed / invalidated. We can re-use the waitq field.
4136 */
4137 waitq->waitq_set_id = setid;
4138 return KERN_SUCCESS;
4139 }
4140 wql_put_link(qlink);
4141
4142 /*
4143 * Check to see if it's already a member of the set.
4144 *
4145 * TODO: check for cycles!
4146 */
4147 kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4148 WQL_ALL, (void *)&setid, waitq_inset_cb);
4149 if (kr == WQ_ITERATE_FOUND) {
4150 return KERN_ALREADY_IN_SET;
4151 }
4152
4153 /*
4154 * This wait queue is a member of at least one set already,
4155 * and _not_ a member of the given set. Use our previously
4156 * allocated link object, and hook it up to the wait queue.
4157 * Note that it's possible that one or more of the wait queue sets to
4158 * which the wait queue belongs was invalidated before we allocated
4159 * this link object. That's OK because the next time we use that
4160 * object we'll just ignore it.
4161 */
4162 link->wql_link.left_setid = setid;
4163 link->wql_link.right_setid = waitq->waitq_set_id;
4164 wql_mkvalid(link);
4165
4166 waitq->waitq_set_id = link->wql_setid.id;
4167
4168 return KERN_SUCCESS;
4169}
4170
4171/**
4172 * link 'waitq' to 'wqset'
4173 *
4174 * Conditions:
4175 * if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
4176 * Otherwise, 'waitq' must be locked.
4177 *
4178 * may (rarely) block on link table allocation if the table has to grow,
4179 * and no 'reserved_link' object is passed.
4180 *
4181 * may block and acquire wqset lock if the wqset passed has no link.
4182 *
4183 * Notes:
4184 * The caller can guarantee that this function will never block by
4185 * - pre-allocating a link table object and passing its ID in 'reserved_link'
4186 * - and pre-allocating the waitq set link calling waitq_set_lazy_init_link.
4187 * It is not possible to provide a reserved_link without having also linked
4188 * the wqset.
4189 */
4190kern_return_t
4191waitq_link(struct waitq *waitq, struct waitq_set *wqset,
4192 waitq_lock_state_t lock_state, uint64_t *reserved_link)
4193{
4194 kern_return_t kr;
4195 struct waitq_link *link;
4196 int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
4197
4198 if (!waitq_valid(waitq) || waitq_irq_safe(waitq)) {
4199 panic("Invalid waitq: %p", waitq);
4200 }
4201
4202 if (!waitqs_is_set(wqset)) {
4203 return KERN_INVALID_ARGUMENT;
4204 }
4205
4206 if (!reserved_link || *reserved_link == 0) {
4207 if (!waitqs_is_linked(wqset)) {
4208 waitq_set_lazy_init_link(wqset);
4209 }
4210 }
4211
4212 wqdbg_v("Link waitq %p to wqset 0x%llx",
4213 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4214
4215 /*
4216 * We _might_ need a new link object here, so we'll grab outside
4217 * the lock because the alloc call _might_ block.
4218 *
4219 * If the caller reserved a link beforehand, then wql_get_link
4220 * is guaranteed not to block because the caller holds an extra
4221 * reference to the link which, in turn, hold a reference to the
4222 * link table.
4223 */
4224 if (reserved_link && *reserved_link != 0) {
4225 link = wql_get_reserved(*reserved_link, WQL_LINK);
4226 /* always consume the caller's reference */
4227 *reserved_link = 0;
4228 } else {
4229 link = wql_alloc_link(WQL_LINK);
4230 }
4231 if (!link) {
4232 return KERN_NO_SPACE;
4233 }
4234
4235 if (should_lock) {
4236 waitq_lock(waitq);
4237 }
4238
4239 kr = waitq_link_internal(waitq, wqset->wqset_id, link);
4240
4241 if (should_lock) {
4242 waitq_unlock(waitq);
4243 }
4244
4245 wql_put_link(link);
4246
4247 return kr;
4248}
4249
4250/**
4251 * helper: unlink 'waitq' from waitq set identified by 'setid'
4252 * this function also prunes invalid objects from the tree
4253 *
4254 * Conditions:
4255 * MUST be called from walk_waitq_links link table walk
4256 * 'waitq' is locked
4257 *
4258 * Notes:
4259 * This is a helper function which compresses the link table by culling
4260 * unused or unnecessary links. See comments below for different
4261 * scenarios.
4262 */
4263static inline int
4264waitq_maybe_remove_link(struct waitq *waitq,
4265 uint64_t setid,
4266 struct waitq_link *parent,
4267 struct waitq_link *left,
4268 struct waitq_link *right)
4269{
4270 uint64_t *wq_setid = &waitq->waitq_set_id;
4271
4272 /*
4273 * There are two scenarios:
4274 *
4275 * Scenario 1:
4276 * --------------------------------------------------------------------
4277 * waitq->waitq_set_id == parent
4278 *
4279 * parent(LINK)
4280 * / \
4281 * / \
4282 * / \
4283 * L(LINK/WQS_l) R(LINK/WQS_r)
4284 *
4285 * In this scenario, we assert that the original waitq points to the
4286 * parent link we were passed in. If WQS_l (or WQS_r) is the waitq
4287 * set we're looking for, we can set the corresponding parent
4288 * link id (left or right) to 0. To compress the tree, we can reset the
4289 * waitq_set_id of the original waitq to point to the side of the
4290 * parent that is still valid. We then discard the parent link object.
4291 */
4292 if (*wq_setid == parent->wql_setid.id) {
4293 if (!left && !right) {
4294 /* completely invalid children */
4295 wql_invalidate(parent);
4296 wqdbg_v("S1, L+R");
4297 *wq_setid = 0;
4298 return WQ_ITERATE_INVALID;
4299 } else if (!left || left->wql_setid.id == setid) {
4300 /*
4301 * left side matches we know it points either to the
4302 * WQS we're unlinking, or to an invalid object:
4303 * no need to invalidate it
4304 */
4305 *wq_setid = right ? right->wql_setid.id : 0;
4306 wql_invalidate(parent);
4307 wqdbg_v("S1, L");
4308 return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4309 } else if (!right || right->wql_setid.id == setid) {
4310 /*
4311 * if right side matches we know it points either to the
4312 * WQS we're unlinking, or to an invalid object:
4313 * no need to invalidate it
4314 */
4315 *wq_setid = left ? left->wql_setid.id : 0;
4316 wql_invalidate(parent);
4317 wqdbg_v("S1, R");
4318 return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4319 }
4320 }
4321
4322 /*
4323 * the tree walk starts at the top-of-tree and moves down,
4324 * so these are safe asserts.
4325 */
4326 assert(left || right); /* one of them has to be valid at this point */
4327
4328 /*
4329 * Scenario 2:
4330 * --------------------------------------------------------------------
4331 * waitq->waitq_set_id == ... (OR parent)
4332 *
4333 * ...
4334 * |
4335 * parent
4336 * / \
4337 * / \
4338 * L(LINK) R(LINK)
4339 * /\ /\
4340 * / \ / \
4341 * / \ Rl(*) Rr(*)
4342 * Ll(WQS) Lr(WQS)
4343 *
4344 * In this scenario, a leaf node of either the left or right side
4345 * could be the wait queue set we're looking to unlink. We also handle
4346 * the case where one of these links is invalid. If a leaf node is
4347 * invalid or it's the set we're looking for, we can safely remove the
4348 * middle link (left or right) and point the parent link directly to
4349 * the remaining leaf node.
4350 */
4351 if (left && wql_type(left) == WQL_LINK) {
4352 uint64_t Ll, Lr;
4353 struct waitq_link *linkLl, *linkLr;
4354 assert(left->wql_setid.id != setid);
4355 Ll = left->wql_link.left_setid;
4356 Lr = left->wql_link.right_setid;
4357 linkLl = wql_get_link(Ll);
4358 linkLr = wql_get_link(Lr);
4359 if (!linkLl && !linkLr) {
4360 /*
4361 * The left object points to two invalid objects!
4362 * We can invalidate the left w/o touching the parent.
4363 */
4364 wql_invalidate(left);
4365 wqdbg_v("S2, Ll+Lr");
4366 return WQ_ITERATE_INVALID;
4367 } else if (!linkLl || Ll == setid) {
4368 /* Ll is invalid and/or the wait queue set we're looking for */
4369 parent->wql_link.left_setid = Lr;
4370 wql_invalidate(left);
4371 wql_put_link(linkLl);
4372 wql_put_link(linkLr);
4373 wqdbg_v("S2, Ll");
4374 return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4375 } else if (!linkLr || Lr == setid) {
4376 /* Lr is invalid and/or the wait queue set we're looking for */
4377 parent->wql_link.left_setid = Ll;
4378 wql_invalidate(left);
4379 wql_put_link(linkLr);
4380 wql_put_link(linkLl);
4381 wqdbg_v("S2, Lr");
4382 return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4383 }
4384 wql_put_link(linkLl);
4385 wql_put_link(linkLr);
4386 }
4387
4388 if (right && wql_type(right) == WQL_LINK) {
4389 uint64_t Rl, Rr;
4390 struct waitq_link *linkRl, *linkRr;
4391 assert(right->wql_setid.id != setid);
4392 Rl = right->wql_link.left_setid;
4393 Rr = right->wql_link.right_setid;
4394 linkRl = wql_get_link(Rl);
4395 linkRr = wql_get_link(Rr);
4396 if (!linkRl && !linkRr) {
4397 /*
4398 * The right object points to two invalid objects!
4399 * We can invalidate the right w/o touching the parent.
4400 */
4401 wql_invalidate(right);
4402 wqdbg_v("S2, Rl+Rr");
4403 return WQ_ITERATE_INVALID;
4404 } else if (!linkRl || Rl == setid) {
4405 /* Rl is invalid and/or the wait queue set we're looking for */
4406 parent->wql_link.right_setid = Rr;
4407 wql_invalidate(right);
4408 wql_put_link(linkRl);
4409 wql_put_link(linkRr);
4410 wqdbg_v("S2, Rl");
4411 return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4412 } else if (!linkRr || Rr == setid) {
4413 /* Rr is invalid and/or the wait queue set we're looking for */
4414 parent->wql_link.right_setid = Rl;
4415 wql_invalidate(right);
4416 wql_put_link(linkRl);
4417 wql_put_link(linkRr);
4418 wqdbg_v("S2, Rr");
4419 return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4420 }
4421 wql_put_link(linkRl);
4422 wql_put_link(linkRr);
4423 }
4424
4425 return WQ_ITERATE_CONTINUE;
4426}
4427
4428/**
4429 * link table walk callback that unlinks 'waitq' from 'ctx->setid'
4430 *
4431 * Conditions:
4432 * called from walk_waitq_links
4433 * 'waitq' is locked
4434 *
4435 * Notes:
4436 * uses waitq_maybe_remove_link() to compress the linktable and
4437 * perform the actual unlinking
4438 */
4439static int
4440waitq_unlink_cb(struct waitq *waitq, void *ctx,
4441 struct waitq_link *link)
4442{
4443 uint64_t setid = *((uint64_t *)ctx);
4444 struct waitq_link *right, *left;
4445 int ret = 0;
4446
4447 if (wql_type(link) != WQL_LINK) {
4448 return WQ_ITERATE_CONTINUE;
4449 }
4450
4451 do {
4452 left = wql_get_link(link->wql_link.left_setid);
4453 right = wql_get_link(link->wql_link.right_setid);
4454
4455 ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
4456
4457 wql_put_link(left);
4458 wql_put_link(right);
4459
4460 if (!wql_is_valid(link)) {
4461 return WQ_ITERATE_INVALID;
4462 }
4463 /* A ret value of UNLINKED will break us out of table walk */
4464 } while (ret == WQ_ITERATE_INVALID);
4465
4466 return ret;
4467}
4468
4469
4470/**
4471 * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
4472 *
4473 * Conditions:
4474 * Called from wq_prepost_foreach_locked OR wq_prepost_iterate
4475 * 'wqset' may be NULL
4476 * (ctx)->unlink_wqset is locked
4477 */
4478static int
4479waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
4480 struct wq_prepost *wqp, struct waitq *waitq)
4481{
4482 struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
4483
4484 if (waitq != ulctx->unlink_wq) {
4485 return WQ_ITERATE_CONTINUE;
4486 }
4487
4488 if (wqp_type(wqp) == WQP_WQ &&
4489 wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
4490 /* this is the only prepost on this wait queue set */
4491 wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
4492 ulctx->unlink_wqset->wqset_prepost_id = 0;
4493 return WQ_ITERATE_BREAK;
4494 }
4495
4496 assert(wqp_type(wqp) == WQP_POST);
4497
4498 /*
4499 * The prepost object 'wqp' points to a waitq which should no longer
4500 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
4501 * object from the list and break out of the iteration. Using the
4502 * context object in this way allows this same callback function to be
4503 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
4504 */
4505 wq_prepost_remove(ulctx->unlink_wqset, wqp);
4506 return WQ_ITERATE_BREAK;
4507}
4508
4509/**
4510 * unlink 'waitq' from 'wqset'
4511 *
4512 * Conditions:
4513 * 'waitq' is locked
4514 * 'wqset' is _not_ locked
4515 * may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
4516 * (see waitq_clear_prepost_locked)
4517 */
static kern_return_t
waitq_unlink_locked(struct waitq *waitq,
    struct waitq_set *wqset)
{
	uint64_t setid;
	kern_return_t kr;

	assert(!waitq_irq_safe(waitq));

	/* not a member of any set: nothing to unlink */
	if (waitq->waitq_set_id == 0) {
		/*
		 * TODO:
		 * it doesn't belong to anyone, and it has a prepost object?
		 * This is an artifact of not cleaning up after kqueues when
		 * they prepost into select sets...
		 */
		if (waitq->waitq_prepost_id != 0) {
			(void)waitq_clear_prepost_locked(waitq);
		}
		return KERN_NOT_IN_SET;
	}

	if (!waitqs_is_linked(wqset)) {
		/*
		 * No link has been allocated for the wqset,
		 * so no waitq could have been linked to it.
		 */
		return KERN_NOT_IN_SET;
	}

	setid = wqset->wqset_id;

	/* fast path: 'wqset' is the only set to which the waitq belongs */
	if (waitq->waitq_set_id == setid) {
		waitq->waitq_set_id = 0;
		/*
		 * This was the only set to which the waitq belonged: we can
		 * safely release the waitq's prepost object. It doesn't
		 * matter if this function drops and re-acquires the lock
		 * because we're not manipulating waitq state any more.
		 */
		(void)waitq_clear_prepost_locked(waitq);
		return KERN_SUCCESS;
	}

	/*
	 * The waitq was a member of more that 1 set, so we need to
	 * handle potentially compressing the link table, and
	 * adjusting the waitq->waitq_set_id value.
	 *
	 * Note: we can't free the waitq's associated prepost object (if any)
	 * because it may be in use by the one or more _other_ sets to
	 * which this queue belongs.
	 *
	 * Note: This function only handles a single level of the queue linkage.
	 * Removing a waitq from a set to which it does not directly
	 * belong is undefined. For example, if a waitq belonged to set
	 * A, and set A belonged to set B. You can't remove the waitq
	 * from set B.
	 */
	kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
	    WQL_LINK, (void *)&setid, waitq_unlink_cb);

	if (kr == WQ_ITERATE_UNLINKED) {
		struct wq_unlink_ctx ulctx;

		kr = KERN_SUCCESS; /* found it and dis-associated it */

		/* don't look for preposts if it's not prepost-enabled */
		if (!wqset->wqset_q.waitq_prepost) {
			goto out;
		}

		assert(!waitq_irq_safe(&wqset->wqset_q));

		/* lock order: 'waitq' (held by caller) then 'wqset' */
		waitq_set_lock(wqset);
		/*
		 * clear out any prepost from waitq into wqset
		 * TODO: this could be more efficient than a linear search of
		 * the waitq set's prepost list.
		 */
		ulctx.unlink_wq = waitq;
		ulctx.unlink_wqset = wqset;
		(void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
		    waitq_unlink_prepost_cb);
		waitq_set_unlock(wqset);
	} else {
		kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
	}

out:
	return kr;
}
4610
4611/**
4612 * unlink 'waitq' from 'wqset'
4613 *
4614 * Conditions:
4615 * neither 'waitq' nor 'wqset' is locked
4616 * may disable and re-enable interrupts
4617 * may (rarely) spin in prepost clear
4618 * (see waitq_clear_prepost_locked)
4619 */
4620kern_return_t
4621waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4622{
4623 kern_return_t kr = KERN_SUCCESS;
4624
4625 assert(waitqs_is_set(wqset));
4626
4627 /*
4628 * we allow the waitq to be invalid because the caller may be trying
4629 * to clear out old/dirty state
4630 */
4631 if (!waitq_valid(waitq)) {
4632 return KERN_INVALID_ARGUMENT;
4633 }
4634
4635 wqdbg_v("unlink waitq %p from set 0x%llx",
4636 (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4637
4638 assert(!waitq_irq_safe(waitq));
4639
4640 waitq_lock(waitq);
4641
4642 kr = waitq_unlink_locked(waitq, wqset);
4643
4644 waitq_unlock(waitq);
4645 return kr;
4646}
4647
4648/**
4649 * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
4650 *
4651 * Conditions:
4652 * 'wqset' is unlocked
4653 * wqp_id may be valid or invalid
4654 */
void
waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
{
	struct wq_prepost *wqp;

	/* preemption stays disabled for the whole lookup+unlink sequence */
	disable_preemption();
	wqp = wq_prepost_get(wqp_id);
	if (wqp) {
		struct waitq *wq;

		wq = wqp->wqp_wq.wqp_wq_ptr;

		/*
		 * lock the waitq, then release our prepost ID reference, then
		 * unlink the waitq from the wqset: this ensures that we don't
		 * hold a prepost ID reference during the unlink, but we also
		 * complete the unlink operation atomically to avoid a race
		 * with waitq_unlink[_all].
		 */
		assert(!waitq_irq_safe(wq));

		waitq_lock(wq);
		wq_prepost_put(wqp);

		if (!waitq_valid(wq)) {
			/* someone already tore down this waitq! */
			waitq_unlock(wq);
			enable_preemption();
			return;
		}

		/* this _may_ drop the wq lock, but that's OK */
		waitq_unlink_locked(wq, wqset);

		waitq_unlock(wq);
	}
	enable_preemption();
	return;
}
4694
4695
4696/**
4697 * reference and lock a waitq by its prepost ID
4698 *
4699 * Conditions:
4700 * wqp_id may be valid or invalid
4701 *
4702 * Returns:
4703 * a locked waitq if wqp_id was valid
4704 * NULL on failure
4705 */
4706struct waitq *
4707waitq_lock_by_prepost_id(uint64_t wqp_id)
4708{
4709 struct waitq *wq = NULL;
4710 struct wq_prepost *wqp;
4711
4712 disable_preemption();
4713 wqp = wq_prepost_get(wqp_id);
4714 if (wqp) {
4715 wq = wqp->wqp_wq.wqp_wq_ptr;
4716
4717 assert(!waitq_irq_safe(wq));
4718
4719 waitq_lock(wq);
4720 wq_prepost_put(wqp);
4721
4722 if (!waitq_valid(wq)) {
4723 /* someone already tore down this waitq! */
4724 waitq_unlock(wq);
4725 enable_preemption();
4726 return NULL;
4727 }
4728 }
4729 enable_preemption();
4730 return wq;
4731}
4732
4733
4734/**
4735 * unlink 'waitq' from all sets to which it belongs
4736 *
4737 * Conditions:
4738 * 'waitq' is locked on entry
4739 * returns with waitq lock dropped
4740 *
4741 * Notes:
4742 * may (rarely) spin (see waitq_clear_prepost_locked)
4743 */
kern_return_t
waitq_unlink_all_unlock(struct waitq *waitq)
{
	uint64_t old_set_id = 0;
	wqdbg_v("unlink waitq %p from all sets",
	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
	assert(!waitq_irq_safe(waitq));

	/* it's not a member of any sets */
	if (waitq->waitq_set_id == 0) {
		waitq_unlock(waitq);
		return KERN_SUCCESS;
	}

	/* detach the set linkage before touching the link table */
	old_set_id = waitq->waitq_set_id;
	waitq->waitq_set_id = 0;

	/*
	 * invalidate the prepost entry for this waitq.
	 * This may drop and re-acquire the waitq lock, but that's OK because
	 * if it was added to another set and preposted to that set in the
	 * time we drop the lock, the state will remain consistent.
	 */
	(void)waitq_clear_prepost_locked(waitq);

	waitq_unlock(waitq);

	if (old_set_id) {
		/*
		 * Walk the link table and invalidate each LINK object that
		 * used to connect this waitq to one or more sets: this works
		 * because WQL_LINK objects are private to each wait queue
		 */
		(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,
		    WQL_LINK, NULL, waitq_unlink_all_cb);
	}

	return KERN_SUCCESS;
}
4783
4784/**
4785 * unlink 'waitq' from all sets to which it belongs
4786 *
4787 * Conditions:
4788 * 'waitq' is not locked
4789 * may disable and re-enable interrupts
4790 * may (rarely) spin
4791 * (see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4792 */
4793kern_return_t
4794waitq_unlink_all(struct waitq *waitq)
4795{
4796 kern_return_t kr = KERN_SUCCESS;
4797
4798 if (!waitq_valid(waitq)) {
4799 panic("Invalid waitq: %p", waitq);
4800 }
4801
4802 assert(!waitq_irq_safe(waitq));
4803 waitq_lock(waitq);
4804 if (!waitq_valid(waitq)) {
4805 waitq_unlock(waitq);
4806 return KERN_SUCCESS;
4807 }
4808
4809 kr = waitq_unlink_all_unlock(waitq);
4810 /* waitq unlocked and set links deallocated */
4811
4812 return kr;
4813}
4814
4815
4816/**
4817 * unlink all waitqs from 'wqset'
4818 *
4819 * Conditions:
4820 * 'wqset' is locked on entry
4821 * 'wqset' is unlocked on exit and spl is restored
4822 *
4823 * Note:
4824 * may (rarely) spin/block (see waitq_clear_prepost_locked)
4825 */
kern_return_t
waitq_set_unlink_all_unlock(struct waitq_set *wqset)
{
	struct waitq_link *link;
	uint64_t prepost_id;

	wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);

	/*
	 * This operation does not require interaction with any of the set's
	 * constituent wait queues. All we have to do is invalidate the SetID
	 */

	if (waitqs_is_linked(wqset)) {
		/* invalidate and re-alloc the link object first */
		link = wql_get_link(wqset->wqset_id);

		/* we may have raced with a waitq_set_deinit: handle this */
		if (!link) {
			waitq_set_unlock(wqset);
			return KERN_SUCCESS;
		}

		wql_invalidate(link);

		/* re-alloc the object to get a new generation ID */
		wql_realloc_link(link, WQL_WQS);
		link->wql_wqs.wql_set = wqset;

		/* the set gets a fresh ID; stale member links no longer match */
		wqset->wqset_id = link->wql_setid.id;
		wql_mkvalid(link);
		wql_put_link(link);
	}

	/* clear any preposts attached to this set */
	prepost_id = 0;
	if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
		prepost_id = wqset->wqset_prepost_id;
	}
	/* else { TODO: notify kqueue subsystem? } */
	wqset->wqset_prepost_id = 0;

	/*
	 * clear set linkage and prepost object associated with this set:
	 * waitq sets may prepost to other sets if, for example, they are
	 * associated with a kqueue which is in a select set.
	 *
	 * This releases all the set link objects
	 * (links to other sets to which this set was previously added)
	 */
	waitq_unlink_all_unlock(&wqset->wqset_q);
	/* wqset->wqset_q unlocked */

	/* drop / unlink all the prepost table objects */
	if (prepost_id) {
		(void)wq_prepost_iterate(prepost_id, NULL,
		    wqset_clear_prepost_chain_cb);
	}

	return KERN_SUCCESS;
}
4887
4888/**
4889 * unlink all waitqs from 'wqset'
4890 *
4891 * Conditions:
4892 * 'wqset' is not locked
4893 * may (rarely) spin/block (see waitq_clear_prepost_locked)
4894 */
4895kern_return_t
4896waitq_set_unlink_all(struct waitq_set *wqset)
4897{
4898 assert(waitqs_is_set(wqset));
4899 assert(!waitq_irq_safe(&wqset->wqset_q));
4900
4901 waitq_set_lock(wqset);
4902 return waitq_set_unlink_all_unlock(wqset);
4903 /* wqset unlocked and set links and preposts deallocated */
4904}
4905
4906static int
4907waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
4908 struct waitq_link *link)
4909{
4910 uint32_t *num = (uint32_t *)ctx;
4911 (void)waitq;
4912
4913 /*
4914 * In the worst case, we'll have to allocate 2 prepost objects
4915 * per waitq set (if the set was already preposted by another
4916 * waitq).
4917 */
4918 if (wql_type(link) == WQL_WQS) {
4919 /*
4920 * check to see if the associated waitq actually supports
4921 * preposting
4922 */
4923 if (waitq_set_can_prepost(link->wql_wqs.wql_set)) {
4924 *num += 2;
4925 }
4926 }
4927 return WQ_ITERATE_CONTINUE;
4928}
4929
static int
waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
    int *did_unlock, struct wq_prepost **wqp)
{
	struct wq_prepost *tmp;
	struct wqp_cache *cache;

	*did_unlock = 0;

	/*
	 * Before we unlock the waitq, check the per-processor prepost object
	 * cache to see if there's enough there for us. If so, do the
	 * allocation, keep the lock and save an entire iteration over the set
	 * linkage!
	 */
	if (waitq) {
		disable_preemption();
		cache = PERCPU_GET(wqp_cache);
		if (nalloc <= (int)cache->avail) {
			/* NOTE: jumps to do_alloc with preemption still
			 * disabled; re-enabled at the bottom when
			 * *did_unlock == 0 */
			goto do_alloc;
		}
		enable_preemption();

		/* unlock the waitq to perform the allocation */
		*did_unlock = 1;
		waitq_unlock(waitq);
	}

do_alloc:
	tmp = wq_prepost_alloc(LT_RESERVED, nalloc);
	if (!tmp) {
		panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
		    nalloc, waitq, *wqp);
	}
	if (*wqp) {
		/* link the two lists */
		int __assert_only rc;
		rc = wq_prepost_rlink(tmp, *wqp);
		assert(rc == nalloc);
	}
	*wqp = tmp;

	/*
	 * If the caller can block, then enforce a minimum-free table element
	 * policy here. This helps ensure that we will have enough prepost
	 * objects for callers such as selwakeup() that can be called with
	 * spin locks held.
	 */
	if (get_preemption_level() == 0) {
		wq_prepost_ensure_free_space();
	}

	if (waitq) {
		if (*did_unlock == 0) {
			/* decrement the preemption count if alloc from cache */
			enable_preemption();
		} else {
			/* otherwise: re-lock the waitq */
			waitq_lock(waitq);
		}
	}

	return nalloc;
}
4994
static int
waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
{
	/* worst-case number of prepost objects this waitq may need */
	int npreposts = 0;

	/*
	 * If the waitq is not currently part of a set, and we're not asked to
	 * keep the waitq locked then we'll want to have 3 in reserve
	 * just-in-case it becomes part of a set while we unlock and reserve.
	 * We may need up to 1 object for the waitq, and 2 for the set.
	 */
	if (waitq->waitq_set_id == 0) {
		npreposts = 3;
	} else {
		/* this queue has never been preposted before */
		if (waitq->waitq_prepost_id == 0) {
			npreposts = 3;
		}

		/*
		 * Walk the set of table linkages associated with this waitq
		 * and count the worst-case number of prepost objects that
		 * may be needed during a wakeup_all. We can walk this without
		 * locking each set along the way because the table-based IDs
		 * disconnect us from the set pointers themselves, and the
		 * table walking is careful to read the setid values only once.
		 * Locking each set up the chain also doesn't guarantee that
		 * their membership won't change between the time we unlock
		 * that set and when we actually go to prepost, so our
		 * situation is no worse than before and we've alleviated lock
		 * contention on any sets to which this waitq belongs.
		 */
		(void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,
		    waitq, waitq->waitq_set_id,
		    WQL_WQS, (void *)&npreposts,
		    waitq_prepost_reserve_cb);
	}

	/* caller may request additional objects on top of the estimate */
	if (extra > 0) {
		npreposts += extra;
	}

	if (npreposts == 0 && !keep_locked) {
		/*
		 * If we get here, we were asked to reserve some prepost
		 * objects for a waitq that's previously preposted, and is not
		 * currently a member of any sets. We have also been
		 * instructed to unlock the waitq when we're done. In this
		 * case, we pre-allocated enough reserved objects to handle
		 * the case where the waitq gets added to a single set when
		 * the lock is released.
		 */
		npreposts = 3;
	}

	return npreposts;
}
5052
5053
5054/**
5055 * pre-allocate prepost objects for 'waitq'
5056 *
5057 * Conditions:
5058 * 'waitq' is not locked
5059 *
5060 * Returns:
5061 * panic on error
5062 *
5063 * 0 on success, '*reserved' is set to the head of a singly-linked
5064 * list of pre-allocated prepost objects.
5065 *
5066 * Notes:
5067 * If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
5068 * atomically and returns 'waitq' locked.
5069 *
5070 * This function attempts to pre-allocate precisely enough prepost
5071 * objects based on the current set membership of 'waitq'. If the
5072 * operation is performed atomically, then the caller
5073 * is guaranteed to have enough pre-allocated prepost object to avoid
5074 * any (rare) blocking in the wakeup path.
5075 */
uint64_t
waitq_prepost_reserve(struct waitq *waitq, int extra,
    waitq_lock_state_t lock_state)
{
	uint64_t reserved = 0;
	uint64_t prev_setid = 0, prev_prepostid = 0;
	struct wq_prepost *wqp = NULL;
	int nalloc = 0, npreposts = 0;
	int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
	int unlocked = 0;

	wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);

	if (waitq == NULL && extra > 0) {
		/*
		 * Simple prepost object allocation:
		 * we'll add 2 more because the waitq might need an object,
		 * and the set itself may need a new POST object in addition
		 * to the number of preposts requested by the caller
		 */
		nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,
		    &unlocked, &wqp);
		assert(nalloc == extra + 2);
		return wqp->wqp_prepostid.id;
	}

	assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);

	assert(!waitq_irq_safe(waitq));

	waitq_lock(waitq);

	/* remember the set ID that we started with */
	prev_setid = waitq->waitq_set_id;
	prev_prepostid = waitq->waitq_prepost_id;

	/*
	 * If the waitq is not part of a set, and we're asked to
	 * keep the set locked, then we don't have to reserve
	 * anything!
	 */
	if (prev_setid == 0 && keep_locked) {
		goto out;
	}

	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

	/* nothing for us to do! */
	if (npreposts == 0) {
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

try_alloc:
	/* this _may_ unlock and relock the waitq! */
	nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,
	    &unlocked, &wqp);

	if (!unlocked) {
		/* allocation held the waitq lock: we'd done! */
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

	/*
	 * Before we return, if the allocation had to unlock the waitq, we
	 * must check one more time to see if we have enough. If not, we'll
	 * try to allocate the difference. If the caller requests it, we'll
	 * also leave the waitq locked so that the use of the pre-allocated
	 * prepost objects can be guaranteed to be enough if a wakeup_all is
	 * performed before unlocking the waitq.
	 */

	/*
	 * If the waitq is no longer associated with a set, or if the waitq's
	 * set/prepostid has not changed since we first walked its linkage,
	 * we're done.
	 */
	if ((waitq->waitq_set_id == 0) ||
	    (waitq->waitq_set_id == prev_setid &&
	    waitq->waitq_prepost_id == prev_prepostid)) {
		if (keep_locked) {
			goto out;
		}
		goto out_unlock;
	}

	/* membership changed while unlocked: re-count the worst case */
	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);

	if (npreposts > nalloc) {
		/* snapshot the new membership and top up the reservation */
		prev_setid = waitq->waitq_set_id;
		prev_prepostid = waitq->waitq_prepost_id;
		npreposts = npreposts - nalloc; /* only allocate the diff */
		goto try_alloc;
	}

	if (keep_locked) {
		goto out;
	}

out_unlock:
	waitq_unlock(waitq);
out:
	/* head of the reserved (rlink'd) list, or 0 if nothing was reserved */
	if (wqp) {
		reserved = wqp->wqp_prepostid.id;
	}

	return reserved;
}
5190
5191/**
5192 * release a linked list of prepost objects allocated via _prepost_reserve
5193 *
5194 * Conditions:
5195 * may (rarely) spin waiting for prepost table growth memcpy
5196 */
5197void
5198waitq_prepost_release_reserve(uint64_t id)
5199{
5200 struct wq_prepost *wqp;
5201
5202 wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);
5203
5204 wqp = wq_prepost_rfirst(id);
5205 if (!wqp) {
5206 return;
5207 }
5208
5209 wq_prepost_release_rlist(wqp);
5210}
5211
5212
5213/**
5214 * clear all preposts from 'wqset'
5215 *
5216 * Conditions:
5217 * 'wqset' is not locked
5218 */
void
waitq_set_clear_preposts(struct waitq_set *wqset)
{
	uint64_t prepost_id;
	/* only valid/used when the set's waitq is IRQ safe */
	spl_t spl;

	assert(waitqs_is_set(wqset));

	/* nothing to do if the set can't prepost or has no preposts */
	if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id) {
		return;
	}

	wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
	    wqset->wqset_id);

	if (waitq_irq_safe(&wqset->wqset_q)) {
		spl = splsched();
	}
	waitq_set_lock(wqset);
	/* detach the prepost chain under the lock, then tear it down outside */
	prepost_id = wqset->wqset_prepost_id;
	wqset->wqset_prepost_id = 0;
	waitq_set_unlock(wqset);
	if (waitq_irq_safe(&wqset->wqset_q)) {
		splx(spl);
	}

	/* drop / unlink all the prepost table objects */
	if (prepost_id) {
		(void)wq_prepost_iterate(prepost_id, NULL,
		    wqset_clear_prepost_chain_cb);
	}
}
5251
5252
5253/* ----------------------------------------------------------------------
5254 *
5255 * Iteration: waitq -> sets / waitq_set -> preposts
5256 *
5257 * ---------------------------------------------------------------------- */
5258
/* context passed through the link/prepost walkers to the external iterator */
struct wq_it_ctx {
	void *input;            /* the waitq or waitq_set being iterated */
	void *ctx;              /* caller-supplied opaque context */
	waitq_iterator_t it;    /* external iterator callback */
};
5264
5265static int
5266waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
5267 struct waitq_link *link)
5268{
5269 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5270 struct waitq_set *wqset;
5271 int ret;
5272
5273 (void)waitq;
5274 assert(!waitq_irq_safe(waitq));
5275 assert(wql_type(link) == WQL_WQS);
5276
5277 /*
5278 * the waitq is locked, so we can just take the set lock
5279 * and call the iterator function
5280 */
5281 wqset = link->wql_wqs.wql_set;
5282 assert(wqset != NULL);
5283 assert(!waitq_irq_safe(&wqset->wqset_q));
5284 waitq_set_lock(wqset);
5285
5286 ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
5287
5288 waitq_set_unlock(wqset);
5289 return ret;
5290}
5291
5292/**
5293 * call external iterator function for each prepost object in wqset
5294 *
5295 * Conditions:
5296 * Called from wq_prepost_foreach_locked
5297 * (wqset locked, waitq _not_ locked)
5298 */
5299static int
5300wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
5301 struct wq_prepost *wqp, struct waitq *waitq)
5302{
5303 struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5304 uint64_t wqp_id;
5305 int ret;
5306
5307 (void)wqp;
5308
5309 /*
5310 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
5311 * Taking the 'waitq' lock is a lock order violation, so we need to be
5312 * careful. We also must realize that we may have taken a reference to
5313 * the 'wqp' just as the associated waitq was being torn down (or
5314 * clearing all its preposts) - see waitq_clear_prepost_locked(). If
5315 * the 'wqp' is valid and we can get the waitq lock, then we are good
5316 * to go. If not, we need to back off, check that the 'wqp' hasn't
5317 * been invalidated, and try to re-take the locks.
5318 */
5319 assert(!waitq_irq_safe(waitq));
5320
5321 if (waitq_lock_try(waitq)) {
5322 goto call_iterator;
5323 }
5324
5325 if (!wqp_is_valid(wqp)) {
5326 return WQ_ITERATE_RESTART;
5327 }
5328
5329 /* We are passed a prepost object with a reference on it. If neither
5330 * the waitq set nor the waitq require interrupts disabled, then we
5331 * may block on the delay(1) call below. We can't hold a prepost
5332 * object reference while blocking, so we have to give that up as well
5333 * and re-acquire it when we come back.
5334 */
5335 wqp_id = wqp->wqp_prepostid.id;
5336 wq_prepost_put(wqp);
5337 waitq_set_unlock(wqset);
5338 wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
5339 wqset, wqp, wqp->wqp_prepostid.id, waitq);
5340 delay(1);
5341 waitq_set_lock(wqset);
5342 wqp = wq_prepost_get(wqp_id);
5343 if (!wqp) {
5344 /* someone cleared preposts while we slept! */
5345 return WQ_ITERATE_DROPPED;
5346 }
5347
5348 /*
5349 * TODO:
5350 * This differs slightly from the logic in ipc_mqueue.c:
5351 * ipc_mqueue_receive_on_thread(). There, if the waitq lock
5352 * can't be obtained, the prepost link is placed on the back of
5353 * the chain, and the iteration starts from the beginning. Here,
5354 * we just restart from the beginning.
5355 */
5356 return WQ_ITERATE_RESTART;
5357
5358call_iterator:
5359 if (!wqp_is_valid(wqp)) {
5360 ret = WQ_ITERATE_RESTART;
5361 goto out_unlock;
5362 }
5363
5364 /* call the external callback */
5365 ret = wctx->it(wctx->ctx, waitq, wqset);
5366
5367 if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
5368 ret = WQ_ITERATE_BREAK;
5369 goto out;
5370 }
5371
5372out_unlock:
5373 waitq_unlock(waitq);
5374out:
5375 return ret;
5376}
5377
5378/**
5379 * iterator over all sets to which the given waitq has been linked
5380 *
5381 * Conditions:
5382 * 'waitq' is locked
5383 */
5384int
5385waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
5386{
5387 int ret;
5388 struct wq_it_ctx wctx = {
5389 .input = (void *)waitq,
5390 .ctx = ctx,
5391 .it = it,
5392 };
5393 if (!it || !waitq) {
5394 return KERN_INVALID_ARGUMENT;
5395 }
5396
5397 ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
5398 WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);
5399 if (ret == WQ_ITERATE_CONTINUE) {
5400 ret = WQ_ITERATE_SUCCESS;
5401 }
5402 return ret;
5403}
5404
5405/**
5406 * iterator over all preposts in the given wqset
5407 *
5408 * Conditions:
5409 * 'wqset' is locked
5410 */
5411int
5412waitq_set_iterate_preposts(struct waitq_set *wqset,
5413 void *ctx, waitq_iterator_t it)
5414{
5415 struct wq_it_ctx wctx = {
5416 .input = (void *)wqset,
5417 .ctx = ctx,
5418 .it = it,
5419 };
5420 if (!it || !wqset) {
5421 return WQ_ITERATE_INVALID;
5422 }
5423
5424 assert(waitq_held(&wqset->wqset_q));
5425
5426 return wq_prepost_foreach_locked(wqset, (void *)&wctx,
5427 wqset_iterate_prepost_cb);
5428}
5429
5430
5431/* ----------------------------------------------------------------------
5432 *
5433 * Higher-level APIs
5434 *
5435 * ---------------------------------------------------------------------- */
5436
5437
5438/**
5439 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5440 *
5441 * Conditions:
5442 * 'waitq' is not locked
5443 */
5444wait_result_t
5445waitq_assert_wait64(struct waitq *waitq,
5446 event64_t wait_event,
5447 wait_interrupt_t interruptible,
5448 uint64_t deadline)
5449{
5450 thread_t thread = current_thread();
5451 wait_result_t ret;
5452 spl_t s;
5453
5454 if (!waitq_valid(waitq)) {
5455 panic("Invalid waitq: %p", waitq);
5456 }
5457
5458 if (waitq_irq_safe(waitq)) {
5459 s = splsched();
5460 }
5461
5462 waitq_lock(waitq);
5463 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5464 TIMEOUT_URGENCY_SYS_NORMAL,
5465 deadline, TIMEOUT_NO_LEEWAY, thread);
5466 waitq_unlock(waitq);
5467
5468 if (waitq_irq_safe(waitq)) {
5469 splx(s);
5470 }
5471
5472 return ret;
5473}
5474
5475/**
5476 * declare a thread's intent to wait on 'waitq' for 'wait_event'
5477 *
5478 * Conditions:
5479 * 'waitq' is not locked
5480 * will disable and re-enable interrupts while locking current_thread()
5481 */
5482wait_result_t
5483waitq_assert_wait64_leeway(struct waitq *waitq,
5484 event64_t wait_event,
5485 wait_interrupt_t interruptible,
5486 wait_timeout_urgency_t urgency,
5487 uint64_t deadline,
5488 uint64_t leeway)
5489{
5490 wait_result_t ret;
5491 thread_t thread = current_thread();
5492 spl_t s;
5493
5494 if (!waitq_valid(waitq)) {
5495 panic("Invalid waitq: %p", waitq);
5496 }
5497
5498 if (waitq_irq_safe(waitq)) {
5499 s = splsched();
5500 }
5501
5502 waitq_lock(waitq);
5503 ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5504 urgency, deadline, leeway, thread);
5505 waitq_unlock(waitq);
5506
5507 if (waitq_irq_safe(waitq)) {
5508 splx(s);
5509 }
5510
5511 return ret;
5512}
5513
5514/**
5515 * wakeup a single thread from a waitq that's waiting for a given event
5516 *
5517 * Conditions:
5518 * 'waitq' is not locked
5519 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5520 * may disable and re-enable interrupts
5521 *
5522 * Notes:
5523 * will _not_ block if waitq is global (or not a member of any set)
5524 */
kern_return_t
waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
    wait_result_t result, int priority)
{
	kern_return_t kr;
	uint64_t reserved_preposts = 0;
	spl_t spl;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	if (!waitq_irq_safe(waitq)) {
		/* reserve preposts in addition to locking the waitq */
		/* (WAITQ_KEEP_LOCKED: returns with 'waitq' locked) */
		reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
	} else {
		spl = splsched();
		waitq_lock(waitq);
	}

	/* waitq is locked upon return */
	kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
	    &reserved_preposts, priority, WAITQ_UNLOCK, WQ_OPTION_NONE);

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	/* release any left-over prepost object (won't block/lock anything) */
	waitq_prepost_release_reserve(reserved_preposts);

	return kr;
}
5558
5559/**
5560 * wakeup all threads from a waitq that are waiting for a given event
5561 *
5562 * Conditions:
5563 * 'waitq' is not locked
5564 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5565 * may disable and re-enable interrupts
5566 *
5567 * Notes:
5568 * will _not_ block if waitq is global (or not a member of any set)
5569 */
kern_return_t
waitq_wakeup64_all(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    int priority)
{
	kern_return_t ret;
	uint64_t reserved_preposts = 0;
	spl_t s;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	if (!waitq_irq_safe(waitq)) {
		/* reserve preposts in addition to locking waitq */
		/* (WAITQ_KEEP_LOCKED: returns with 'waitq' locked) */
		reserved_preposts = waitq_prepost_reserve(waitq, 0,
		    WAITQ_KEEP_LOCKED);
	} else {
		s = splsched();
		waitq_lock(waitq);
	}

	/* waitq is unlocked by the _locked variant (WAITQ_UNLOCK) */
	ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
	    &reserved_preposts, priority,
	    WAITQ_UNLOCK);

	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	/* release any left-over reserved prepost objects */
	waitq_prepost_release_reserve(reserved_preposts);

	return ret;
}
5605
5606/**
5607 * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
5608 *
5609 * Conditions:
5610 * 'waitq' is not locked
5611 *
5612 * Notes:
5613 * May temporarily disable and re-enable interrupts
5614 */
kern_return_t
waitq_wakeup64_thread(struct waitq *waitq,
    event64_t wake_event,
    thread_t thread,
    wait_result_t result)
{
	kern_return_t ret;
	/* 's' is only set/used when waitq is IRQ safe; 'th_spl' is set by
	 * waitq_select_thread_locked on success */
	spl_t s, th_spl;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	if (waitq_irq_safe(waitq)) {
		s = splsched();
	}
	waitq_lock(waitq);

	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
	/* on success, returns 'thread' locked */

	waitq_unlock(waitq);

	if (ret == KERN_SUCCESS) {
		/* make the thread runnable with the given wait result */
		ret = thread_go(thread, result, WQ_OPTION_NONE);
		assert(ret == KERN_SUCCESS);
		thread_unlock(thread);
		splx(th_spl);
		waitq_stats_count_wakeup(waitq);
	} else {
		ret = KERN_NOT_WAITING;
		waitq_stats_count_fail(waitq);
	}

	if (waitq_irq_safe(waitq)) {
		splx(s);
	}

	return ret;
}
5655
5656/**
5657 * wakeup a single thread from a waitq that's waiting for a given event
5658 * and return a reference to that thread
5659 * returns THREAD_NULL if no thread was waiting
5660 *
5661 * Conditions:
5662 * 'waitq' is not locked
5663 * may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5664 * may disable and re-enable interrupts
5665 *
5666 * Notes:
5667 * will _not_ block if waitq is global (or not a member of any set)
5668 */
thread_t
waitq_wakeup64_identify(struct waitq *waitq,
    event64_t wake_event,
    wait_result_t result,
    int priority)
{
	uint64_t reserved_preposts = 0;
	/* set by waitq_wakeup64_identify_locked when a thread is returned */
	spl_t thread_spl = 0;
	thread_t thread;
	spl_t spl;

	if (!waitq_valid(waitq)) {
		panic("Invalid waitq: %p", waitq);
	}

	if (!waitq_irq_safe(waitq)) {
		/* reserve preposts in addition to locking waitq */
		/* (WAITQ_KEEP_LOCKED: returns with 'waitq' locked) */
		reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
	} else {
		spl = splsched();
		waitq_lock(waitq);
	}

	thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
	    &thread_spl, &reserved_preposts,
	    priority, WAITQ_UNLOCK);
	/* waitq is unlocked, thread is locked */

	if (thread != THREAD_NULL) {
		/* take the +1 ref promised to the caller before unlocking */
		thread_reference(thread);
		thread_unlock(thread);
		splx(thread_spl);
	}

	if (waitq_irq_safe(waitq)) {
		splx(spl);
	}

	/* release any left-over prepost object (won't block/lock anything) */
	waitq_prepost_release_reserve(reserved_preposts);

	/* returns +1 ref to running thread or THREAD_NULL */
	return thread;
}