1 | /* | |
2 | * Copyright (c) 2000-2012 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */ | |
29 | /* | |
30 | * pthread_synch.c | |
31 | */ | |
32 | ||
33 | #pragma mark - Front Matter | |
34 | ||
35 | #define _PTHREAD_CONDATTR_T | |
36 | #define _PTHREAD_COND_T | |
37 | #define _PTHREAD_MUTEXATTR_T | |
38 | #define _PTHREAD_MUTEX_T | |
39 | #define _PTHREAD_RWLOCKATTR_T | |
40 | #define _PTHREAD_RWLOCK_T | |
41 | ||
42 | #undef pthread_mutexattr_t | |
43 | #undef pthread_mutex_t | |
44 | #undef pthread_condattr_t | |
45 | #undef pthread_cond_t | |
46 | #undef pthread_rwlockattr_t | |
47 | #undef pthread_rwlock_t | |
48 | ||
49 | #include <sys/param.h> | |
50 | #include <sys/queue.h> | |
51 | #include <sys/resourcevar.h> | |
52 | //#include <sys/proc_internal.h> | |
53 | #include <sys/kauth.h> | |
54 | #include <sys/systm.h> | |
55 | #include <sys/timeb.h> | |
56 | #include <sys/times.h> | |
57 | #include <sys/acct.h> | |
58 | #include <sys/kernel.h> | |
59 | #include <sys/wait.h> | |
60 | #include <sys/signalvar.h> | |
61 | #include <sys/sysctl.h> | |
62 | #include <sys/syslog.h> | |
63 | #include <sys/stat.h> | |
64 | #include <sys/lock.h> | |
65 | #include <sys/kdebug.h> | |
66 | //#include <sys/sysproto.h> | |
67 | #include <sys/vm.h> | |
68 | #include <sys/user.h> /* for coredump */ | |
69 | #include <sys/proc_info.h> /* for fill_procworkqueue */ | |
70 | ||
71 | #include <mach/mach_port.h> | |
72 | #include <mach/mach_types.h> | |
73 | #include <mach/semaphore.h> | |
74 | #include <mach/sync_policy.h> | |
75 | #include <mach/task.h> | |
76 | #include <mach/vm_prot.h> | |
77 | #include <kern/kern_types.h> | |
78 | #include <kern/task.h> | |
79 | #include <kern/clock.h> | |
80 | #include <mach/kern_return.h> | |
81 | #include <kern/thread.h> | |
82 | #include <kern/sched_prim.h> | |
83 | #include <kern/kalloc.h> | |
84 | #include <kern/sched_prim.h> /* for thread_exception_return */ | |
85 | #include <kern/processor.h> | |
86 | #include <kern/assert.h> | |
87 | #include <mach/mach_vm.h> | |
88 | #include <mach/mach_param.h> | |
89 | #include <mach/thread_status.h> | |
90 | #include <mach/thread_policy.h> | |
91 | #include <mach/message.h> | |
92 | #include <mach/port.h> | |
93 | //#include <vm/vm_protos.h> | |
94 | #include <vm/vm_fault.h> | |
95 | #include <vm/vm_map.h> | |
96 | #include <mach/thread_act.h> /* for thread_resume */ | |
97 | #include <machine/machine_routines.h> | |
98 | #include <mach/shared_region.h> | |
99 | ||
100 | #include <libkern/OSAtomic.h> | |
101 | ||
102 | #include <sys/pthread_shims.h> | |
103 | #include "kern_internal.h" | |
104 | ||
105 | #if DEBUG | |
106 | #define kevent_qos_internal kevent_qos_internal_stub | |
107 | static int kevent_qos_internal_stub(__unused struct proc *p, __unused int fd, | |
108 | __unused user_addr_t changelist, __unused int nchanges, | |
109 | __unused user_addr_t eventlist, __unused int nevents, | |
110 | __unused user_addr_t data_out, user_size_t *data_available, | |
111 | __unused unsigned int flags, int32_t *retval){ | |
112 | if (data_available){ | |
113 | static int i = 0; | |
114 | switch (i++ % 4) { | |
115 | case 0: | |
116 | case 2: | |
117 | *data_available = *data_available / 2; | |
118 | *retval = 4; | |
119 | break; | |
120 | case 1: | |
121 | *data_available = 0; | |
122 | *retval = 4; | |
123 | break; | |
124 | case 3: | |
125 | *retval = 0; | |
126 | break; | |
127 | } | |
128 | } else { | |
129 | *retval = 0; | |
130 | } | |
131 | return 0; | |
132 | } | |
133 | #endif /* DEBUG */ | |
134 | ||
135 | uint32_t pthread_debug_tracing = 1; | |
136 | ||
137 | SYSCTL_INT(_kern, OID_AUTO, pthread_debug_tracing, CTLFLAG_RW | CTLFLAG_LOCKED, | |
138 | &pthread_debug_tracing, 0, "") | |
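/*
 * The sysctl above exposes this flag as kern.pthread_debug_tracing, so the
 * PTHREAD_TRACE points below can be toggled at runtime from userspace, e.g.
 * (illustrative invocation) `sysctl -w kern.pthread_debug_tracing=0`.
 */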
139 | ||
140 | // XXX: Dirty import from sys/signalvar.h that's wrapped in BSD_KERNEL_PRIVATE | |
141 | #define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP)) | |
142 | ||
143 | lck_grp_attr_t *pthread_lck_grp_attr; | |
144 | lck_grp_t *pthread_lck_grp; | |
145 | lck_attr_t *pthread_lck_attr; | |
146 | ||
147 | extern void thread_set_cthreadself(thread_t thread, uint64_t pself, int isLP64); | |
148 | extern void workqueue_thread_yielded(void); | |
149 | ||
150 | enum run_nextreq_mode {RUN_NEXTREQ_DEFAULT, RUN_NEXTREQ_OVERCOMMIT, RUN_NEXTREQ_DEFERRED_OVERCOMMIT, RUN_NEXTREQ_UNCONSTRAINED, RUN_NEXTREQ_EVENT_MANAGER}; | |
151 | static boolean_t workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t th, enum run_nextreq_mode mode, pthread_priority_t oc_prio); | |
152 | ||
153 | static boolean_t workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority); | |
154 | ||
155 | static void wq_runreq(proc_t p, pthread_priority_t priority, thread_t th, struct threadlist *tl, | |
156 | int reuse_thread, int wake_thread, int return_directly); | |
157 | ||
158 | static int _setup_wqthread(proc_t p, thread_t th, pthread_priority_t priority, int reuse_thread, struct threadlist *tl); | |
159 | ||
160 | static void wq_unpark_continue(void); | |
161 | static void wq_unsuspend_continue(void); | |
162 | ||
163 | static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit); | |
164 | static void workqueue_removethread(struct threadlist *tl, int fromexit); | |
165 | static void workqueue_lock_spin(proc_t); | |
166 | static void workqueue_unlock(proc_t); | |
167 | ||
168 | static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer); | |
169 | ||
170 | static mach_vm_offset_t stackaddr_hint(proc_t p); | |
171 | ||
172 | int proc_settargetconc(pid_t pid, int queuenum, int32_t targetconc); | |
173 | int proc_setalltargetconc(pid_t pid, int32_t * targetconcp); | |
174 | ||
175 | #define WQ_MAXPRI_MIN 0 /* low prio queue num */ | |
176 | #define WQ_MAXPRI_MAX 2 /* max prio queuenum */ | |
177 | #define WQ_PRI_NUM 3 /* number of prio work queues */ | |
178 | ||
179 | #define C_32_STK_ALIGN 16 | |
180 | #define C_64_STK_ALIGN 16 | |
181 | #define C_64_REDZONE_LEN 128 | |
182 | ||
183 | #define PTHREAD_T_OFFSET 0 | |
184 | ||
185 | /* | |
186 | * Flags field passed to bsdthread_create and back in pthread_start | |
187 | 31 <---------------------------------> 0 | |
188 | _________________________________________ | |
189 | | flags(8) | policy(8) | importance(16) | | |
190 | ----------------------------------------- | |
191 | */ | |
192 | ||
193 | #define PTHREAD_START_CUSTOM 0x01000000 | |
194 | #define PTHREAD_START_SETSCHED 0x02000000 | |
195 | #define PTHREAD_START_DETACHED 0x04000000 | |
196 | #define PTHREAD_START_QOSCLASS 0x08000000 | |
197 | #define PTHREAD_START_QOSCLASS_MASK 0xffffff | |
198 | #define PTHREAD_START_POLICY_BITSHIFT 16 | |
199 | #define PTHREAD_START_POLICY_MASK 0xff | |
200 | #define PTHREAD_START_IMPORTANCE_MASK 0xffff | |
201 | ||
202 | #define SCHED_OTHER POLICY_TIMESHARE | |
203 | #define SCHED_FIFO POLICY_FIFO | |
204 | #define SCHED_RR POLICY_RR | |
205 | ||
206 | #define BASEPRI_DEFAULT 31 | |
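/*
 * Illustrative example of the flags encoding above (hypothetical values): a
 * PTHREAD_START_SETSCHED request for SCHED_RR at importance 47 would be
 * passed as
 *
 *   flags = PTHREAD_START_SETSCHED
 *         | (SCHED_RR << PTHREAD_START_POLICY_BITSHIFT)
 *         | (47 & PTHREAD_START_IMPORTANCE_MASK);
 *
 * which _bsdthread_create() below decodes into policy SCHED_RR and a
 * THREAD_PRECEDENCE_POLICY importance of 47 - BASEPRI_DEFAULT = 16.
 */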
207 | ||
208 | #pragma mark - Process/Thread Setup/Teardown syscalls | |
209 | ||
210 | static mach_vm_offset_t stackaddr_hint(proc_t p __unused){ | |
211 | mach_vm_offset_t stackaddr; | |
212 | #if defined(__i386__) || defined(__x86_64__) | |
213 | if (proc_is64bit(p)){ | |
214 | // Above nanomalloc range (see NANOZONE_SIGNATURE) | |
215 | stackaddr = 0x700000000000; | |
216 | } else { | |
217 | stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386; | |
218 | } | |
219 | #elif defined(__arm__) || defined(__arm64__) | |
220 | if (proc_is64bit(p)){ | |
221 | // 64 stacks below nanomalloc (see NANOZONE_SIGNATURE) | |
222 | stackaddr = 0x170000000 - 64 * PTH_DEFAULT_STACKSIZE; | |
223 | #if defined(__arm__) | |
224 | } else if (pthread_kern->map_is_1gb(get_task_map(pthread_kern->proc_get_task(p)))){ | |
225 | stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE; | |
226 | #endif | |
227 | } else { | |
228 | stackaddr = SHARED_REGION_BASE_ARM + SHARED_REGION_SIZE_ARM; | |
229 | } | |
230 | #else | |
231 | #error Need to define a stack address hint for this architecture | |
232 | #endif | |
233 | return stackaddr; | |
234 | } | |
235 | ||
236 | /** | |
237 | * bsdthread_create system call. Used by pthread_create. | |
238 | */ | |
239 | int | |
240 | _bsdthread_create(struct proc *p, user_addr_t user_func, user_addr_t user_funcarg, user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags, user_addr_t *retval) | |
241 | { | |
242 | kern_return_t kret; | |
243 | void * sright; | |
244 | int error = 0; | |
245 | int allocated = 0; | |
246 | mach_vm_offset_t stackaddr; | |
247 | mach_vm_size_t th_allocsize = 0; | |
248 | mach_vm_size_t th_guardsize; | |
249 | mach_vm_offset_t th_stack; | |
250 | mach_vm_offset_t th_pthread; | |
251 | mach_port_name_t th_thport; | |
252 | thread_t th; | |
253 | vm_map_t vmap = pthread_kern->current_map(); | |
254 | task_t ctask = current_task(); | |
255 | unsigned int policy, importance; | |
256 | ||
257 | int isLP64 = 0; | |
258 | ||
259 | if (pthread_kern->proc_get_register(p) == 0) { | |
260 | return EINVAL; | |
261 | } | |
262 | ||
263 | PTHREAD_TRACE(TRACE_pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0, 0); | |
264 | ||
265 | isLP64 = proc_is64bit(p); | |
266 | th_guardsize = vm_map_page_size(vmap); | |
267 | ||
268 | stackaddr = stackaddr_hint(p); | |
269 | kret = pthread_kern->thread_create(ctask, &th); | |
270 | if (kret != KERN_SUCCESS) | |
271 | return(ENOMEM); | |
272 | thread_reference(th); | |
273 | ||
274 | sright = (void *)pthread_kern->convert_thread_to_port(th); | |
275 | th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask)); | |
276 | ||
277 | if ((flags & PTHREAD_START_CUSTOM) == 0) { | |
278 | mach_vm_size_t pthread_size = | |
279 | vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(vmap)); | |
280 | th_allocsize = th_guardsize + user_stack + pthread_size; | |
281 | user_stack += PTHREAD_T_OFFSET; | |
282 | ||
283 | kret = mach_vm_map(vmap, &stackaddr, | |
284 | th_allocsize, | |
285 | page_size-1, | |
286 | VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL, | |
287 | 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, | |
288 | VM_INHERIT_DEFAULT); | |
289 | if (kret != KERN_SUCCESS){ | |
290 | kret = mach_vm_allocate(vmap, | |
291 | &stackaddr, th_allocsize, | |
292 | VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE); | |
293 | } | |
294 | if (kret != KERN_SUCCESS) { | |
295 | error = ENOMEM; | |
296 | goto out; | |
297 | } | |
298 | ||
299 | PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, th_allocsize, stackaddr, 0, 2, 0); | |
300 | ||
301 | allocated = 1; | |
302 | /* | |
303 | * The guard page is at the lowest address | |
304 | * The stack base is the highest address | |
305 | */ | |
306 | kret = mach_vm_protect(vmap, stackaddr, th_guardsize, FALSE, VM_PROT_NONE); | |
307 | ||
308 | if (kret != KERN_SUCCESS) { | |
309 | error = ENOMEM; | |
310 | goto out1; | |
311 | } | |
312 | ||
313 | th_pthread = stackaddr + th_guardsize + user_stack; | |
314 | th_stack = th_pthread; | |
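/*
 * Layout sketch of the allocation, derived from the arithmetic above
 * (addresses increase to the right):
 *
 *   stackaddr                                       stackaddr + th_allocsize
 *   | guard page     | stack (grows downward)      | pthread_t + TSD        |
 *   |<-th_guardsize->|<-------- user_stack -------->
 *                                                   ^-- th_stack == th_pthread
 */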
315 | ||
316 | /* | |
317 | * Pre-fault the first page of the new thread's stack and the page that will | |
318 | * contain the pthread_t structure. | |
319 | */ | |
320 | if (vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)) != | |
321 | vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap))){ | |
322 | vm_fault( vmap, | |
323 | vm_map_trunc_page_mask((vm_map_offset_t)(th_stack - C_64_REDZONE_LEN), vm_map_page_mask(vmap)), | |
324 | VM_PROT_READ | VM_PROT_WRITE, | |
325 | FALSE, | |
326 | THREAD_UNINT, NULL, 0); | |
327 | } | |
328 | ||
329 | vm_fault( vmap, | |
330 | vm_map_trunc_page_mask((vm_map_offset_t)th_pthread, vm_map_page_mask(vmap)), | |
331 | VM_PROT_READ | VM_PROT_WRITE, | |
332 | FALSE, | |
333 | THREAD_UNINT, NULL, 0); | |
334 | ||
335 | } else { | |
336 | th_stack = user_stack; | |
337 | th_pthread = user_pthread; | |
338 | ||
339 | PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3, 0); | |
340 | } | |
341 | ||
342 | #if defined(__i386__) || defined(__x86_64__) | |
343 | /* | |
344 | * Set up i386 registers & function call. | |
345 | */ | |
346 | if (isLP64 == 0) { | |
347 | x86_thread_state32_t state = { | |
348 | .eip = (unsigned int)pthread_kern->proc_get_threadstart(p), | |
349 | .eax = (unsigned int)th_pthread, | |
350 | .ebx = (unsigned int)th_thport, | |
351 | .ecx = (unsigned int)user_func, | |
352 | .edx = (unsigned int)user_funcarg, | |
353 | .edi = (unsigned int)user_stack, | |
354 | .esi = (unsigned int)flags, | |
355 | /* | |
356 | * set stack pointer | |
357 | */ | |
358 | .esp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN)) | |
359 | }; | |
360 | ||
361 | error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state); | |
362 | if (error != KERN_SUCCESS) { | |
363 | error = EINVAL; | |
364 | goto out; | |
365 | } | |
366 | } else { | |
367 | x86_thread_state64_t state64 = { | |
368 | .rip = (uint64_t)pthread_kern->proc_get_threadstart(p), | |
369 | .rdi = (uint64_t)th_pthread, | |
370 | .rsi = (uint64_t)(th_thport), | |
371 | .rdx = (uint64_t)user_func, | |
372 | .rcx = (uint64_t)user_funcarg, | |
373 | .r8 = (uint64_t)user_stack, | |
374 | .r9 = (uint64_t)flags, | |
375 | /* | |
376 | * set stack pointer aligned to 16 byte boundary | |
377 | */ | |
378 | .rsp = (uint64_t)(th_stack - C_64_REDZONE_LEN) | |
379 | }; | |
380 | ||
381 | error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64); | |
382 | if (error != KERN_SUCCESS) { | |
383 | error = EINVAL; | |
384 | goto out; | |
385 | } | |
386 | ||
387 | } | |
388 | #elif defined(__arm__) | |
389 | arm_thread_state_t state = { | |
390 | .pc = (int)pthread_kern->proc_get_threadstart(p), | |
391 | .r[0] = (unsigned int)th_pthread, | |
392 | .r[1] = (unsigned int)th_thport, | |
393 | .r[2] = (unsigned int)user_func, | |
394 | .r[3] = (unsigned int)user_funcarg, | |
395 | .r[4] = (unsigned int)user_stack, | |
396 | .r[5] = (unsigned int)flags, | |
397 | ||
398 | /* Set r7 & lr to 0 for better back tracing */ | |
399 | .r[7] = 0, | |
400 | .lr = 0, | |
401 | ||
402 | /* | |
403 | * set stack pointer | |
404 | */ | |
405 | .sp = (int)((vm_offset_t)(th_stack-C_32_STK_ALIGN)) | |
406 | }; | |
407 | ||
408 | (void) pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state); | |
409 | ||
410 | #else | |
411 | #error bsdthread_create not defined for this architecture | |
412 | #endif | |
413 | ||
414 | if ((flags & PTHREAD_START_SETSCHED) != 0) { | |
415 | /* Set scheduling parameters if needed */ | |
416 | thread_extended_policy_data_t extinfo; | |
417 | thread_precedence_policy_data_t precedinfo; | |
418 | ||
419 | importance = (flags & PTHREAD_START_IMPORTANCE_MASK); | |
420 | policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK; | |
421 | ||
422 | if (policy == SCHED_OTHER) { | |
423 | extinfo.timeshare = 1; | |
424 | } else { | |
425 | extinfo.timeshare = 0; | |
426 | } | |
427 | ||
428 | thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT); | |
429 | ||
430 | precedinfo.importance = (importance - BASEPRI_DEFAULT); | |
431 | thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); | |
432 | } else if ((flags & PTHREAD_START_QOSCLASS) != 0) { | |
433 | /* Set thread QoS class if requested. */ | |
434 | pthread_priority_t priority = (pthread_priority_t)(flags & PTHREAD_START_QOSCLASS_MASK); | |
435 | ||
436 | thread_qos_policy_data_t qos; | |
437 | qos.qos_tier = pthread_priority_get_qos_class(priority); | |
438 | qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : | |
439 | _pthread_priority_get_relpri(priority); | |
440 | ||
441 | pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT); | |
442 | } | |
443 | ||
444 | kret = pthread_kern->thread_resume(th); | |
445 | if (kret != KERN_SUCCESS) { | |
446 | error = EINVAL; | |
447 | goto out1; | |
448 | } | |
449 | thread_deallocate(th); /* drop the creator reference */ | |
450 | ||
451 | PTHREAD_TRACE(TRACE_pthread_thread_create|DBG_FUNC_END, error, th_pthread, 0, 0, 0); | |
452 | ||
453 | // cast required as mach_vm_offset_t is always 64 bits even on 32-bit platforms | |
454 | *retval = (user_addr_t)th_pthread; | |
455 | ||
456 | return(0); | |
457 | ||
458 | out1: | |
459 | if (allocated != 0) { | |
460 | (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize); | |
461 | } | |
462 | out: | |
463 | (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport); | |
464 | (void)thread_terminate(th); | |
465 | (void)thread_deallocate(th); | |
466 | return(error); | |
467 | } | |
468 | ||
469 | /** | |
470 | * bsdthread_terminate system call. Used by pthread_terminate | |
471 | */ | |
472 | int | |
473 | _bsdthread_terminate(__unused struct proc *p, | |
474 | user_addr_t stackaddr, | |
475 | size_t size, | |
476 | uint32_t kthport, | |
477 | uint32_t sem, | |
478 | __unused int32_t *retval) | |
479 | { | |
480 | mach_vm_offset_t freeaddr; | |
481 | mach_vm_size_t freesize; | |
482 | kern_return_t kret; | |
483 | ||
484 | freeaddr = (mach_vm_offset_t)stackaddr; | |
485 | freesize = size; | |
486 | ||
487 | PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff, 0); | |
488 | ||
489 | if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) { | |
490 | kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize); | |
491 | if (kret != KERN_SUCCESS) { | |
492 | PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0); | |
493 | return(EINVAL); | |
494 | } | |
495 | } | |
496 | ||
497 | (void) thread_terminate(current_thread()); | |
498 | if (sem != MACH_PORT_NULL) { | |
499 | kret = pthread_kern->semaphore_signal_internal_trap(sem); | |
500 | if (kret != KERN_SUCCESS) { | |
501 | PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0, 0); | |
502 | return(EINVAL); | |
503 | } | |
504 | } | |
505 | ||
506 | if (kthport != MACH_PORT_NULL) { | |
507 | pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport); | |
508 | } | |
509 | ||
510 | PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0, 0); | |
511 | ||
512 | pthread_kern->thread_exception_return(); | |
513 | panic("bsdthread_terminate: still running\n"); | |
514 | ||
515 | PTHREAD_TRACE(TRACE_pthread_thread_terminate|DBG_FUNC_END, 0, 0xff, 0, 0, 0); | |
516 | ||
517 | return(0); | |
518 | } | |
519 | ||
520 | /** | |
521 | * bsdthread_register system call. Performs per-process setup. Responsible for | |
522 | * returning capability bits to userspace and receiving userspace function addresses. | |
523 | */ | |
524 | int | |
525 | _bsdthread_register(struct proc *p, | |
526 | user_addr_t threadstart, | |
527 | user_addr_t wqthread, | |
528 | int pthsize, | |
529 | user_addr_t pthread_init_data, | |
530 | user_addr_t targetconc_ptr, | |
531 | uint64_t dispatchqueue_offset, | |
532 | int32_t *retval) | |
533 | { | |
534 | /* prevent multiple registrations */ | |
535 | if (pthread_kern->proc_get_register(p) != 0) { | |
536 | return(EINVAL); | |
537 | } | |
538 | /* syscall randomizer test can pass bogus values */ | |
539 | if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) { | |
540 | return(EINVAL); | |
541 | } | |
542 | pthread_kern->proc_set_threadstart(p, threadstart); | |
543 | pthread_kern->proc_set_wqthread(p, wqthread); | |
544 | pthread_kern->proc_set_pthsize(p, pthsize); | |
545 | pthread_kern->proc_set_register(p); | |
546 | ||
547 | /* If we have pthread_init_data, use it; targetconc_ptr is then reused as the size of that structure rather than as a pointer. */ | |
548 | if (pthread_init_data != 0) { | |
549 | thread_qos_policy_data_t qos; | |
550 | ||
551 | struct _pthread_registration_data data; | |
552 | size_t pthread_init_sz = MIN(sizeof(struct _pthread_registration_data), (size_t)targetconc_ptr); | |
553 | ||
554 | kern_return_t kr = copyin(pthread_init_data, &data, pthread_init_sz); | |
555 | if (kr != KERN_SUCCESS) { | |
556 | return EINVAL; | |
557 | } | |
558 | ||
559 | /* Incoming data from the data structure */ | |
560 | pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset); | |
561 | ||
562 | /* Outgoing data that userspace expects as a reply */ | |
563 | if (pthread_kern->qos_main_thread_active()) { | |
564 | mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT; | |
565 | boolean_t gd = FALSE; | |
566 | ||
567 | kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd); | |
568 | if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) { | |
569 | /* An unspecified QoS means the kernel wants us to impose the legacy (default) QoS class on the thread. */ | |
570 | qos.qos_tier = THREAD_QOS_LEGACY; | |
571 | qos.tier_importance = 0; | |
572 | ||
573 | kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT); | |
574 | } | |
575 | ||
576 | if (kr == KERN_SUCCESS) { | |
577 | data.main_qos = pthread_qos_class_get_priority(qos.qos_tier); | |
578 | } else { | |
579 | data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0); | |
580 | } | |
581 | } else { | |
582 | data.main_qos = _pthread_priority_make_newest(QOS_CLASS_UNSPECIFIED, 0, 0); | |
583 | } | |
584 | ||
585 | kr = copyout(&data, pthread_init_data, pthread_init_sz); | |
586 | if (kr != KERN_SUCCESS) { | |
587 | return EINVAL; | |
588 | } | |
589 | } else { | |
590 | pthread_kern->proc_set_dispatchqueue_offset(p, dispatchqueue_offset); | |
591 | pthread_kern->proc_set_targconc(p, targetconc_ptr); | |
592 | } | |
593 | ||
594 | /* return the supported feature set as the return value. */ | |
595 | *retval = PTHREAD_FEATURE_SUPPORTED; | |
596 | ||
597 | return(0); | |
598 | } | |
599 | ||
600 | #pragma mark - QoS Manipulation | |
601 | ||
602 | int | |
603 | _bsdthread_ctl_set_qos(struct proc *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t tsd_priority_addr, user_addr_t arg3, int *retval) | |
604 | { | |
605 | kern_return_t kr; | |
606 | thread_t th; | |
607 | ||
608 | pthread_priority_t priority; | |
609 | ||
610 | /* Unused parameters must be zero. */ | |
611 | if (arg3 != 0) { | |
612 | return EINVAL; | |
613 | } | |
614 | ||
615 | /* QoS is stored in a given slot in the pthread TSD. We need to copy that in and set our QoS based on it. */ | |
616 | if (proc_is64bit(p)) { | |
617 | uint64_t v; | |
618 | kr = copyin(tsd_priority_addr, &v, sizeof(v)); | |
619 | if (kr != KERN_SUCCESS) { | |
620 | return kr; | |
621 | } | |
622 | priority = (int)(v & 0xffffffff); | |
623 | } else { | |
624 | uint32_t v; | |
625 | kr = copyin(tsd_priority_addr, &v, sizeof(v)); | |
626 | if (kr != KERN_SUCCESS) { | |
627 | return kr; | |
628 | } | |
629 | priority = v; | |
630 | } | |
631 | ||
632 | if ((th = port_name_to_thread(kport)) == THREAD_NULL) { | |
633 | return ESRCH; | |
634 | } | |
635 | ||
636 | /* <rdar://problem/16211829> Disable pthread_set_qos_class_np() on threads other than pthread_self */ | |
637 | if (th != current_thread()) { | |
638 | thread_deallocate(th); | |
639 | return EPERM; | |
640 | } | |
641 | ||
642 | int rv = _bsdthread_ctl_set_self(p, 0, priority, 0, _PTHREAD_SET_SELF_QOS_FLAG, retval); | |
643 | ||
644 | /* Static-param the thread: we just set QoS on it, so it's stuck in QoS land now. */ | |
645 | /* pthread_kern->thread_static_param(th, TRUE); */ // see <rdar://problem/16433744>, for details | |
646 | ||
647 | thread_deallocate(th); | |
648 | ||
649 | return rv; | |
650 | } | |
651 | ||
652 | static inline struct threadlist * | |
653 | util_get_thread_threadlist_entry(thread_t th) | |
654 | { | |
655 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
656 | if (uth) { | |
657 | struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth); | |
658 | return tl; | |
659 | } | |
660 | return NULL; | |
661 | } | |
662 | ||
663 | static inline void | |
664 | wq_thread_override_reset(thread_t th, user_addr_t resource) | |
665 | { | |
666 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
667 | struct threadlist *tl = pthread_kern->uthread_get_threadlist(uth); | |
668 | ||
669 | if (tl) { | |
670 | /* | |
671 | * Drop all outstanding overrides on this thread, done outside the wq lock | |
672 | * because proc_usynch_thread_qos_remove_override_for_resource takes a spinlock that | |
673 | * could cause us to panic. | |
674 | */ | |
675 | PTHREAD_TRACE(TRACE_wq_override_reset | DBG_FUNC_NONE, tl->th_workq, 0, 0, 0, 0); | |
676 | ||
677 | pthread_kern->proc_usynch_thread_qos_reset_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE); | |
678 | } | |
679 | } | |
680 | ||
681 | int | |
682 | _bsdthread_ctl_set_self(struct proc *p, user_addr_t __unused cmd, pthread_priority_t priority, mach_port_name_t voucher, _pthread_set_flags_t flags, int __unused *retval) | |
683 | { | |
684 | thread_qos_policy_data_t qos; | |
685 | mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT; | |
686 | boolean_t gd = FALSE; | |
687 | ||
688 | kern_return_t kr; | |
689 | int qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0; | |
690 | ||
691 | if ((flags & _PTHREAD_SET_SELF_QOS_FLAG) != 0) { | |
692 | kr = pthread_kern->thread_policy_get(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd); | |
693 | if (kr != KERN_SUCCESS) { | |
694 | qos_rv = EINVAL; | |
695 | goto voucher; | |
696 | } | |
697 | ||
698 | /* If we have main-thread QoS then we don't allow a thread to come out of QOS_CLASS_UNSPECIFIED. */ | |
699 | if (pthread_kern->qos_main_thread_active() && qos.qos_tier == THREAD_QOS_UNSPECIFIED) { | |
700 | qos_rv = EPERM; | |
701 | goto voucher; | |
702 | } | |
703 | ||
704 | /* Get the work queue for tracing, and the threadlist for bucket manipulation. */ | |
705 | struct workqueue *wq = NULL; | |
706 | struct threadlist *tl = util_get_thread_threadlist_entry(current_thread()); | |
707 | if (tl) { | |
708 | wq = tl->th_workq; | |
709 | } | |
710 | ||
711 | PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_START, wq, qos.qos_tier, qos.tier_importance, 0, 0); | |
712 | ||
713 | qos.qos_tier = pthread_priority_get_qos_class(priority); | |
714 | qos.tier_importance = (qos.qos_tier == QOS_CLASS_UNSPECIFIED) ? 0 : _pthread_priority_get_relpri(priority); | |
715 | ||
716 | kr = pthread_kern->thread_policy_set_internal(current_thread(), THREAD_QOS_POLICY, (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT); | |
717 | if (kr != KERN_SUCCESS) { | |
718 | qos_rv = EINVAL; | |
719 | goto voucher; | |
720 | } | |
721 | ||
722 | /* If we're a workqueue thread, the threadlist item priority needs adjusting, along with the bucket we were running in. */ | |
723 | if (tl) { | |
724 | workqueue_lock_spin(p); | |
725 | ||
726 | /* Fix up counters. */ | |
727 | uint8_t old_bucket = tl->th_priority; | |
728 | uint8_t new_bucket = pthread_priority_get_class_index(priority); | |
729 | ||
730 | uint32_t old_active = OSAddAtomic(-1, &wq->wq_thactive_count[old_bucket]); | |
731 | OSAddAtomic(1, &wq->wq_thactive_count[new_bucket]); | |
732 | ||
733 | wq->wq_thscheduled_count[old_bucket]--; | |
734 | wq->wq_thscheduled_count[new_bucket]++; | |
735 | ||
736 | tl->th_priority = new_bucket; | |
737 | ||
738 | /* If we were at the ceiling of non-overcommitted threads for a given bucket, we have to | |
739 | * reevaluate whether we should start more work. | |
740 | */ | |
741 | if (old_active == wq->wq_reqconc[old_bucket]) { | |
742 | /* workqueue_run_nextreq will drop the workqueue lock in all exit paths. */ | |
743 | (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT, 0); | |
744 | } else { | |
745 | workqueue_unlock(p); | |
746 | } | |
747 | } | |
748 | ||
749 | PTHREAD_TRACE(TRACE_pthread_set_qos_self | DBG_FUNC_END, wq, qos.qos_tier, qos.tier_importance, 0, 0); | |
750 | } | |
751 | ||
752 | voucher: | |
753 | if ((flags & _PTHREAD_SET_SELF_VOUCHER_FLAG) != 0) { | |
754 | kr = pthread_kern->thread_set_voucher_name(voucher); | |
755 | if (kr != KERN_SUCCESS) { | |
756 | voucher_rv = ENOENT; | |
757 | goto fixedpri; | |
758 | } | |
759 | } | |
760 | ||
761 | fixedpri: | |
762 | if ((flags & _PTHREAD_SET_SELF_FIXEDPRIORITY_FLAG) != 0) { | |
763 | thread_extended_policy_data_t extpol = {.timeshare = 0}; | |
764 | thread_t thread = current_thread(); | |
765 | ||
766 | struct threadlist *tl = util_get_thread_threadlist_entry(thread); | |
767 | if (tl) { | |
768 | /* Not allowed on workqueue threads */ | |
769 | fixedpri_rv = ENOTSUP; | |
770 | goto done; | |
771 | } | |
772 | ||
773 | kr = pthread_kern->thread_policy_set_internal(thread, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT); | |
774 | if (kr != KERN_SUCCESS) { | |
775 | fixedpri_rv = EINVAL; | |
776 | goto done; | |
777 | } | |
778 | } else if ((flags & _PTHREAD_SET_SELF_TIMESHARE_FLAG) != 0) { | |
779 | thread_extended_policy_data_t extpol = {.timeshare = 1}; | |
780 | thread_t thread = current_thread(); | |
781 | ||
782 | struct threadlist *tl = util_get_thread_threadlist_entry(thread); | |
783 | if (tl) { | |
784 | /* Not allowed on workqueue threads */ | |
785 | fixedpri_rv = ENOTSUP; | |
786 | goto done; | |
787 | } | |
788 | ||
789 | kr = pthread_kern->thread_policy_set_internal(thread, THREAD_EXTENDED_POLICY, (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT); | |
790 | if (kr != KERN_SUCCESS) { | |
791 | fixedpri_rv = EINVAL; | |
792 | goto done; | |
793 | } | |
794 | } | |
795 | ||
796 | done: | |
797 | if (qos_rv && voucher_rv) { | |
798 | /* Both failed, give that a unique error. */ | |
799 | return EBADMSG; | |
800 | } | |
801 | ||
802 | if (qos_rv) { | |
803 | return qos_rv; | |
804 | } | |
805 | ||
806 | if (voucher_rv) { | |
807 | return voucher_rv; | |
808 | } | |
809 | ||
810 | if (fixedpri_rv) { | |
811 | return fixedpri_rv; | |
812 | } | |
813 | ||
814 | return 0; | |
815 | } | |
816 | ||
817 | int | |
818 | _bsdthread_ctl_qos_override_start(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval) | |
819 | { | |
820 | thread_t th; | |
821 | int rv = 0; | |
822 | ||
823 | if ((th = port_name_to_thread(kport)) == THREAD_NULL) { | |
824 | return ESRCH; | |
825 | } | |
826 | ||
827 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
828 | int override_qos = pthread_priority_get_qos_class(priority); | |
829 | ||
830 | struct threadlist *tl = util_get_thread_threadlist_entry(th); | |
831 | if (tl) { | |
832 | PTHREAD_TRACE(TRACE_wq_override_start | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0); | |
833 | } | |
834 | ||
835 | /* The only failure case would be passing a tid and having the kernel look up the thread; since we pass the uthread directly, this always succeeds. */ | |
836 | pthread_kern->proc_usynch_thread_qos_add_override_for_resource(current_task(), uth, 0, override_qos, TRUE, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE); | |
837 | ||
838 | thread_deallocate(th); | |
839 | return rv; | |
840 | } | |
841 | ||
842 | int | |
843 | _bsdthread_ctl_qos_override_end(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, user_addr_t resource, user_addr_t arg3, int __unused *retval) | |
844 | { | |
845 | thread_t th; | |
846 | int rv = 0; | |
847 | ||
848 | if (arg3 != 0) { | |
849 | return EINVAL; | |
850 | } | |
851 | ||
852 | if ((th = port_name_to_thread(kport)) == THREAD_NULL) { | |
853 | return ESRCH; | |
854 | } | |
855 | ||
856 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
857 | ||
858 | struct threadlist *tl = util_get_thread_threadlist_entry(th); | |
859 | if (tl) { | |
860 | PTHREAD_TRACE(TRACE_wq_override_end | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 0, 0, 0); | |
861 | } | |
862 | ||
863 | pthread_kern->proc_usynch_thread_qos_remove_override_for_resource(current_task(), uth, 0, resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE); | |
864 | ||
865 | thread_deallocate(th); | |
866 | return rv; | |
867 | } | |
868 | ||
869 | int | |
870 | _bsdthread_ctl_qos_override_dispatch(struct proc *p, user_addr_t cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t arg3, int *retval) | |
871 | { | |
872 | if (arg3 != 0) { | |
873 | return EINVAL; | |
874 | } | |
875 | ||
876 | return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, kport, priority, USER_ADDR_NULL, retval); | |
877 | } | |
878 | ||
879 | int | |
880 | _bsdthread_ctl_qos_dispatch_asynchronous_override_add(struct proc __unused *p, user_addr_t __unused cmd, mach_port_name_t kport, pthread_priority_t priority, user_addr_t resource, int __unused *retval) | |
881 | { | |
882 | thread_t th; | |
883 | int rv = 0; | |
884 | ||
885 | if ((th = port_name_to_thread(kport)) == THREAD_NULL) { | |
886 | return ESRCH; | |
887 | } | |
888 | ||
889 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
890 | int override_qos = pthread_priority_get_qos_class(priority); | |
891 | ||
892 | struct threadlist *tl = util_get_thread_threadlist_entry(th); | |
893 | if (!tl) { | |
894 | thread_deallocate(th); | |
895 | return EPERM; | |
896 | } | |
897 | ||
898 | PTHREAD_TRACE(TRACE_wq_override_dispatch | DBG_FUNC_NONE, tl->th_workq, thread_tid(th), 1, priority, 0); | |
899 | ||
900 | /* The only failure case would be passing a tid and having the kernel look up the thread; since we pass the uthread directly, this always succeeds. */ | |
901 | pthread_kern->proc_usynch_thread_qos_add_override_for_resource(current_task(), uth, 0, override_qos, TRUE, resource, THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE); | |
902 | ||
903 | thread_deallocate(th); | |
904 | return rv; | |
905 | } | |
906 | ||
907 | int | |
908 | _bsdthread_ctl_qos_override_reset(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval) | |
909 | { | |
910 | if (arg1 != 0 || arg2 != 0 || arg3 != 0) { | |
911 | return EINVAL; | |
912 | } | |
913 | ||
914 | return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, 1 /* reset_all */, 0, 0, retval); | |
915 | } | |
916 | ||
917 | int | |
918 | _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(struct proc __unused *p, user_addr_t __unused cmd, int reset_all, user_addr_t resource, user_addr_t arg3, int __unused *retval) | |
919 | { | |
920 | thread_t th; | |
921 | struct threadlist *tl; | |
922 | int rv = 0; | |
923 | ||
924 | if ((reset_all && (resource != 0)) || arg3 != 0) { | |
925 | return EINVAL; | |
926 | } | |
927 | ||
928 | th = current_thread(); | |
929 | tl = util_get_thread_threadlist_entry(th); | |
930 | ||
931 | if (tl) { | |
932 | wq_thread_override_reset(th, reset_all ? THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD : resource); | |
933 | } else { | |
934 | rv = EPERM; | |
935 | } | |
936 | ||
937 | return rv; | |
938 | } | |
939 | ||
940 | int | |
941 | _bsdthread_ctl(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval) | |
942 | { | |
943 | switch (cmd) { | |
944 | case BSDTHREAD_CTL_SET_QOS: | |
945 | return _bsdthread_ctl_set_qos(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval); | |
946 | case BSDTHREAD_CTL_QOS_OVERRIDE_START: | |
947 | return _bsdthread_ctl_qos_override_start(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval); | |
948 | case BSDTHREAD_CTL_QOS_OVERRIDE_END: | |
949 | return _bsdthread_ctl_qos_override_end(p, cmd, (mach_port_name_t)arg1, arg2, arg3, retval); | |
950 | case BSDTHREAD_CTL_QOS_OVERRIDE_RESET: | |
951 | return _bsdthread_ctl_qos_override_reset(p, cmd, arg1, arg2, arg3, retval); | |
952 | case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH: | |
953 | return _bsdthread_ctl_qos_override_dispatch(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval); | |
954 | case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD: | |
955 | return _bsdthread_ctl_qos_dispatch_asynchronous_override_add(p, cmd, (mach_port_name_t)arg1, (pthread_priority_t)arg2, arg3, retval); | |
956 | case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET: | |
957 | return _bsdthread_ctl_qos_dispatch_asynchronous_override_reset(p, cmd, (int)arg1, arg2, arg3, retval); | |
958 | case BSDTHREAD_CTL_SET_SELF: | |
959 | return _bsdthread_ctl_set_self(p, cmd, (pthread_priority_t)arg1, (mach_port_name_t)arg2, (_pthread_set_flags_t)arg3, retval); | |
960 | default: | |
961 | return EINVAL; | |
962 | } | |
963 | } | |
964 | ||
965 | #pragma mark - Workqueue Implementation | |
966 | #pragma mark sysctls | |
967 | ||
968 | uint32_t wq_yielded_threshold = WQ_YIELDED_THRESHOLD; | |
969 | uint32_t wq_yielded_window_usecs = WQ_YIELDED_WINDOW_USECS; | |
970 | uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS; | |
971 | uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS; | |
972 | uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS; | |
973 | uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS; | |
974 | uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8; | |
975 | uint32_t wq_max_concurrency = 1; // set to ncpus on load | |
976 | ||
977 | SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, | |
978 | &wq_yielded_threshold, 0, ""); | |
979 | ||
980 | SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, | |
981 | &wq_yielded_window_usecs, 0, ""); | |
982 | ||
983 | SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, | |
984 | &wq_stalled_window_usecs, 0, ""); | |
985 | ||
986 | SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, | |
987 | &wq_reduce_pool_window_usecs, 0, ""); | |
988 | ||
989 | SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, | |
990 | &wq_max_timer_interval_usecs, 0, ""); | |
991 | ||
992 | SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED, | |
993 | &wq_max_threads, 0, ""); | |
994 | ||
995 | SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED, | |
996 | &wq_max_constrained_threads, 0, ""); | |
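/*
 * All of the workqueue tunables above are runtime-adjustable via sysctl; as an
 * illustrative example, `sysctl kern.wq_max_threads` reports the thread-pool
 * ceiling and `sysctl -w kern.wq_stalled_window_usecs=...` adjusts the stall
 * window consulted by wq_thread_is_busy() below.
 */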
997 | ||
998 | #ifdef DEBUG | |
999 | SYSCTL_INT(_kern, OID_AUTO, wq_max_concurrency, CTLFLAG_RW | CTLFLAG_LOCKED, | |
1000 | &wq_max_concurrency, 0, ""); | |
1001 | ||
1002 | static int wq_kevent_test SYSCTL_HANDLER_ARGS; | |
1003 | SYSCTL_PROC(_debug, OID_AUTO, wq_kevent_test, CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE, NULL, 0, wq_kevent_test, 0, "-"); | |
1004 | #endif | |
1005 | ||
1006 | static uint32_t wq_init_constrained_limit = 1; | |
1007 | ||
1008 | #pragma mark workqueue lock | |
1009 | ||
1010 | void | |
1011 | _workqueue_init_lock(proc_t p) | |
1012 | { | |
1013 | lck_spin_init(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp, pthread_lck_attr); | |
1014 | *(pthread_kern->proc_get_wqinitingptr(p)) = FALSE; | |
1015 | } | |
1016 | ||
1017 | void | |
1018 | _workqueue_destroy_lock(proc_t p) | |
1019 | { | |
1020 | lck_spin_destroy(pthread_kern->proc_get_wqlockptr(p), pthread_lck_grp); | |
1021 | } | |
1022 | ||
1023 | ||
1024 | static void | |
1025 | workqueue_lock_spin(proc_t p) | |
1026 | { | |
1027 | lck_spin_lock(pthread_kern->proc_get_wqlockptr(p)); | |
1028 | } | |
1029 | ||
1030 | static void | |
1031 | workqueue_unlock(proc_t p) | |
1032 | { | |
1033 | lck_spin_unlock(pthread_kern->proc_get_wqlockptr(p)); | |
1034 | } | |
1035 | ||
1036 | #pragma mark workqueue add timer | |
1037 | ||
1038 | /** | |
1039 | * Sets up the timer which will call out to workqueue_add_timer | |
1040 | */ | |
1041 | static void | |
1042 | workqueue_interval_timer_start(struct workqueue *wq) | |
1043 | { | |
1044 | uint64_t deadline; | |
1045 | ||
1046 | /* n.b. wq_timer_interval is reset to 0 in workqueue_add_timer if the | |
1047 | ATIMER_RUNNING flag is not present. The net effect here is that if a | |
1048 | sequence of threads is required, we'll double the time before we give out | |
1049 | the next one. */ | |
1050 | if (wq->wq_timer_interval == 0) { | |
1051 | wq->wq_timer_interval = wq_stalled_window_usecs; | |
1052 | ||
1053 | } else { | |
1054 | wq->wq_timer_interval = wq->wq_timer_interval * 2; | |
1055 | ||
1056 | if (wq->wq_timer_interval > wq_max_timer_interval_usecs) { | |
1057 | wq->wq_timer_interval = wq_max_timer_interval_usecs; | |
1058 | } | |
1059 | } | |
1060 | clock_interval_to_deadline(wq->wq_timer_interval, 1000, &deadline); | |
1061 | ||
1062 | thread_call_enter_delayed(wq->wq_atimer_call, deadline); | |
1063 | ||
1064 | PTHREAD_TRACE(TRACE_wq_start_add_timer, wq, wq->wq_reqcount, wq->wq_flags, wq->wq_timer_interval, 0); | |
1065 | } | |
1066 | ||
1067 | /** | |
1068 | * returns whether lastblocked_tsp is within wq_stalled_window_usecs of cur_ts | |
1069 | */ | |
1070 | static boolean_t | |
1071 | wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp) | |
1072 | { | |
1073 | clock_sec_t secs; | |
1074 | clock_usec_t usecs; | |
1075 | uint64_t lastblocked_ts; | |
1076 | uint64_t elapsed; | |
1077 | ||
1078 | /* | |
1079 | * the timestamp is updated atomically w/o holding the workqueue lock | |
1080 | * so we need to do an atomic read of the 64 bits so that we don't see | |
1081 | * a mismatched pair of 32 bit reads... we accomplish this in an architecturally | |
1082 | * independent fashion by using OSCompareAndSwap64 to write back the | |
1083 | * value we grabbed... if it succeeds, then we have a good timestamp to | |
1084 | * evaluate... if it fails, we straddled grabbing the timestamp while it | |
1085 | * was being updated... treat a failed update as a busy thread since | |
1086 | * it implies we are about to see a really fresh timestamp anyway | |
1087 | */ | |
1088 | lastblocked_ts = *lastblocked_tsp; | |
1089 | ||
1090 | if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp)) | |
1091 | return (TRUE); | |
1092 | ||
1093 | if (lastblocked_ts >= cur_ts) { | |
1094 | /* | |
1095 | * because the update of the timestamp when a thread blocks isn't | |
1096 | * serialized against us looking at it (i.e. we don't hold the workq lock) | |
1097 | * it's possible to have a timestamp that matches the current time or | |
1098 | * that even looks to be in the future relative to when we grabbed the current | |
1099 | * time... just treat this as a busy thread since it must have just blocked. | |
1100 | */ | |
1101 | return (TRUE); | |
1102 | } | |
1103 | elapsed = cur_ts - lastblocked_ts; | |
1104 | ||
1105 | pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs); | |
1106 | ||
1107 | if (secs == 0 && usecs < wq_stalled_window_usecs) | |
1108 | return (TRUE); | |
1109 | return (FALSE); | |
1110 | } | |
1111 | ||
1112 | #define WQ_TIMER_NEEDED(wq, start_timer) do { \ | |
1113 | int oldflags = wq->wq_flags; \ | |
1114 | \ | |
1115 | if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) { \ | |
1116 | if (OSCompareAndSwap(oldflags, oldflags | WQ_ATIMER_RUNNING, (UInt32 *)&wq->wq_flags)) \ | |
1117 | start_timer = TRUE; \ | |
1118 | } \ | |
1119 | } while (0) | |
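/*
 * Typical usage, matching the callers below: the macro only sets
 * WQ_ATIMER_RUNNING and reports whether this caller won the race to do so;
 * the caller is then responsible for actually arming the timer:
 *
 *   boolean_t start_timer = FALSE;
 *   WQ_TIMER_NEEDED(wq, start_timer);
 *   ...
 *   if (start_timer == TRUE)
 *           workqueue_interval_timer_start(wq);
 */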
1120 | ||
1121 | /** | |
1122 | * handler function for the timer | |
1123 | */ | |
1124 | static void | |
1125 | workqueue_add_timer(struct workqueue *wq, __unused int param1) | |
1126 | { | |
1127 | proc_t p; | |
1128 | boolean_t start_timer = FALSE; | |
1129 | boolean_t retval; | |
1130 | ||
1131 | PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_START, wq, wq->wq_flags, wq->wq_nthreads, wq->wq_thidlecount, 0); | |
1132 | ||
1133 | p = wq->wq_proc; | |
1134 | ||
1135 | workqueue_lock_spin(p); | |
1136 | ||
1137 | /* | |
1138 | * because workqueue_callback now runs w/o taking the workqueue lock | |
1139 | * we are unsynchronized w/r to a change in state of the running threads... | |
1140 | * to make sure we always evaluate that change, we allow it to start up | |
1141 | * a new timer if the current one is actively evaluating the state | |
1142 | * however, we do not need more than 2 timers fired up (1 active and 1 pending) | |
1143 | * and we certainly do not want 2 active timers evaluating the state | |
1144 | * simultaneously... so use WQL_ATIMER_BUSY to serialize the timers... | |
1145 | * note that WQL_ATIMER_BUSY is in a different flag word from WQ_ATIMER_RUNNING since | |
1146 | * it is always protected by the workq lock... WQ_ATIMER_RUNNING is evaluated | |
1147 | * and set atomically since the callback function needs to manipulate it | |
1148 | * w/o holding the workq lock... | |
1149 | * | |
1150 | * !WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY == no pending timer, no active timer | |
1151 | * !WQ_ATIMER_RUNNING && WQL_ATIMER_BUSY == no pending timer, 1 active timer | |
1152 | * WQ_ATIMER_RUNNING && !WQL_ATIMER_BUSY == 1 pending timer, no active timer | |
1153 | * WQ_ATIMER_RUNNING && WQL_ATIMER_BUSY == 1 pending timer, 1 active timer | |
1154 | */ | |
1155 | while (wq->wq_lflags & WQL_ATIMER_BUSY) { | |
1156 | wq->wq_lflags |= WQL_ATIMER_WAITING; | |
1157 | ||
1158 | assert_wait((caddr_t)wq, (THREAD_UNINT)); | |
1159 | workqueue_unlock(p); | |
1160 | ||
1161 | thread_block(THREAD_CONTINUE_NULL); | |
1162 | ||
1163 | workqueue_lock_spin(p); | |
1164 | } | |
1165 | wq->wq_lflags |= WQL_ATIMER_BUSY; | |
1166 | ||
1167 | /* | |
1168 | * the workq lock will protect us from seeing WQ_EXITING change state, but we | |
1169 | * still need to update this atomically in case someone else tries to start | |
1170 | * the timer just as we're releasing it | |
1171 | */ | |
1172 | while ( !(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags & ~WQ_ATIMER_RUNNING), (UInt32 *)&wq->wq_flags))); | |
1173 | ||
1174 | again: | |
1175 | retval = TRUE; | |
1176 | if ( !(wq->wq_flags & WQ_EXITING)) { | |
1177 | boolean_t add_thread = FALSE; | |
1178 | /* | |
1179 | * check to see if the stall frequency was beyond our tolerance | |
1180 | * or we have work on the queue, but haven't scheduled any | |
1181 | * new work within our acceptable time interval because | |
1182 | * there were no idle threads left to schedule | |
1183 | */ | |
1184 | if (wq->wq_reqcount) { | |
1185 | uint32_t priclass = 0; | |
1186 | uint32_t thactive_count = 0; | |
1187 | uint64_t curtime = mach_absolute_time(); | |
1188 | uint64_t busycount = 0; | |
1189 | ||
1190 | if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] && | |
1191 | wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){ | |
1192 | priclass = WORKQUEUE_EVENT_MANAGER_BUCKET; | |
1193 | } else { | |
1194 | for (priclass = 0; priclass < WORKQUEUE_NUM_BUCKETS; priclass++) { | |
1195 | if (wq->wq_requests[priclass]) | |
1196 | break; | |
1197 | } | |
1198 | } | |
1199 | ||
1200 | if (priclass < WORKQUEUE_EVENT_MANAGER_BUCKET){ | |
1201 | /* | |
1202 | * Compute a metric for how many threads are active. We | |
1203 | * find the highest priority request outstanding and then add up | |
1204 | * the number of active threads in that and all higher-priority | |
1205 | * buckets. We'll also add any "busy" threads which are not | |
1206 | * active but blocked recently enough that we can't be sure | |
1207 | * they've gone idle yet. We'll then compare this metric to our | |
1208 | * max concurrency to decide whether to add a new thread. | |
1209 | */ | |
1210 | for (uint32_t i = 0; i <= priclass; i++) { | |
1211 | thactive_count += wq->wq_thactive_count[i]; | |
1212 | ||
1213 | // XXX why isn't this checking thscheduled_count < thactive_count ? | |
1214 | if (wq->wq_thscheduled_count[i]) { | |
1215 | if (wq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) | |
1216 | busycount++; | |
1217 | } | |
1218 | } | |
1219 | } | |
1220 | ||
1221 | if (thactive_count + busycount < wq->wq_max_concurrency || | |
1222 | priclass == WORKQUEUE_EVENT_MANAGER_BUCKET) { | |
1223 | ||
1224 | if (wq->wq_thidlecount == 0) { | |
1225 | /* | |
1226 | * if we have no idle threads, try to add one | |
1227 | */ | |
1228 | retval = workqueue_addnewthread(wq, priclass == WORKQUEUE_EVENT_MANAGER_BUCKET); | |
1229 | } | |
1230 | add_thread = TRUE; | |
1231 | } | |
1232 | ||
1233 | if (wq->wq_reqcount) { | |
1234 | /* | |
1235 | * as long as we have threads to schedule, and we successfully | |
1236 | * scheduled new work, keep trying | |
1237 | */ | |
1238 | while (wq->wq_thidlecount && !(wq->wq_flags & WQ_EXITING)) { | |
1239 | /* | |
1240 | * workqueue_run_nextreq is responsible for | |
1241 | * dropping the workqueue lock in all cases | |
1242 | */ | |
1243 | retval = workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_DEFAULT, 0); | |
1244 | workqueue_lock_spin(p); | |
1245 | ||
1246 | if (retval == FALSE) | |
1247 | break; | |
1248 | } | |
1249 | if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_reqcount) { | |
1250 | ||
1251 | if (wq->wq_thidlecount == 0 && retval == TRUE && add_thread == TRUE) | |
1252 | goto again; | |
1253 | ||
1254 | if (wq->wq_thidlecount == 0 || busycount) | |
1255 | WQ_TIMER_NEEDED(wq, start_timer); | |
1256 | ||
1257 | PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_NONE, wq, wq->wq_reqcount, wq->wq_thidlecount, busycount, 0); | |
1258 | } | |
1259 | } | |
1260 | } | |
1261 | } | |
1262 | ||
1263 | /* | |
1264 | * If we called WQ_TIMER_NEEDED above, then this flag will be set if that | |
1265 | * call marked the timer running. If so, we let the timer interval grow. | |
1266 | * Otherwise, we reset it back to 0. | |
1267 | */ | |
1268 | if ( !(wq->wq_flags & WQ_ATIMER_RUNNING)) | |
1269 | wq->wq_timer_interval = 0; | |
1270 | ||
1271 | wq->wq_lflags &= ~WQL_ATIMER_BUSY; | |
1272 | ||
1273 | if ((wq->wq_flags & WQ_EXITING) || (wq->wq_lflags & WQL_ATIMER_WAITING)) { | |
1274 | /* | |
1275 | * wakeup the thread hung up in workqueue_exit or workqueue_add_timer waiting for this timer | |
1276 | * to finish getting out of the way | |
1277 | */ | |
1278 | wq->wq_lflags &= ~WQL_ATIMER_WAITING; | |
1279 | wakeup(wq); | |
1280 | } | |
1281 | ||
1282 | PTHREAD_TRACE(TRACE_wq_add_timer | DBG_FUNC_END, wq, start_timer, wq->wq_nthreads, wq->wq_thidlecount, 0); | |
1283 | ||
1284 | workqueue_unlock(p); | |
1285 | ||
1286 | if (start_timer == TRUE) | |
1287 | workqueue_interval_timer_start(wq); | |
1288 | } | |
1289 | ||
1290 | #pragma mark thread state tracking | |
1291 | ||
1292 | // called by spinlock code when trying to yield to lock owner | |
1293 | void | |
1294 | _workqueue_thread_yielded(void) | |
1295 | { | |
1296 | struct workqueue *wq; | |
1297 | proc_t p; | |
1298 | ||
1299 | p = current_proc(); | |
1300 | ||
1301 | if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL || wq->wq_reqcount == 0) | |
1302 | return; | |
1303 | ||
1304 | workqueue_lock_spin(p); | |
1305 | ||
1306 | if (wq->wq_reqcount) { | |
1307 | uint64_t curtime; | |
1308 | uint64_t elapsed; | |
1309 | clock_sec_t secs; | |
1310 | clock_usec_t usecs; | |
1311 | ||
1312 | if (wq->wq_thread_yielded_count++ == 0) | |
1313 | wq->wq_thread_yielded_timestamp = mach_absolute_time(); | |
1314 | ||
1315 | if (wq->wq_thread_yielded_count < wq_yielded_threshold) { | |
1316 | workqueue_unlock(p); | |
1317 | return; | |
1318 | } | |
1319 | ||
1320 | PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_START, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 0, 0); | |
1321 | ||
1322 | wq->wq_thread_yielded_count = 0; | |
1323 | ||
1324 | curtime = mach_absolute_time(); | |
1325 | elapsed = curtime - wq->wq_thread_yielded_timestamp; | |
1326 | pthread_kern->absolutetime_to_microtime(elapsed, &secs, &usecs); | |
1327 | ||
1328 | if (secs == 0 && usecs < wq_yielded_window_usecs) { | |
1329 | ||
1330 | if (wq->wq_thidlecount == 0) { | |
1331 | workqueue_addnewthread(wq, TRUE); | |
1332 | /* | |
1333 | * 'workqueue_addnewthread' drops the workqueue lock | |
1334 | * when creating the new thread and then retakes it before | |
1335 | * returning... this window allows other threads to process | |
1336 | * requests, so we need to recheck for available work; | |
1337 | * if none is found, we just return... the newly created thread | |
1338 | * will eventually get used (if it hasn't already)... | |
1339 | */ | |
1340 | if (wq->wq_reqcount == 0) { | |
1341 | workqueue_unlock(p); | |
1342 | return; | |
1343 | } | |
1344 | } | |
1345 | if (wq->wq_thidlecount) { | |
1346 | (void)workqueue_run_nextreq(p, wq, THREAD_NULL, RUN_NEXTREQ_UNCONSTRAINED, 0); | |
1347 | /* | |
1348 | * workqueue_run_nextreq is responsible for | |
1349 | * dropping the workqueue lock in all cases | |
1350 | */ | |
1351 | PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 1, 0); | |
1352 | ||
1353 | return; | |
1354 | } | |
1355 | } | |
1356 | PTHREAD_TRACE(TRACE_wq_thread_yielded | DBG_FUNC_END, wq, wq->wq_thread_yielded_count, wq->wq_reqcount, 2, 0); | |
1357 | } | |
1358 | workqueue_unlock(p); | |
1359 | } | |
1360 | ||
1361 | ||
1362 | ||
1363 | static void | |
1364 | workqueue_callback(int type, thread_t thread) | |
1365 | { | |
1366 | struct uthread *uth; | |
1367 | struct threadlist *tl; | |
1368 | struct workqueue *wq; | |
1369 | ||
1370 | uth = pthread_kern->get_bsdthread_info(thread); | |
1371 | tl = pthread_kern->uthread_get_threadlist(uth); | |
1372 | wq = tl->th_workq; | |
1373 | ||
1374 | switch (type) { | |
1375 | case SCHED_CALL_BLOCK: { | |
1376 | uint32_t old_activecount; | |
1377 | boolean_t start_timer = FALSE; | |
1378 | ||
1379 | old_activecount = OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]); | |
1380 | ||
1381 | /* | |
1382 | * If we blocked and were at the requested concurrency previously, we may | |
1383 | * need to spin up a new thread. Of course, if it's the event manager | |
1384 | * then that's moot, so ignore that case. | |
1385 | */ | |
1386 | if (old_activecount == wq->wq_reqconc[tl->th_priority] && | |
1387 | tl->th_priority != WORKQUEUE_EVENT_MANAGER_BUCKET) { | |
1388 | uint64_t curtime; | |
1389 | UInt64 *lastblocked_ptr; | |
1390 | ||
1391 | /* | |
1392 | * the number of active threads at this priority | |
1393 | * has fallen below the maximum number of concurrent | |
1394 | * threads that we're allowed to run | |
1395 | */ | |
1396 | lastblocked_ptr = (UInt64 *)&wq->wq_lastblocked_ts[tl->th_priority]; | |
1397 | curtime = mach_absolute_time(); | |
1398 | ||
1399 | /* | |
1400 | * if we collide with another thread trying to update the last_blocked (really unlikely | |
1401 | * since another thread would have to get scheduled and then block after we start down | |
1402 | * this path), it's not a problem. Either timestamp is adequate, so no need to retry | |
1403 | */ | |
1404 | ||
1405 | OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr); | |
1406 | ||
1407 | if (wq->wq_reqcount) { | |
1408 | /* | |
1409 | * we have work to do so start up the timer | |
1410 | * if it's not running... we'll let it sort | |
1411 | * out whether we really need to start up | |
1412 | * another thread | |
1413 | */ | |
1414 | WQ_TIMER_NEEDED(wq, start_timer); | |
1415 | } | |
1416 | ||
1417 | if (start_timer == TRUE) { | |
1418 | workqueue_interval_timer_start(wq); | |
1419 | } | |
1420 | } | |
1421 | PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_START, wq, old_activecount, tl->th_priority, start_timer, thread_tid(thread)); | |
1422 | break; | |
1423 | } | |
1424 | case SCHED_CALL_UNBLOCK: | |
1425 | /* | |
1426 | * we cannot take the workqueue_lock here... | |
1427 | * an UNBLOCK can occur from a timer event which | |
1428 | * is run from an interrupt context... if the workqueue_lock | |
1429 | * is already held by this processor, we'll deadlock... | |
1430 | * the thread lock for the thread being UNBLOCKED | |
1431 | * is also held | |
1432 | */ | |
1433 | OSAddAtomic(1, &wq->wq_thactive_count[tl->th_priority]); | |
1434 | ||
1435 | PTHREAD_TRACE1(TRACE_wq_thread_block | DBG_FUNC_END, wq, wq->wq_threads_scheduled, tl->th_priority, 0, thread_tid(thread)); | |
1436 | ||
1437 | break; | |
1438 | } | |
1439 | } | |
1440 | ||
1441 | sched_call_t | |
1442 | _workqueue_get_sched_callback(void) | |
1443 | { | |
1444 | return workqueue_callback; | |
1445 | } | |
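 | /* | |
 |  * Illustrative sketch (not part of this file's control flow): the callback | |
 |  * returned above is the same one the continuations below attach with | |
 |  * | |
 |  *	pthread_kern->thread_sched_call(th, workqueue_callback); | |
 |  * | |
 |  * (see wq_unpark_continue() and wq_unsuspend_continue()); exporting it here | |
 |  * presumably lets code outside this file wire up the same BLOCK/UNBLOCK | |
 |  * accounting for the per-bucket wq_thactive_count. | |
 |  */ | |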
1446 | ||
1447 | #pragma mark thread addition/removal | |
1448 | ||
1449 | /** | |
1450 | * pop goes the thread | |
1451 | */ | |
1452 | static void | |
1453 | workqueue_removethread(struct threadlist *tl, int fromexit) | |
1454 | { | |
1455 | struct workqueue *wq; | |
1456 | struct uthread * uth; | |
1457 | ||
1458 | /* | |
1459 | * If fromexit is set, the call is from workqueue_exit(), | |
1460 | * so some cleanups are to be avoided. | |
1461 | */ | |
1462 | wq = tl->th_workq; | |
1463 | ||
1464 | TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry); | |
1465 | ||
1466 | if (fromexit == 0) { | |
1467 | wq->wq_nthreads--; | |
1468 | wq->wq_thidlecount--; | |
1469 | } | |
1470 | ||
1471 | /* | |
1472 | * Clear the threadlist pointer in the uthread so that a | |
1473 | * thread blocked and then woken up for termination will | |
1474 | * not access the threadlist, as it is about to be | |
1475 | * freed. | |
1476 | */ | |
1477 | pthread_kern->thread_sched_call(tl->th_thread, NULL); | |
1478 | ||
1479 | uth = pthread_kern->get_bsdthread_info(tl->th_thread); | |
1480 | if (uth != (struct uthread *)0) { | |
1481 | pthread_kern->uthread_set_threadlist(uth, NULL); | |
1482 | } | |
1483 | if (fromexit == 0) { | |
1484 | /* during exit the lock is not held */ | |
1485 | workqueue_unlock(wq->wq_proc); | |
1486 | } | |
1487 | ||
1488 | if ( (tl->th_flags & TH_LIST_SUSPENDED) ) { | |
1489 | /* | |
1490 | * thread was created, but never used... | |
1491 | * need to clean up the stack and port ourselves | |
1492 | * since we're not going to spin up through the | |
1493 | * normal exit path triggered from Libc | |
1494 | */ | |
1495 | if (fromexit == 0) { | |
1496 | /* vm map is already deallocated when this is called from exit */ | |
1497 | (void)mach_vm_deallocate(wq->wq_map, tl->th_stackaddr, tl->th_allocsize); | |
1498 | } | |
1499 | (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(wq->wq_task), tl->th_thport); | |
1500 | ||
1501 | PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread)); | |
1502 | } else { | |
1503 | ||
1504 | PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_END, wq, (uintptr_t)thread_tid(current_thread()), wq->wq_nthreads, 0xdead, thread_tid(tl->th_thread)); | |
1505 | } | |
1506 | /* | |
1507 | * drop our ref on the thread | |
1508 | */ | |
1509 | thread_deallocate(tl->th_thread); | |
1510 | ||
1511 | kfree(tl, sizeof(struct threadlist)); | |
1512 | } | |
1513 | ||
1514 | ||
1515 | /** | |
1516 | * Try to add a new workqueue thread. | |
1517 | * | |
1518 | * - called with workq lock held | |
1519 | * - dropped and retaken around thread creation | |
1520 | * - return with workq lock held | |
1521 | */ | |
1522 | static boolean_t | |
1523 | workqueue_addnewthread(struct workqueue *wq, boolean_t ignore_constrained_thread_limit) | |
1524 | { | |
1525 | struct threadlist *tl; | |
1526 | struct uthread *uth; | |
1527 | kern_return_t kret; | |
1528 | thread_t th; | |
1529 | proc_t p; | |
1530 | void *sright; | |
1531 | mach_vm_offset_t stackaddr; | |
1532 | ||
1533 | if ((wq->wq_flags & WQ_EXITING) == WQ_EXITING) { | |
1534 | PTHREAD_TRACE(TRACE_wq_thread_add_during_exit | DBG_FUNC_NONE, wq, 0, 0, 0, 0); | |
1535 | return (FALSE); | |
1536 | } | |
1537 | ||
1538 | if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (pthread_kern->config_thread_max - 20)) { | |
1539 | wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT; | |
1540 | ||
1541 | PTHREAD_TRACE(TRACE_wq_thread_limit_exceeded | DBG_FUNC_NONE, wq, wq->wq_nthreads, wq_max_threads, | |
1542 | pthread_kern->config_thread_max - 20, 0); | |
1543 | return (FALSE); | |
1544 | } | |
1545 | wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; | |
1546 | ||
1547 | if (ignore_constrained_thread_limit == FALSE && | |
1548 | wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { | |
1549 | /* | |
1550 | * If we're not creating this thread to service an overcommit or | |
1551 | * event manager request, then we check to see if we are over our | |
1552 | * constrained thread limit, in which case we error out. | |
1553 | */ | |
1554 | wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; | |
1555 | ||
1556 | PTHREAD_TRACE(TRACE_wq_thread_constrained_maxed | DBG_FUNC_NONE, wq, wq->wq_constrained_threads_scheduled, | |
1557 | wq_max_constrained_threads, 0, 0); | |
1558 | return (FALSE); | |
1559 | } | |
1560 | if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) | |
1561 | wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; | |
1562 | ||
1563 | wq->wq_nthreads++; | |
1564 | ||
1565 | p = wq->wq_proc; | |
1566 | workqueue_unlock(p); | |
1567 | ||
1568 | kret = pthread_kern->thread_create_workq(wq->wq_task, (thread_continue_t)wq_unsuspend_continue, &th); | |
1569 | if (kret != KERN_SUCCESS) { | |
1570 | PTHREAD_TRACE(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 0, 0, 0); | |
1571 | goto failed; | |
1572 | } | |
1573 | ||
1574 | tl = kalloc(sizeof(struct threadlist)); | |
1575 | bzero(tl, sizeof(struct threadlist)); | |
1576 | ||
1577 | stackaddr = stackaddr_hint(p); | |
1578 | ||
1579 | mach_vm_size_t guardsize = vm_map_page_size(wq->wq_map); | |
1580 | mach_vm_size_t pthread_size = | |
1581 | vm_map_round_page_mask(pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET, vm_map_page_mask(wq->wq_map)); | |
1582 | tl->th_allocsize = guardsize + PTH_DEFAULT_STACKSIZE + pthread_size; | |
1583 | ||
1584 | kret = mach_vm_map(wq->wq_map, &stackaddr, | |
1585 | tl->th_allocsize, | |
1586 | page_size-1, | |
1587 | VM_MAKE_TAG(VM_MEMORY_STACK)| VM_FLAGS_ANYWHERE , NULL, | |
1588 | 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL, | |
1589 | VM_INHERIT_DEFAULT); | |
1590 | ||
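 | /* | |
 |  * If the hinted, mask-aligned mapping above fails, fall back to a plain | |
 |  * mach_vm_allocate() of the same size anywhere in the map before giving up. | |
 |  */ | |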
1591 | if (kret != KERN_SUCCESS) { | |
1592 | PTHREAD_TRACE(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 1, 0, 0); | |
1593 | ||
1594 | kret = mach_vm_allocate(wq->wq_map, | |
1595 | &stackaddr, tl->th_allocsize, | |
1596 | VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE); | |
1597 | } | |
1598 | if (kret == KERN_SUCCESS) { | |
1599 | /* | |
1600 | * The guard page is at the lowest address | |
1601 | * The stack base is the highest address | |
1602 | */ | |
1603 | kret = mach_vm_protect(wq->wq_map, stackaddr, guardsize, FALSE, VM_PROT_NONE); | |
1604 | ||
1605 | if (kret != KERN_SUCCESS) { | |
1606 | (void) mach_vm_deallocate(wq->wq_map, stackaddr, tl->th_allocsize); | |
1607 | PTHREAD_TRACE(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq, kret, 2, 0, 0); | |
1608 | } | |
1609 | } | |
1610 | if (kret != KERN_SUCCESS) { | |
1611 | (void) thread_terminate(th); | |
1612 | thread_deallocate(th); | |
1613 | ||
1614 | kfree(tl, sizeof(struct threadlist)); | |
1615 | goto failed; | |
1616 | } | |
1617 | thread_reference(th); | |
1618 | ||
1619 | sright = (void *)pthread_kern->convert_thread_to_port(th); | |
1620 | tl->th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(wq->wq_task)); | |
1621 | ||
1622 | pthread_kern->thread_static_param(th, TRUE); | |
1623 | ||
1624 | tl->th_flags = TH_LIST_INITED | TH_LIST_SUSPENDED; | |
1625 | ||
1626 | tl->th_thread = th; | |
1627 | tl->th_workq = wq; | |
1628 | tl->th_stackaddr = stackaddr; | |
1629 | tl->th_priority = WORKQUEUE_NUM_BUCKETS; | |
1630 | tl->th_policy = -1; | |
1631 | ||
1632 | uth = pthread_kern->get_bsdthread_info(tl->th_thread); | |
1633 | ||
1634 | workqueue_lock_spin(p); | |
1635 | ||
1636 | pthread_kern->uthread_set_threadlist(uth, tl); | |
1637 | TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry); | |
1638 | ||
1639 | wq->wq_thidlecount++; | |
1640 | ||
1641 | PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_START, wq, wq->wq_nthreads, 0, thread_tid(current_thread()), thread_tid(tl->th_thread)); | |
1642 | ||
1643 | return (TRUE); | |
1644 | ||
1645 | failed: | |
1646 | workqueue_lock_spin(p); | |
1647 | wq->wq_nthreads--; | |
1648 | ||
1649 | return (FALSE); | |
1650 | } | |
1651 | ||
1652 | /** | |
1653 | * Setup per-process state for the workqueue. | |
1654 | */ | |
1655 | int | |
1656 | _workq_open(struct proc *p, __unused int32_t *retval) | |
1657 | { | |
1658 | struct workqueue * wq; | |
1659 | int wq_size; | |
1660 | char * ptr; | |
1661 | uint32_t i; | |
1662 | uint32_t num_cpus; | |
1663 | int error = 0; | |
1664 | boolean_t need_wakeup = FALSE; | |
1665 | ||
1666 | if (pthread_kern->proc_get_register(p) == 0) { | |
1667 | return EINVAL; | |
1668 | } | |
1669 | ||
1670 | num_cpus = pthread_kern->ml_get_max_cpus(); | |
1671 | ||
1672 | if (wq_init_constrained_limit) { | |
1673 | uint32_t limit; | |
1674 | /* | |
1675 | * set up the limit for the constrained pool | |
1676 | * this is a virtual pool in that we don't | |
1677 | * maintain it on a separate idle and run list | |
1678 | */ | |
1679 | limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR; | |
1680 | ||
1681 | if (limit > wq_max_constrained_threads) | |
1682 | wq_max_constrained_threads = limit; | |
1683 | ||
1684 | wq_init_constrained_limit = 0; | |
1685 | } | |
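 | /* | |
 |  * Illustrative example (factor value assumed): with 8 logical CPUs and a | |
 |  * WORKQUEUE_CONSTRAINED_FACTOR of 5, the computed limit is 40; note it only | |
 |  * ever raises wq_max_constrained_threads, never lowers a larger value that | |
 |  * is already configured. | |
 |  */ | |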
1686 | workqueue_lock_spin(p); | |
1687 | ||
1688 | if (pthread_kern->proc_get_wqptr(p) == NULL) { | |
1689 | ||
1690 | while (*pthread_kern->proc_get_wqinitingptr(p) == TRUE) { | |
1691 | ||
1692 | assert_wait((caddr_t)pthread_kern->proc_get_wqinitingptr(p), THREAD_UNINT); | |
1693 | workqueue_unlock(p); | |
1694 | ||
1695 | thread_block(THREAD_CONTINUE_NULL); | |
1696 | ||
1697 | workqueue_lock_spin(p); | |
1698 | } | |
1699 | if (pthread_kern->proc_get_wqptr(p) != NULL) { | |
1700 | goto out; | |
1701 | } | |
1702 | ||
1703 | *(pthread_kern->proc_get_wqinitingptr(p)) = TRUE; | |
1704 | ||
1705 | workqueue_unlock(p); | |
1706 | ||
1707 | wq_size = sizeof(struct workqueue); | |
1708 | ||
1709 | ptr = (char *)kalloc(wq_size); | |
1710 | bzero(ptr, wq_size); | |
1711 | ||
1712 | wq = (struct workqueue *)ptr; | |
1713 | wq->wq_flags = WQ_LIST_INITED; | |
1714 | wq->wq_proc = p; | |
1715 | wq->wq_max_concurrency = wq_max_concurrency; | |
1716 | wq->wq_task = current_task(); | |
1717 | wq->wq_map = pthread_kern->current_map(); | |
1718 | ||
1719 | for (i = 0; i < WORKQUEUE_NUM_BUCKETS; i++) | |
1720 | wq->wq_reqconc[i] = (uint16_t)wq->wq_max_concurrency; | |
1721 | ||
1722 | // The event manager bucket is special, so it gets a concurrency of 1 | |
1723 | // though we shouldn't ever read this value for that bucket | |
1724 | wq->wq_reqconc[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1; | |
1725 | ||
1726 | // Always start the event manager at BACKGROUND | |
1727 | wq->wq_event_manager_priority = (uint32_t)pthread_qos_class_get_priority(THREAD_QOS_BACKGROUND) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; | |
1728 | ||
1729 | TAILQ_INIT(&wq->wq_thrunlist); | |
1730 | TAILQ_INIT(&wq->wq_thidlelist); | |
1731 | ||
1732 | wq->wq_atimer_call = thread_call_allocate((thread_call_func_t)workqueue_add_timer, (thread_call_param_t)wq); | |
1733 | ||
1734 | workqueue_lock_spin(p); | |
1735 | ||
1736 | pthread_kern->proc_set_wqptr(p, wq); | |
1737 | pthread_kern->proc_set_wqsize(p, wq_size); | |
1738 | ||
1739 | *(pthread_kern->proc_get_wqinitingptr(p)) = FALSE; | |
1740 | need_wakeup = TRUE; | |
1741 | } | |
1742 | out: | |
1743 | workqueue_unlock(p); | |
1744 | ||
1745 | if (need_wakeup == TRUE) { | |
1746 | wakeup(pthread_kern->proc_get_wqinitingptr(p)); | |
1747 | } | |
1748 | return(error); | |
1749 | } | |
1750 | ||
1751 | /* | |
1752 | * Routine: workqueue_mark_exiting | |
1753 | * | |
1754 | * Function: Mark the work queue such that new threads will not be added to the | |
1755 | * work queue after we return. | |
1756 | * | |
1757 | * Conditions: Called against the current process. | |
1758 | */ | |
1759 | void | |
1760 | _workqueue_mark_exiting(struct proc *p) | |
1761 | { | |
1762 | struct workqueue *wq = pthread_kern->proc_get_wqptr(p); | |
1763 | ||
1764 | if (wq != NULL) { | |
1765 | ||
1766 | PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0); | |
1767 | ||
1768 | workqueue_lock_spin(p); | |
1769 | ||
1770 | /* | |
1771 | * we now arm the timer in the callback function w/o holding the workq lock... | |
1772 | * we do this by setting WQ_ATIMER_RUNNING via OSCompareAndSwap in order to | |
1773 | * ensure only a single timer is running and to notice that WQ_EXITING has | |
1774 | * been set (we don't want to start a timer once WQ_EXITING is posted) | |
1775 | * | |
1776 | * so once we have successfully set WQ_EXITING, we cannot fire up a new timer... | |
1777 | * therefore there is no need to clear the timer state atomically from the flags | |
1778 | * | |
1779 | * since we always hold the workq lock when dropping WQ_ATIMER_RUNNING, | |
1780 | * the check for it (and the sleep until it clears) is protected | |
1781 | */ | |
1782 | while (!(OSCompareAndSwap(wq->wq_flags, (wq->wq_flags | WQ_EXITING), (UInt32 *)&wq->wq_flags))); | |
1783 | ||
1784 | if (wq->wq_flags & WQ_ATIMER_RUNNING) { | |
1785 | if (thread_call_cancel(wq->wq_atimer_call) == TRUE) { | |
1786 | wq->wq_flags &= ~WQ_ATIMER_RUNNING; | |
1787 | } | |
1788 | } | |
1789 | while ((wq->wq_flags & WQ_ATIMER_RUNNING) || (wq->wq_lflags & WQL_ATIMER_BUSY)) { | |
1790 | assert_wait((caddr_t)wq, (THREAD_UNINT)); | |
1791 | workqueue_unlock(p); | |
1792 | ||
1793 | thread_block(THREAD_CONTINUE_NULL); | |
1794 | ||
1795 | workqueue_lock_spin(p); | |
1796 | } | |
1797 | workqueue_unlock(p); | |
1798 | ||
1799 | PTHREAD_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0); | |
1800 | } | |
1801 | } | |
1802 | ||
1803 | /* | |
1804 | * Routine: workqueue_exit | |
1805 | * | |
1806 | * Function: clean up the work queue structure(s) now that there are no threads | |
1807 | * left running inside the work queue (except possibly current_thread). | |
1808 | * | |
1809 | * Conditions: Called by the last thread in the process. | |
1810 | * Called against current process. | |
1811 | */ | |
1812 | void | |
1813 | _workqueue_exit(struct proc *p) | |
1814 | { | |
1815 | struct workqueue * wq; | |
1816 | struct threadlist * tl, *tlist; | |
1817 | struct uthread *uth; | |
1818 | int wq_size = 0; | |
1819 | ||
1820 | wq = pthread_kern->proc_get_wqptr(p); | |
1821 | if (wq != NULL) { | |
1822 | ||
1823 | PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0); | |
1824 | ||
1825 | wq_size = pthread_kern->proc_get_wqsize(p); | |
1826 | pthread_kern->proc_set_wqptr(p, NULL); | |
1827 | pthread_kern->proc_set_wqsize(p, 0); | |
1828 | ||
1829 | /* | |
1830 | * Clean up workqueue data structures for threads that exited and | |
1831 | * didn't get a chance to clean up after themselves. | |
1832 | */ | |
1833 | TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) { | |
1834 | pthread_kern->thread_sched_call(tl->th_thread, NULL); | |
1835 | ||
1836 | uth = pthread_kern->get_bsdthread_info(tl->th_thread); | |
1837 | if (uth != (struct uthread *)0) { | |
1838 | pthread_kern->uthread_set_threadlist(uth, NULL); | |
1839 | } | |
1840 | TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry); | |
1841 | ||
1842 | /* | |
1843 | * drop our last ref on the thread | |
1844 | */ | |
1845 | thread_deallocate(tl->th_thread); | |
1846 | ||
1847 | kfree(tl, sizeof(struct threadlist)); | |
1848 | } | |
1849 | TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) { | |
1850 | workqueue_removethread(tl, 1); | |
1851 | } | |
1852 | thread_call_free(wq->wq_atimer_call); | |
1853 | ||
1854 | kfree(wq, wq_size); | |
1855 | ||
1856 | PTHREAD_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0); | |
1857 | } | |
1858 | } | |
1859 | ||
1860 | ||
1861 | #pragma mark workqueue thread manipulation | |
1862 | ||
1863 | /** | |
1864 | * Entry point for libdispatch to ask for threads | |
1865 | */ | |
1866 | static int wqops_queue_reqthreads(struct proc *p, int reqcount, pthread_priority_t priority){ | |
1867 | struct workqueue *wq; | |
1868 | ||
1869 | boolean_t overcommit = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0; | |
1870 | int class = pthread_priority_get_class_index(priority); | |
1871 | ||
1872 | boolean_t event_manager = (_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0; | |
1873 | if (event_manager){ | |
1874 | class = WORKQUEUE_EVENT_MANAGER_BUCKET; | |
1875 | } | |
1876 | ||
1877 | if ((reqcount <= 0) || (class < 0) || (class >= WORKQUEUE_NUM_BUCKETS) || (overcommit && event_manager)) { | |
1878 | return EINVAL; | |
1879 | } | |
1880 | ||
1881 | workqueue_lock_spin(p); | |
1882 | ||
1883 | if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) { | |
1884 | workqueue_unlock(p); | |
1885 | ||
1886 | return EINVAL; | |
1887 | } | |
1888 | ||
1889 | if (overcommit == 0 && event_manager == 0) { | |
1890 | wq->wq_reqcount += reqcount; | |
1891 | wq->wq_requests[class] += reqcount; | |
1892 | ||
1893 | PTHREAD_TRACE(TRACE_wq_req_threads | DBG_FUNC_NONE, wq, priority, wq->wq_requests[class], reqcount, 0); | |
1894 | ||
1895 | while (wq->wq_reqcount) { | |
1896 | if (!workqueue_run_one(p, wq, overcommit, 0)) | |
1897 | break; | |
1898 | } | |
1899 | } else if (overcommit){ | |
1900 | PTHREAD_TRACE(TRACE_wq_req_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0); | |
1901 | ||
1902 | while (reqcount) { | |
1903 | if (!workqueue_run_one(p, wq, overcommit, priority)) | |
1904 | break; | |
1905 | reqcount--; | |
1906 | } | |
1907 | if (reqcount) { | |
1908 | /* | |
1909 | * we need to delay starting some of the overcommit requests... | |
1910 | * we should only fail to create the overcommit threads if | |
1911 | * we're at the max thread limit... as existing threads | |
1912 | * return to the kernel, we'll notice the ocrequests | |
1913 | * and spin them back to user space as the overcommit variety | |
1914 | */ | |
1915 | wq->wq_reqcount += reqcount; | |
1916 | wq->wq_requests[class] += reqcount; | |
1917 | wq->wq_ocrequests[class] += reqcount; | |
1918 | ||
1919 | PTHREAD_TRACE(TRACE_wq_delay_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_ocrequests[class], reqcount, 0); | |
1920 | ||
1921 | /* if we delayed this thread coming up but we're not constrained | |
1922 | * or at max threads then we need to start the timer so we don't | |
1923 | * risk dropping this request on the floor. | |
1924 | */ | |
1925 | if ((wq->wq_lflags & (WQL_EXCEEDED_TOTAL_THREAD_LIMIT | WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT)) == 0) { | |
1926 | boolean_t start_timer = FALSE; | |
1927 | WQ_TIMER_NEEDED(wq, start_timer); | |
1928 | ||
1929 | if (start_timer) { | |
1930 | workqueue_interval_timer_start(wq); | |
1931 | } | |
1932 | } | |
1933 | } | |
1934 | } else if (event_manager) { | |
1935 | PTHREAD_TRACE(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, wq->wq_event_manager_priority, wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET], 0); | |
1936 | ||
1937 | if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){ | |
1938 | wq->wq_reqcount += 1; | |
1939 | wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1; | |
1940 | } | |
1941 | ||
1942 | // We've recorded the request for an event manager thread above. We'll | |
1943 | // let the timer pick it up as we would for a kernel callout. We can | |
1944 | // do a direct add/wakeup when that support is added for the kevent path. | |
1945 | boolean_t start_timer = FALSE; | |
1946 | if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) | |
1947 | WQ_TIMER_NEEDED(wq, start_timer); | |
1948 | if (start_timer == TRUE) | |
1949 | workqueue_interval_timer_start(wq); | |
1950 | } | |
1951 | workqueue_unlock(p); | |
1952 | ||
1953 | return 0; | |
1954 | } | |
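 | /* | |
 |  * Summary of the three request flavors handled above (descriptive only): | |
 |  *   constrained   - bump wq_reqcount/wq_requests[class] and run threads | |
 |  *                   immediately while requests remain | |
 |  *   overcommit    - run what we can now; anything left is recorded as a | |
 |  *                   delayed ocrequest, picked up by the add timer or by | |
 |  *                   threads returning to the kernel | |
 |  *   event manager - record at most one pending request and let the timer | |
 |  *                   bring the manager thread up | |
 |  */ | |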
1955 | ||
1956 | /* Used by the kevent system to request threads. Currently count is ignored | |
1957 | * and we always return one thread per invocation. | |
1958 | */ | |
1959 | thread_t _workq_reqthreads(struct proc *p, int requests_count, workq_reqthreads_req_t requests){ | |
1960 | boolean_t start_timer = FALSE; | |
1961 | assert(requests_count > 0); | |
1962 | ||
1963 | #if DEBUG | |
1964 | // Make sure that the requests array is sorted, highest priority first | |
1965 | if (requests_count > 1){ | |
1966 | __assert_only qos_class_t priority = _pthread_priority_get_qos_newest(requests[0].priority); | |
1967 | __assert_only unsigned long flags = ((_pthread_priority_get_flags(requests[0].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0); | |
1968 | for (int i = 1; i < requests_count; i++){ | |
1969 | if (requests[i].count == 0) continue; | |
1970 | __assert_only qos_class_t next_priority = _pthread_priority_get_qos_newest(requests[i].priority); | |
1971 | __assert_only unsigned long next_flags = ((_pthread_priority_get_flags(requests[i].priority) & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG|_PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) != 0); | |
1972 | if (next_flags != flags){ | |
1973 | flags = next_flags; | |
1974 | priority = next_priority; | |
1975 | } else { | |
1976 | assert(next_priority <= priority); | |
1977 | } | |
1978 | } | |
1979 | } | |
1980 | #endif // DEBUG | |
1981 | ||
1982 | int error = 0; | |
1983 | struct workqueue *wq; | |
1984 | ||
1985 | workqueue_lock_spin(p); | |
1986 | ||
1987 | if ((wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p)) == NULL) { | |
1988 | error = EINVAL; | |
1989 | goto done; | |
1990 | } | |
1991 | ||
1992 | PTHREAD_TRACE(TRACE_wq_kevent_req_threads | DBG_FUNC_START, wq, requests_count, 0, 0, 0); | |
1993 | ||
1994 | // Look for overcommit or event-manager-only requests. | |
1995 | boolean_t have_overcommit = FALSE; | |
1996 | pthread_priority_t priority = 0; | |
1997 | for (int i = 0; i < requests_count; i++){ | |
1998 | if (requests[i].count == 0) | |
1999 | continue; | |
2000 | priority = requests[i].priority; | |
2001 | if (_pthread_priority_get_qos_newest(priority) == QOS_CLASS_UNSPECIFIED){ | |
2002 | priority |= _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; | |
2003 | } | |
2004 | if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) != 0){ | |
2005 | goto event_manager; | |
2006 | } | |
2007 | if ((_pthread_priority_get_flags(priority) & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) != 0){ | |
2008 | have_overcommit = TRUE; | |
2009 | break; | |
2010 | } | |
2011 | } | |
2012 | ||
2013 | if (have_overcommit){ | |
2014 | // I can't make this call, since it's not safe from some contexts yet, | |
2015 | // so just set up a delayed overcommit and let the timer do the work | |
2016 | //boolean_t success = workqueue_run_one(p, wq, TRUE, priority); | |
2017 | if (/* !success */ TRUE){ | |
2018 | int class = pthread_priority_get_class_index(priority); | |
2019 | wq->wq_reqcount += 1; | |
2020 | wq->wq_requests[class] += 1; | |
2021 | wq->wq_kevent_ocrequests[class] += 1; | |
2022 | ||
2023 | PTHREAD_TRACE(TRACE_wq_req_kevent_octhreads | DBG_FUNC_NONE, wq, priority, wq->wq_kevent_ocrequests[class], 1, 0); | |
2024 | ||
2025 | WQ_TIMER_NEEDED(wq, start_timer); | |
2026 | } | |
2027 | goto done; | |
2028 | } | |
2029 | ||
2030 | // Having no overcommit requests, try to find any request that can start | |
2031 | // There's no TOCTTOU since we hold the workqueue lock | |
2032 | for (int i = 0; i < requests_count; i++){ | |
2033 | workq_reqthreads_req_t req = requests + i; | |
2034 | priority = req->priority; | |
2035 | ||
2036 | if (req->count == 0) | |
2037 | continue; | |
2038 | ||
2039 | int class = pthread_priority_get_class_index(priority); | |
2040 | ||
2041 | // Ask if we can start a new thread at the given class. Pass NUM_BUCKETS as | |
2042 | // my class to indicate we won't reuse this thread | |
2043 | if (may_start_constrained_thread(wq, class, WORKQUEUE_NUM_BUCKETS, NULL)){ | |
2044 | wq->wq_reqcount += 1; | |
2045 | wq->wq_requests[class] += 1; | |
2046 | wq->wq_kevent_requests[class] += 1; | |
2047 | ||
2048 | PTHREAD_TRACE(TRACE_wq_req_kevent_threads | DBG_FUNC_NONE, wq, priority, wq->wq_kevent_requests[class], 1, 0); | |
2049 | ||
2050 | // I can't make this call because it's not yet safe to make from | |
2051 | // scheduler callout context, so instead we'll just start up the timer | |
2052 | // which will spin up the thread when it fires. | |
2053 | // workqueue_run_one(p, wq, FALSE, priority); | |
2054 | ||
2055 | WQ_TIMER_NEEDED(wq, start_timer); | |
2056 | ||
2057 | goto done; | |
2058 | } | |
2059 | } | |
2060 | ||
2061 | // Okay, here's the fun case: we can't spin up any of the non-overcommit threads | |
2062 | // that we've seen a request for, so we kick this over to the event manager thread | |
2063 | ||
2064 | event_manager: | |
2065 | PTHREAD_TRACE(TRACE_wq_req_event_manager | DBG_FUNC_NONE, wq, wq->wq_event_manager_priority, wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET], wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET], 0); | |
2066 | ||
2067 | if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0){ | |
2068 | wq->wq_reqcount += 1; | |
2069 | wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1; | |
2070 | } | |
2071 | wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] = 1; | |
2072 | ||
2073 | if (wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) | |
2074 | WQ_TIMER_NEEDED(wq, start_timer); | |
2075 | ||
2076 | done: | |
2077 | workqueue_unlock(p); | |
2078 | ||
2079 | if (start_timer == TRUE) | |
2080 | workqueue_interval_timer_start(wq); | |
2081 | ||
2082 | PTHREAD_TRACE(TRACE_wq_kevent_req_threads | DBG_FUNC_END, wq, start_timer, 0, 0, 0); | |
2083 | ||
2084 | return THREAD_NULL; | |
2085 | } | |
2086 | ||
2087 | ||
2088 | static int wqops_thread_return(struct proc *p){ | |
2089 | thread_t th = current_thread(); | |
2090 | struct uthread *uth = pthread_kern->get_bsdthread_info(th); | |
2091 | struct threadlist *tl = util_get_thread_threadlist_entry(th); | |
2092 | ||
2093 | /* reset signal mask on the workqueue thread to default state */ | |
2094 | if (pthread_kern->uthread_get_sigmask(uth) != (sigset_t)(~workq_threadmask)) { | |
2095 | pthread_kern->proc_lock(p); | |
2096 | pthread_kern->uthread_set_sigmask(uth, ~workq_threadmask); | |
2097 | pthread_kern->proc_unlock(p); | |
2098 | } | |
2099 | ||
2100 | /* dropping WQ override counts has to be done outside the wq lock. */ | |
2101 | wq_thread_override_reset(th, THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD); | |
2102 | ||
2103 | workqueue_lock_spin(p); | |
2104 | ||
2105 | struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p); | |
2106 | if (wq == NULL || !tl) { | |
2107 | workqueue_unlock(p); | |
2108 | ||
2109 | return EINVAL; | |
2110 | } | |
2111 | PTHREAD_TRACE(TRACE_wq_runitem | DBG_FUNC_END, wq, 0, 0, 0, 0); | |
2112 | ||
2113 | (void)workqueue_run_nextreq(p, wq, th, RUN_NEXTREQ_DEFAULT, 0); | |
2114 | /* | |
2115 | * workqueue_run_nextreq is responsible for | |
2116 | * dropping the workqueue lock in all cases | |
2117 | */ | |
2118 | return 0; | |
2119 | } | |
2120 | ||
2121 | /** | |
2122 | * Multiplexed call to interact with the workqueue mechanism | |
2123 | */ | |
2124 | int | |
2125 | _workq_kernreturn(struct proc *p, | |
2126 | int options, | |
2127 | __unused user_addr_t item, | |
2128 | int arg2, | |
2129 | int arg3, | |
2130 | int32_t *retval) | |
2131 | { | |
2132 | int error = 0; | |
2133 | ||
2134 | if (pthread_kern->proc_get_register(p) == 0) { | |
2135 | return EINVAL; | |
2136 | } | |
2137 | ||
2138 | switch (options) { | |
2139 | case WQOPS_QUEUE_NEWSPISUPP: { | |
2140 | /* | |
2141 | * arg2 = offset of serialno into dispatch queue | |
2142 | * arg3 = kevent support | |
2143 | */ | |
2144 | int offset = arg2; | |
2145 | if (arg3 & 0x01){ | |
2146 | // If we get here, then userspace has indicated support for kevent delivery. | |
2147 | } | |
2148 | ||
2149 | pthread_kern->proc_set_dispatchqueue_serialno_offset(p, (uint64_t)offset); | |
2150 | break; | |
2151 | } | |
2152 | case WQOPS_QUEUE_REQTHREADS: { | |
2153 | /* | |
2154 | * arg2 = number of threads to start | |
2155 | * arg3 = priority | |
2156 | */ | |
2157 | error = wqops_queue_reqthreads(p, arg2, arg3); | |
2158 | break; | |
2159 | } | |
2160 | case WQOPS_SET_EVENT_MANAGER_PRIORITY: { | |
2161 | /* | |
2162 | * arg2 = priority for the manager thread | |
2163 | * | |
2164 | * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set, the bits outside | |
2165 | * _PTHREAD_PRIORITY_FLAGS_MASK contain a scheduling priority instead | |
2166 | * of a QoS value | |
2167 | */ | |
2168 | pthread_priority_t pri = arg2; | |
2169 | ||
2170 | workqueue_lock_spin(p); | |
2171 | struct workqueue *wq = (struct workqueue *)pthread_kern->proc_get_wqptr(p); | |
2172 | if (wq == NULL ) { | |
2173 | workqueue_unlock(p); | |
2174 | error = EINVAL; | |
2175 | break; | |
2176 | } | |
2177 | if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){ | |
2178 | // If userspace passes a scheduling priority, that takes precedence | |
2179 | // over any QoS. (So, userspace should take care not to accidentally | |
2180 | // lower the priority this way.) | |
2181 | uint32_t sched_pri = pri & (~_PTHREAD_PRIORITY_FLAGS_MASK); | |
2182 | if (wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){ | |
2183 | wq->wq_event_manager_priority = MAX(sched_pri, wq->wq_event_manager_priority & (~_PTHREAD_PRIORITY_FLAGS_MASK)) | |
2184 | | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; | |
2185 | } else { | |
2186 | wq->wq_event_manager_priority = sched_pri | |
2187 | | _PTHREAD_PRIORITY_SCHED_PRI_FLAG | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; | |
2188 | } | |
2189 | } else if ((wq->wq_event_manager_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) == 0){ | |
2190 | int cur_qos = pthread_priority_get_qos_class(wq->wq_event_manager_priority); | |
2191 | int new_qos = pthread_priority_get_qos_class(pri); | |
2192 | wq->wq_event_manager_priority = (uint32_t)pthread_qos_class_get_priority(MAX(cur_qos, new_qos)) | _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG; | |
2193 | } | |
2194 | workqueue_unlock(p); | |
2195 | break; | |
2196 | } | |
2197 | case WQOPS_THREAD_KEVENT_RETURN: { | |
2198 | int32_t kevent_retval; | |
2199 | int ret = kevent_qos_internal(p, -1, item, arg2, item, arg2, NULL, NULL, KEVENT_FLAG_WORKQ | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS, &kevent_retval); | |
2200 | // We shouldn't be getting more errors out than events we put in, so | |
2201 | // reusing the input buffer should always provide enough space | |
2202 | assert(ret == KERN_SUCCESS && kevent_retval >= 0); | |
2203 | if (ret != KERN_SUCCESS){ | |
2204 | error = ret; | |
2205 | break; | |
2206 | } else if (kevent_retval > 0){ | |
2207 | assert(kevent_retval <= arg2); | |
2208 | *retval = kevent_retval; | |
2209 | error = 0; | |
2210 | break; | |
2211 | } | |
2212 | } /* FALLTHROUGH */ | |
2213 | case WQOPS_THREAD_RETURN: { | |
2214 | error = wqops_thread_return(p); | |
2215 | // NOT REACHED except in case of error | |
2216 | assert(error); | |
2217 | break; | |
2218 | } | |
2219 | default: | |
2220 | error = EINVAL; | |
2221 | break; | |
2222 | } | |
2223 | return (error); | |
2224 | } | |
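 | /* | |
 |  * Hypothetical userspace usage sketch (the trap stub name and signature are | |
 |  * assumed here, not defined in this file): libpthread would request two | |
 |  * threads at a given priority with something like | |
 |  * | |
 |  *	__workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, 2, priority); | |
 |  * | |
 |  * which arrives above as options=WQOPS_QUEUE_REQTHREADS, arg2=2, | |
 |  * arg3=priority and is dispatched to wqops_queue_reqthreads(). | |
 |  */ | |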
2225 | ||
2226 | ||
2227 | static boolean_t | |
2228 | workqueue_run_one(proc_t p, struct workqueue *wq, boolean_t overcommit, pthread_priority_t priority) | |
2229 | { | |
2230 | boolean_t ran_one; | |
2231 | ||
2232 | if (wq->wq_thidlecount == 0) { | |
2233 | if (overcommit == FALSE) { | |
2234 | if (wq->wq_constrained_threads_scheduled < wq->wq_max_concurrency) | |
2235 | workqueue_addnewthread(wq, overcommit); | |
2236 | } else { | |
2237 | workqueue_addnewthread(wq, overcommit); | |
2238 | ||
2239 | if (wq->wq_thidlecount == 0) | |
2240 | return (FALSE); | |
2241 | } | |
2242 | } | |
2243 | ran_one = workqueue_run_nextreq(p, wq, THREAD_NULL, overcommit ? RUN_NEXTREQ_OVERCOMMIT : RUN_NEXTREQ_DEFAULT, priority); | |
2244 | /* | |
2245 | * workqueue_run_nextreq is responsible for | |
2246 | * dropping the workqueue lock in all cases | |
2247 | */ | |
2248 | workqueue_lock_spin(p); | |
2249 | ||
2250 | return (ran_one); | |
2251 | } | |
2252 | ||
2253 | /* | |
2254 | * this is a workqueue thread with no more | |
2255 | * work to do... park it for now | |
2256 | */ | |
2257 | static void | |
2258 | parkit(struct workqueue *wq, struct threadlist *tl, thread_t thread) | |
2259 | { | |
2260 | uint32_t us_to_wait; | |
2261 | ||
2262 | TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry); | |
2263 | tl->th_flags &= ~TH_LIST_RUNNING; | |
2264 | ||
2265 | tl->th_flags |= TH_LIST_BLOCKED; | |
2266 | TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry); | |
2267 | ||
2268 | pthread_kern->thread_sched_call(thread, NULL); | |
2269 | ||
2270 | OSAddAtomic(-1, &wq->wq_thactive_count[tl->th_priority]); | |
2271 | wq->wq_thscheduled_count[tl->th_priority]--; | |
2272 | wq->wq_threads_scheduled--; | |
2273 | ||
2274 | if (tl->th_flags & TH_LIST_CONSTRAINED) { | |
2275 | wq->wq_constrained_threads_scheduled--; | |
2276 | wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; | |
2277 | tl->th_flags &= ~TH_LIST_CONSTRAINED; | |
2278 | } | |
2279 | ||
2280 | if (wq->wq_thidlecount < 100) | |
2281 | us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100)); | |
2282 | else | |
2283 | us_to_wait = wq_reduce_pool_window_usecs / 100; | |
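 | /* | |
 |  * Illustrative arithmetic, assuming the default 5,000,000us reduce-pool | |
 |  * window: the wait shrinks linearly from the full window (no other idle | |
 |  * threads) down to window/100 (50,000us) once 100 or more threads are idle, | |
 |  * e.g. fifty idle threads give a ~2,500,000us wait, so a deep idle pool | |
 |  * drains much faster than a shallow one. | |
 |  */ | |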
2284 | ||
2285 | wq->wq_thidlecount++; | |
2286 | wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; | |
2287 | ||
2288 | assert_wait_timeout_with_leeway((caddr_t)tl, (THREAD_INTERRUPTIBLE), | |
2289 | TIMEOUT_URGENCY_SYS_BACKGROUND|TIMEOUT_URGENCY_LEEWAY, us_to_wait, | |
2290 | wq_reduce_pool_window_usecs, NSEC_PER_USEC); | |
2291 | ||
2292 | PTHREAD_TRACE1(TRACE_wq_thread_park | DBG_FUNC_START, wq, wq->wq_threads_scheduled, wq->wq_thidlecount, us_to_wait, thread_tid(thread)); | |
2293 | } | |
2294 | ||
2295 | static boolean_t may_start_constrained_thread(struct workqueue *wq, uint32_t at_priclass, uint32_t my_priclass, boolean_t *start_timer){ | |
2296 | if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { | |
2297 | /* | |
2298 | * we need 1 or more constrained threads to return to the kernel before | |
2299 | * we can dispatch additional work | |
2300 | */ | |
2301 | return FALSE; | |
2302 | } | |
2303 | ||
2304 | uint32_t busycount = 0; | |
2305 | uint32_t thactive_count = wq->wq_thactive_count[at_priclass]; | |
2306 | ||
2307 | // Has our most recently blocked thread blocked recently enough that we | |
2308 | // should still consider it busy? | |
2309 | // XXX should this be wq->wq_thscheduled_count[at_priclass] > thactive_count ? | |
2310 | if (wq->wq_thscheduled_count[at_priclass]) { | |
2311 | if (wq_thread_is_busy(mach_absolute_time(), &wq->wq_lastblocked_ts[at_priclass])) { | |
2312 | busycount++; | |
2313 | } | |
2314 | } | |
2315 | ||
2316 | if (my_priclass < WORKQUEUE_NUM_BUCKETS && my_priclass == at_priclass){ | |
2317 | /* | |
2318 | * don't count this thread as currently active | |
2319 | */ | |
2320 | thactive_count--; | |
2321 | } | |
2322 | ||
2323 | if (thactive_count + busycount >= wq->wq_max_concurrency) { | |
2324 | if (busycount && start_timer) { | |
2325 | /* | |
2326 | * we found at least 1 thread in the | |
2327 | * 'busy' state... make sure we start | |
2328 | * the timer because if they are the only | |
2329 | * threads keeping us from scheduling | |
2330 | * this work request, we won't get a callback | |
2331 | * to kick off the timer... we need to | |
2332 | * start it now... | |
2333 | */ | |
2334 | WQ_TIMER_NEEDED(wq, *start_timer); | |
2335 | } | |
2336 | ||
2337 | PTHREAD_TRACE(TRACE_wq_overcommitted|DBG_FUNC_NONE, wq, (start_timer ? 1<<7 : 0) | pthread_priority_from_class_index(at_priclass), thactive_count, busycount, 0); | |
2338 | ||
2339 | return FALSE; | |
2340 | } | |
2341 | return TRUE; | |
2342 | } | |
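 | /* | |
 |  * Worked example with illustrative numbers: if wq_max_concurrency is 4, | |
 |  * three threads are active in this bucket, and one thread in the bucket | |
 |  * blocked within the busy window, then thactive_count + busycount == 4 >= 4, | |
 |  * so we refuse to start another constrained thread and (when the caller | |
 |  * passed start_timer) arm the add timer to re-evaluate shortly. | |
 |  */ | |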
2343 | ||
2344 | static struct threadlist *pop_from_thidlelist(struct workqueue *wq, uint32_t priclass, int *upcall_flags, int *wake_thread){ | |
2345 | struct threadlist *tl = TAILQ_FIRST(&wq->wq_thidlelist); | |
2346 | TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry); | |
2347 | wq->wq_thidlecount--; | |
2348 | ||
2349 | TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry); | |
2350 | ||
2351 | if ((tl->th_flags & TH_LIST_SUSPENDED) == TH_LIST_SUSPENDED) { | |
2352 | tl->th_flags &= ~TH_LIST_SUSPENDED; | |
2353 | *upcall_flags &= ~WQ_FLAG_THREAD_REUSE; | |
2354 | ||
2355 | } else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) { | |
2356 | tl->th_flags &= ~TH_LIST_BLOCKED; | |
2357 | *wake_thread = 1; | |
2358 | } | |
2359 | tl->th_flags |= TH_LIST_RUNNING | TH_LIST_BUSY; | |
2360 | ||
2361 | wq->wq_threads_scheduled++; | |
2362 | wq->wq_thscheduled_count[priclass]++; | |
2363 | OSAddAtomic(1, &wq->wq_thactive_count[priclass]); | |
2364 | ||
2365 | return tl; | |
2366 | } | |
2367 | ||
2368 | static void | |
2369 | reset_to_priority(struct threadlist *tl, pthread_priority_t pri){ | |
2370 | kern_return_t ret; | |
2371 | thread_t th = tl->th_thread; | |
2372 | ||
2373 | if (tl->th_flags & TH_LIST_EVENT_MGR_SCHED_PRI){ | |
2374 | thread_precedence_policy_data_t precedinfo = { | |
2375 | .importance = 0 | |
2376 | }; | |
2377 | ret = pthread_kern->thread_policy_set_internal(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); | |
2378 | assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED); | |
2379 | tl->th_flags &= ~TH_LIST_EVENT_MGR_SCHED_PRI; | |
2380 | } | |
2381 | ||
2382 | thread_qos_policy_data_t qosinfo = { | |
2383 | .qos_tier = pthread_priority_get_qos_class(pri), | |
2384 | .tier_importance = 0 | |
2385 | }; | |
2386 | ret = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qosinfo, THREAD_QOS_POLICY_COUNT); | |
2387 | assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED); | |
2388 | } | |
2389 | ||
2390 | static void | |
2391 | reset_to_schedpri(struct threadlist *tl, pthread_priority_t pri){ | |
2392 | kern_return_t ret; | |
2393 | thread_t th = tl->th_thread; | |
2394 | ||
2395 | thread_qos_policy_data_t qosinfo = { | |
2396 | .qos_tier = THREAD_QOS_UNSPECIFIED, | |
2397 | .tier_importance = 0 | |
2398 | }; | |
2399 | ret = pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY, (thread_policy_t)&qosinfo, THREAD_QOS_POLICY_COUNT); | |
2400 | assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED); | |
2401 | ||
2402 | thread_precedence_policy_data_t precedinfo = { | |
2403 | .importance = ((pri & (~_PTHREAD_PRIORITY_FLAGS_MASK)) - BASEPRI_DEFAULT) | |
2404 | }; | |
2405 | ret = pthread_kern->thread_policy_set_internal(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); | |
2406 | assert(ret == KERN_SUCCESS || ret == KERN_TERMINATED); | |
2407 | ||
2408 | tl->th_flags |= TH_LIST_EVENT_MGR_SCHED_PRI; | |
2409 | } | |
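 | /* | |
 |  * Note on the precedence math above (assuming the usual xnu BASEPRI_DEFAULT | |
 |  * of 31): a requested scheduling priority of 37 becomes an importance of +6, | |
 |  * and 31 maps to 0, i.e. the thread runs at the default timeshare priority. | |
 |  */ | |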
2410 | ||
2411 | /** | |
2412 | * grabs a thread for a request | |
2413 | * | |
2414 | * - called with the workqueue lock held... | |
2415 | * - responsible for dropping it in all cases | |
2416 | * - if provided mode is for overcommit, doesn't consume a reqcount | |
2417 | * | |
2418 | */ | |
2419 | static boolean_t | |
2420 | workqueue_run_nextreq(proc_t p, struct workqueue *wq, thread_t thread, | |
2421 | enum run_nextreq_mode mode, pthread_priority_t oc_prio) | |
2422 | { | |
2423 | thread_t th_to_run = THREAD_NULL; | |
2424 | int wake_thread = 0; | |
2425 | int upcall_flags = WQ_FLAG_THREAD_REUSE; | |
2426 | uint32_t priclass; | |
2427 | struct threadlist *tl = NULL; | |
2428 | struct uthread *uth = NULL; | |
2429 | boolean_t start_timer = FALSE; | |
2430 | ||
2431 | // valid modes to call this function with | |
2432 | assert(mode == RUN_NEXTREQ_DEFAULT || mode == RUN_NEXTREQ_OVERCOMMIT || mode == RUN_NEXTREQ_UNCONSTRAINED); | |
2433 | // may only have a priority if in OVERCOMMIT mode | |
2434 | assert(mode == RUN_NEXTREQ_OVERCOMMIT || oc_prio == 0); | |
2435 | // thread == thread_null means "please spin up a new workqueue thread, we can't reuse this" | |
2436 | // thread != thread_null is thread reuse, and must be the current thread | |
2437 | assert(thread == THREAD_NULL || thread == current_thread()); | |
2438 | ||
2439 | PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_START, wq, thread, wq->wq_thidlecount, wq->wq_reqcount, 0); | |
2440 | ||
2441 | if (thread != THREAD_NULL) { | |
2442 | uth = pthread_kern->get_bsdthread_info(thread); | |
2443 | ||
2444 | if ((tl = pthread_kern->uthread_get_threadlist(uth)) == NULL) { | |
2445 | panic("wq thread with no threadlist"); | |
2446 | } | |
2447 | } | |
2448 | ||
2449 | /* | |
2450 | * from here until we drop the workq lock | |
2451 | * we can't be pre-empted since we hold | |
2452 | * the lock in spin mode... this is important | |
2453 | * since we have to independently update the priority that | |
2454 | * the thread is associated with and the priority-based | |
2455 | * counters that "workqueue_callback" also changes and bases | |
2456 | * decisions on. | |
2457 | */ | |
2458 | ||
2459 | if (mode == RUN_NEXTREQ_OVERCOMMIT) { | |
2460 | priclass = pthread_priority_get_class_index(oc_prio); | |
2461 | upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; | |
2462 | } else if (wq->wq_reqcount == 0){ | |
2463 | // no work to do. we'll check again when new work arrives. | |
2464 | goto done; | |
2465 | } else if (wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] && | |
2466 | ((wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 0) || | |
2467 | (thread != THREAD_NULL && tl->th_priority == WORKQUEUE_EVENT_MANAGER_BUCKET))){ | |
2468 | // There's an event manager request and either: | |
2469 | // - no event manager currently running | |
2470 | // - we are re-using the event manager | |
2471 | mode = RUN_NEXTREQ_EVENT_MANAGER; | |
2472 | priclass = WORKQUEUE_EVENT_MANAGER_BUCKET; | |
2473 | upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER; | |
2474 | if (wq->wq_kevent_requests[WORKQUEUE_EVENT_MANAGER_BUCKET]) | |
2475 | upcall_flags |= WQ_FLAG_THREAD_KEVENT; | |
2476 | } else { | |
2477 | // Find highest priority and check for special request types | |
2478 | for (priclass = 0; priclass < WORKQUEUE_EVENT_MANAGER_BUCKET; priclass++) { | |
2479 | if (wq->wq_requests[priclass]) | |
2480 | break; | |
2481 | } | |
2482 | if (priclass == WORKQUEUE_EVENT_MANAGER_BUCKET){ | |
2483 | // the only outstanding request must be the event manager one, since that bucket | |
2484 | // is skipped by the scan above; we couldn't handle it because an event manager | |
2485 | // is already running, so we fell into this case | |
2486 | assert(wq->wq_requests[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 && | |
2487 | wq->wq_thscheduled_count[WORKQUEUE_EVENT_MANAGER_BUCKET] == 1 && | |
2488 | wq->wq_reqcount == 1); | |
2489 | goto done; | |
2490 | } | |
2491 | ||
2492 | if (wq->wq_kevent_ocrequests[priclass]){ | |
2493 | mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT; | |
2494 | upcall_flags |= WQ_FLAG_THREAD_KEVENT; | |
2495 | upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; | |
2496 | } else if (wq->wq_ocrequests[priclass]){ | |
2497 | mode = RUN_NEXTREQ_DEFERRED_OVERCOMMIT; | |
2498 | upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT; | |
2499 | } else if (wq->wq_kevent_requests[priclass]){ | |
2500 | upcall_flags |= WQ_FLAG_THREAD_KEVENT; | |
2501 | } | |
2502 | } | |
2503 | ||
2504 | if (mode == RUN_NEXTREQ_DEFAULT /* non-overcommit */){ | |
2505 | uint32_t my_priclass = (thread != THREAD_NULL) ? tl->th_priority : WORKQUEUE_NUM_BUCKETS; | |
2506 | if (may_start_constrained_thread(wq, priclass, my_priclass, &start_timer) == FALSE){ | |
2507 | // per policy, we won't start another constrained thread | |
2508 | goto done; | |
2509 | } | |
2510 | } | |
2511 | ||
2512 | if (thread != THREAD_NULL) { | |
2513 | /* | |
2514 | * thread is non-NULL here when we return from userspace | |
2515 | * in workq_kernreturn; rather than trying to find a new | |
2516 | * thread, we pick up new work for this specific thread. | |
2517 | */ | |
2518 | th_to_run = thread; | |
2519 | } else if (wq->wq_thidlecount == 0) { | |
2520 | /* | |
2521 | * we have no additional threads waiting to pick up | |
2522 | * work, however, there is additional work to do. | |
2523 | */ | |
2524 | WQ_TIMER_NEEDED(wq, start_timer); | |
2525 | ||
2526 | PTHREAD_TRACE(TRACE_wq_stalled, wq, wq->wq_nthreads, start_timer, 0, 0); | |
2527 | ||
2528 | goto done; | |
2529 | } else { | |
2530 | // there is both work available and an idle thread, so activate a thread | |
2531 | tl = pop_from_thidlelist(wq, priclass, &upcall_flags, &wake_thread); | |
2532 | th_to_run = tl->th_thread; | |
2533 | } | |
2534 | ||
2535 | // Adjust counters and thread flags AKA consume the request | |
2536 | // TODO: It would be lovely if OVERCOMMIT consumed reqcount | |
2537 | switch (mode) { | |
2538 | case RUN_NEXTREQ_DEFAULT: | |
2539 | case RUN_NEXTREQ_UNCONSTRAINED: | |
2540 | wq->wq_reqcount--; | |
2541 | wq->wq_requests[priclass]--; | |
2542 | ||
2543 | if (mode == RUN_NEXTREQ_DEFAULT){ | |
2544 | if (!(tl->th_flags & TH_LIST_CONSTRAINED)) { | |
2545 | wq->wq_constrained_threads_scheduled++; | |
2546 | tl->th_flags |= TH_LIST_CONSTRAINED; | |
2547 | } | |
2548 | } else if (mode == RUN_NEXTREQ_UNCONSTRAINED){ | |
2549 | if (tl->th_flags & TH_LIST_CONSTRAINED) { | |
2550 | // XXX: Why aren't we unsetting CONSTRAINED_THREAD_LIMIT here | |
2551 | wq->wq_constrained_threads_scheduled--; | |
2552 | tl->th_flags &= ~TH_LIST_CONSTRAINED; | |
2553 | } | |
2554 | } | |
2555 | if (upcall_flags & WQ_FLAG_THREAD_KEVENT){ | |
2556 | wq->wq_kevent_requests[priclass]--; | |
2557 | } | |
2558 | break; | |
2559 | ||
2560 | case RUN_NEXTREQ_EVENT_MANAGER: | |
2561 | wq->wq_reqcount--; | |
2562 | wq->wq_requests[priclass]--; | |
2563 | ||
2564 | if (tl->th_flags & TH_LIST_CONSTRAINED) { | |
2565 | wq->wq_constrained_threads_scheduled--; | |
2566 | tl->th_flags &= ~TH_LIST_CONSTRAINED; | |
2567 | } | |
2568 | if (upcall_flags & WQ_FLAG_THREAD_KEVENT){ | |
2569 | wq->wq_kevent_requests[priclass]--; | |
2570 | } | |
2571 | break; | |
2572 | ||
2573 | case RUN_NEXTREQ_DEFERRED_OVERCOMMIT: | |
2574 | wq->wq_reqcount--; | |
2575 | wq->wq_requests[priclass]--; | |
2576 | if (upcall_flags & WQ_FLAG_THREAD_KEVENT){ | |
2577 | wq->wq_kevent_ocrequests[priclass]--; | |
2578 | } else { | |
2579 | wq->wq_ocrequests[priclass]--; | |
2580 | } | |
2581 | /* FALLTHROUGH */ | |
2582 | case RUN_NEXTREQ_OVERCOMMIT: | |
2583 | if (tl->th_flags & TH_LIST_CONSTRAINED) { | |
2584 | wq->wq_constrained_threads_scheduled--; | |
2585 | tl->th_flags &= ~TH_LIST_CONSTRAINED; | |
2586 | } | |
2587 | break; | |
2588 | } | |
2589 | ||
2590 | // Confirm we've maintained our counter invariants | |
2591 | assert(wq->wq_requests[priclass] < UINT16_MAX); | |
2592 | assert(wq->wq_ocrequests[priclass] < UINT16_MAX); | |
2593 | assert(wq->wq_kevent_requests[priclass] < UINT16_MAX); | |
2594 | assert(wq->wq_kevent_ocrequests[priclass] < UINT16_MAX); | |
2595 | assert(wq->wq_ocrequests[priclass] + wq->wq_kevent_requests[priclass] + | |
2596 | wq->wq_kevent_ocrequests[priclass] <= | |
2597 | wq->wq_requests[priclass]); | |
2598 | ||
2599 | uint32_t orig_class = tl->th_priority; | |
2600 | tl->th_priority = (uint8_t)priclass; | |
2601 | ||
2602 | if ((thread != THREAD_NULL) && (orig_class != priclass)) { | |
2603 | /* | |
2604 | * we need to adjust these counters based on this | |
2605 | * thread's new disposition with respect to priority | |
2606 | */ | |
2607 | OSAddAtomic(-1, &wq->wq_thactive_count[orig_class]); | |
2608 | OSAddAtomic(1, &wq->wq_thactive_count[priclass]); | |
2609 | ||
2610 | wq->wq_thscheduled_count[orig_class]--; | |
2611 | wq->wq_thscheduled_count[priclass]++; | |
2612 | } | |
2613 | wq->wq_thread_yielded_count = 0; | |
2614 | ||
2615 | workqueue_unlock(p); | |
2616 | ||
2617 | pthread_priority_t outgoing_priority; | |
2618 | if (mode == RUN_NEXTREQ_EVENT_MANAGER){ | |
2619 | outgoing_priority = wq->wq_event_manager_priority; | |
2620 | } else { | |
2621 | outgoing_priority = pthread_priority_from_class_index(priclass); | |
2622 | } | |
2623 | ||
2624 | PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_START, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0); | |
2625 | if (outgoing_priority & _PTHREAD_PRIORITY_SCHED_PRI_FLAG){ | |
2626 | reset_to_schedpri(tl, outgoing_priority & (~_PTHREAD_PRIORITY_FLAGS_MASK)); | |
2627 | } else if (orig_class != priclass) { | |
2628 | reset_to_priority(tl, outgoing_priority); | |
2629 | } | |
2630 | PTHREAD_TRACE(TRACE_wq_reset_priority | DBG_FUNC_END, wq, thread_tid(tl->th_thread), outgoing_priority, 0, 0); | |
2631 | ||
2632 | /* | |
2633 | * if the current thread is being reused for the work request, wq_runreq does not return via unix_syscall | |
2634 | */ | |
2635 | wq_runreq(p, outgoing_priority, th_to_run, tl, upcall_flags, wake_thread, (thread == th_to_run)); | |
2636 | ||
2637 | PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, wq, thread_tid(th_to_run), mode == RUN_NEXTREQ_OVERCOMMIT, 1, 0); | |
2638 | ||
2639 | return (TRUE); | |
2640 | ||
2641 | done: | |
2642 | if (thread != THREAD_NULL){ | |
2643 | parkit(wq,tl,thread); | |
2644 | } | |
2645 | ||
2646 | workqueue_unlock(p); | |
2647 | ||
2648 | if (start_timer) | |
2649 | workqueue_interval_timer_start(wq); | |
2650 | ||
2651 | PTHREAD_TRACE(TRACE_wq_run_nextitem | DBG_FUNC_END, wq, thread_tid(thread), start_timer, 3, 0); | |
2652 | ||
2653 | if (thread != THREAD_NULL){ | |
2654 | thread_block((thread_continue_t)wq_unpark_continue); | |
2655 | /* NOT REACHED */ | |
2656 | } | |
2657 | ||
2658 | return (FALSE); | |
2659 | } | |
2660 | ||
2661 | /** | |
2662 | * Continuation run when a newly created workqueue thread is first resumed (or aborted) | |
2663 | */ | |
2664 | static void | |
2665 | wq_unsuspend_continue(void) | |
2666 | { | |
2667 | struct uthread *uth = NULL; | |
2668 | thread_t th_to_unsuspend; | |
2669 | struct threadlist *tl; | |
2670 | proc_t p; | |
2671 | ||
2672 | th_to_unsuspend = current_thread(); | |
2673 | uth = pthread_kern->get_bsdthread_info(th_to_unsuspend); | |
2674 | ||
2675 | if (uth != NULL && (tl = pthread_kern->uthread_get_threadlist(uth)) != NULL) { | |
2676 | ||
2677 | if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) == TH_LIST_RUNNING) { | |
2678 | /* | |
2679 | * most likely a normal resume of this thread occurred... | |
2680 | * it's also possible that the thread was aborted after we | |
2681 | * finished setting it up so that it could be dispatched... if | |
2682 | * so, thread_bootstrap_return will notice the abort and put | |
2683 | * the thread on the path to self-destruction | |
2684 | */ | |
2685 | normal_resume_to_user: | |
2686 | pthread_kern->thread_sched_call(th_to_unsuspend, workqueue_callback); | |
2687 | pthread_kern->thread_bootstrap_return(); | |
2688 | } | |
2689 | /* | |
2690 | * if we get here, it's because we've been resumed due to | |
2691 | * an abort of this thread (process is crashing) | |
2692 | */ | |
2693 | p = current_proc(); | |
2694 | ||
2695 | workqueue_lock_spin(p); | |
2696 | ||
2697 | if (tl->th_flags & TH_LIST_SUSPENDED) { | |
2698 | /* | |
2699 | * thread has been aborted while still on our idle | |
2700 | * queue... remove it from our domain... | |
2701 | * workqueue_removethread consumes the lock | |
2702 | */ | |
2703 | workqueue_removethread(tl, 0); | |
2704 | pthread_kern->thread_bootstrap_return(); | |
2705 | } | |
2706 | while ((tl->th_flags & TH_LIST_BUSY)) { | |
2707 | /* | |
2708 | * this thread was aborted after we started making | |
2709 | * it runnable, but before we finished dispatching it... | |
2710 | * we need to wait for that process to finish, | |
2711 | * and we need to ask for a wakeup instead of a | |
2712 | * thread_resume since the abort has already resumed us | |
2713 | */ | |
2714 | tl->th_flags |= TH_LIST_NEED_WAKEUP; | |
2715 | ||
2716 | assert_wait((caddr_t)tl, (THREAD_UNINT)); | |
2717 | ||
2718 | workqueue_unlock(p); | |
2719 | thread_block(THREAD_CONTINUE_NULL); | |
2720 | workqueue_lock_spin(p); | |
2721 | } | |
2722 | workqueue_unlock(p); | |
2723 | /* | |
2724 | * we have finished setting up the thread's context... | |
2725 | * thread_bootstrap_return will take us through the abort path | |
2726 | * where the thread will self destruct | |
2727 | */ | |
2728 | goto normal_resume_to_user; | |
2729 | } | |
2730 | pthread_kern->thread_bootstrap_return(); | |
2731 | } | |
2732 | ||
2733 | /** | |
2734 | * parked thread wakes up | |
2735 | */ | |
2736 | static void | |
2737 | wq_unpark_continue(void) | |
2738 | { | |
2739 | struct uthread *uth; | |
2740 | struct threadlist *tl; | |
2741 | ||
2742 | thread_t th_to_unpark = current_thread(); | |
2743 | ||
2744 | if ((uth = pthread_kern->get_bsdthread_info(th_to_unpark)) == NULL) | |
2745 | goto done; | |
2746 | if ((tl = pthread_kern->uthread_get_threadlist(uth)) == NULL) | |
2747 | goto done; | |
2748 | ||
2749 | /* | |
2750 | * check if a normal wakeup of this thread occurred... if so, there's no need | |
2751 | * for any synchronization with the timer and wq_runreq so we just skip all this. | |
2752 | */ | |
2753 | if ((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) != TH_LIST_RUNNING) { | |
2754 | proc_t p = current_proc(); | |
2755 | ||
2756 | workqueue_lock_spin(p); | |
2757 | ||
2758 | if ( !(tl->th_flags & TH_LIST_RUNNING)) { | |
2759 | /* | |
2760 | * the timer popped us out and we've not | |
2761 | * been moved off of the idle list | |
2762 | * so we should now self-destruct | |
2763 | * | |
2764 | * workqueue_removethread consumes the lock | |
2765 | */ | |
2766 | workqueue_removethread(tl, 0); | |
2767 | pthread_kern->unix_syscall_return(0); | |
2768 | } | |
2769 | ||
2770 | /* | |
2771 | * the timer woke us up, but we have already | |
2772 | * started to make this a runnable thread, | |
2773 | * but have not yet finished that process... | |
2774 | * so wait for the normal wakeup | |
2775 | */ | |
2776 | while ((tl->th_flags & TH_LIST_BUSY)) { | |
2777 | ||
2778 | assert_wait((caddr_t)tl, (THREAD_UNINT)); | |
2779 | ||
2780 | workqueue_unlock(p); | |
2781 | ||
2782 | thread_block(THREAD_CONTINUE_NULL); | |
2783 | ||
2784 | workqueue_lock_spin(p); | |
2785 | } | |
2786 | ||
2787 | /* | |
2788 | * we have finished setting up the thread's context | |
2789 | * now we can return as if we got a normal wakeup | |
2790 | */ | |
2791 | workqueue_unlock(p); | |
2792 | } | |
2793 | ||
2794 | pthread_kern->thread_sched_call(th_to_unpark, workqueue_callback); | |
2795 | ||
2796 | // FIXME: What's this? | |
2797 | PTHREAD_TRACE(0xefffd018 | DBG_FUNC_END, tl->th_workq, 0, 0, 0, 0); | |
2798 | ||
2799 | done: | |
2800 | ||
2801 | // XXX should be using unix_syscall_return(EJUSTRETURN) | |
2802 | pthread_kern->thread_exception_return(); | |
2803 | } | |
2804 | ||
2805 | ||
2806 | ||
2807 | static void | |
2808 | wq_runreq(proc_t p, pthread_priority_t priority, thread_t th, struct threadlist *tl, | |
2809 | int flags, int wake_thread, int return_directly) | |
2810 | { | |
2811 | int ret = 0; | |
2812 | boolean_t need_resume = FALSE; | |
2813 | ||
2814 | PTHREAD_TRACE1(TRACE_wq_runitem | DBG_FUNC_START, tl->th_workq, flags, priority, thread_tid(current_thread()), thread_tid(th)); | |
2815 | ||
2816 | ret = _setup_wqthread(p, th, priority, flags, tl); | |
2817 | ||
2818 | if (ret != 0) | |
2819 | panic("setup_wqthread failed %x\n", ret); | |
2820 | ||
2821 | if (return_directly) { | |
2822 | PTHREAD_TRACE(TRACE_wq_run_nextitem|DBG_FUNC_END, tl->th_workq, 0, 0, 4, 0); | |
2823 | ||
2824 | // XXX should be using unix_syscall_return(EJUSTRETURN) | |
2825 | pthread_kern->thread_exception_return(); | |
2826 | panic("wq_runreq: thread_exception_return returned ...\n"); | |
2827 | } | |
2828 | if (wake_thread) { | |
2829 | workqueue_lock_spin(p); | |
2830 | ||
2831 | tl->th_flags &= ~TH_LIST_BUSY; | |
2832 | wakeup(tl); | |
2833 | ||
2834 | workqueue_unlock(p); | |
2835 | } else { | |
2836 | PTHREAD_TRACE1(TRACE_wq_thread_suspend | DBG_FUNC_END, tl->th_workq, 0, 0, thread_tid(current_thread()), thread_tid(th)); | |
2837 | ||
2838 | workqueue_lock_spin(p); | |
2839 | ||
2840 | if (tl->th_flags & TH_LIST_NEED_WAKEUP) { | |
2841 | wakeup(tl); | |
2842 | } else { | |
2843 | need_resume = TRUE; | |
2844 | } | |
2845 | ||
2846 | tl->th_flags &= ~(TH_LIST_BUSY | TH_LIST_NEED_WAKEUP); | |
2847 | ||
2848 | workqueue_unlock(p); | |
2849 | ||
2850 | if (need_resume) { | |
2851 | /* | |
2852 | * need to do this outside of the workqueue spin lock | |
2853 | * since thread_resume locks the thread via a full mutex | |
2854 | */ | |
2855 | pthread_kern->thread_resume(th); | |
2856 | } | |
2857 | } | |
2858 | } | |
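/*
 * Editorial summary of the three hand-off paths above, derived from the code
 * visible in this function (not part of the original source): return_directly
 * != 0 means the thread being set up is evidently the caller itself, which
 * jumps straight to the workqueue entry point via thread_exception_return()
 * and never returns here; wake_thread != 0 means the target is parked in the
 * TH_LIST_BUSY wait loop and only needs a wakeup(); otherwise the target is a
 * suspended thread that gets either a wakeup() (if TH_LIST_NEED_WAKEUP is set)
 * or a thread_resume() performed outside the workqueue spin lock.
 */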
2859 | ||
2860 | #define KEVENT_LIST_LEN 16 | |
2861 | #define KEVENT_DATA_SIZE (32 * 1024) | |
2862 | ||
2863 | /** | |
2864 | * configures initial thread stack/registers to jump into: | |
2865 | * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int flags, int nkevents); | |
2866 | * to get there we jump through assembly stubs in pthread_asm.s. Those | |
2867 | * routines set up a stack frame, using the current stack pointer, and marshal | |
2868 | * arguments from registers to the stack as required by the ABI. | |
2869 | * | |
2870 | * One odd thing we do here is to start the pthread_t 4k below what would be the | |
2871 | * top of the stack otherwise. This is because usually only the first 4k of the | |
2872 | * pthread_t will be used and so we want to put it on the same 16k page as the | |
2873 | * top of the stack to save memory. | |
2874 | * | |
2875 | * When we are done the stack will look like: | |
2876 | * |-----------| th_stackaddr + th_allocsize | |
2877 | * |pthread_t | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET | |
2878 | * |kevent list| optionally - at most KEVENT_LIST_LEN events | |
2879 | * |kevent data| optionally - at most KEVENT_DATA_SIZE bytes | |
2880 | * |stack gap | bottom aligned to 16 bytes, and at least as big as stack_gap_min | |
2881 | * | STACK | | |
2882 | * | ⇓ | | |
2883 | * | | | |
2884 | * |guard page | guardsize | |
2885 | * |-----------| th_stackaddr | |
2886 | */ | |
2887 | int | |
2888 | _setup_wqthread(proc_t p, thread_t th, pthread_priority_t priority, int flags, struct threadlist *tl) | |
2889 | { | |
2890 | int error = 0; | |
2891 | ||
2892 | const vm_size_t guardsize = vm_map_page_size(tl->th_workq->wq_map); | |
2893 | const vm_size_t stack_gap_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_REDZONE_LEN; | |
2894 | const vm_size_t stack_align_min = (proc_is64bit(p) == 0) ? C_32_STK_ALIGN : C_64_STK_ALIGN; | |
2895 | ||
2896 | user_addr_t pthread_self_addr = (user_addr_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET); | |
2897 | user_addr_t stack_top_addr = (user_addr_t)((pthread_self_addr - stack_gap_min) & -stack_align_min); | |
2898 | user_addr_t stack_bottom_addr = (user_addr_t)(tl->th_stackaddr + guardsize); | |
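/*
 * Illustrative arithmetic only (not in the original source): assuming a 4K
 * guard/page size, a hypothetical th_stackaddr of 0x10000000, a 512K
 * PTH_DEFAULT_STACKSIZE and a zero PTHREAD_T_OFFSET, the lines above would
 * yield pthread_self_addr = 0x10000000 + 0x80000 + 0x1000 = 0x10081000,
 * stack_bottom_addr = 0x10001000 (just above the guard page), and
 * stack_top_addr just below pthread_self_addr after subtracting the
 * red-zone/alignment gap and rounding down to the stack alignment.
 */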
2899 | ||
2900 | /* Put the QoS class value into the lower bits of the reuse_thread register; this is where | |
2901 | * the thread priority used to be stored anyway. | |
2902 | */ | |
2903 | flags |= (_pthread_priority_get_qos_newest(priority) & WQ_FLAG_THREAD_PRIOMASK); | |
2904 | ||
2905 | flags |= WQ_FLAG_THREAD_NEWSPI; | |
2906 | ||
2907 | user_addr_t kevent_list = NULL; | |
2908 | int kevent_count = 0; | |
2909 | if (flags & WQ_FLAG_THREAD_KEVENT){ | |
2910 | kevent_list = pthread_self_addr - KEVENT_LIST_LEN * sizeof(struct kevent_qos_s); | |
2911 | kevent_count = KEVENT_LIST_LEN; | |
2912 | ||
2913 | user_addr_t kevent_data_buf = kevent_list - KEVENT_DATA_SIZE; | |
2914 | user_size_t kevent_data_available = KEVENT_DATA_SIZE; | |
2915 | ||
2916 | int32_t events_out = 0; | |
2917 | ||
2918 | int ret = kevent_qos_internal(p, -1, NULL, 0, kevent_list, kevent_count, | |
2919 | kevent_data_buf, &kevent_data_available, | |
2920 | KEVENT_FLAG_WORKQ | KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_IMMEDIATE, | |
2921 | &events_out); | |
2922 | ||
2923 | // squash any errors into just empty output on non-debug builds | |
2924 | assert(ret == KERN_SUCCESS && events_out != -1); | |
2925 | if (ret != KERN_SUCCESS || events_out == -1){ | |
2926 | events_out = 0; | |
2927 | kevent_data_available = KEVENT_DATA_SIZE; | |
2928 | } | |
2929 | ||
2930 | // We shouldn't get data out if there aren't events available | |
2931 | assert(events_out != 0 || kevent_data_available == KEVENT_DATA_SIZE); | |
2932 | ||
2933 | if (events_out >= 0){ | |
2934 | kevent_count = events_out; | |
2935 | kevent_list = pthread_self_addr - kevent_count * sizeof(struct kevent_qos_s); | |
2936 | ||
2937 | if (kevent_data_available == KEVENT_DATA_SIZE){ | |
2938 | stack_top_addr = (kevent_list - stack_gap_min) & -stack_align_min; | |
2939 | } else { | |
2940 | stack_top_addr = (kevent_data_buf + kevent_data_available - stack_gap_min) & -stack_align_min; | |
2941 | } | |
2942 | } else { | |
2943 | kevent_list = NULL; | |
2944 | kevent_count = 0; | |
2945 | } | |
2946 | } | |
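/*
 * Editorial illustration of the layout when WQ_FLAG_THREAD_KEVENT is set
 * (the 64-byte event size is an assumption used only for the arithmetic,
 * not taken from the source): the list starts
 * KEVENT_LIST_LEN * sizeof(struct kevent_qos_s) bytes below the pthread_t
 * (16 * 64 = 1 KB under that assumption), with the 32 KB data buffer
 * immediately below the list; after kevent_qos_internal() returns, the
 * list is re-sized to the events actually delivered and stack_top_addr is
 * recomputed so the usable stack begins below whatever kevent data was
 * actually consumed.
 */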
2947 | ||
2948 | #if defined(__i386__) || defined(__x86_64__) | |
2949 | int isLP64 = proc_is64bit(p); | |
2950 | ||
2951 | if (isLP64 == 0) { | |
2952 | x86_thread_state32_t state = { | |
2953 | .eip = (unsigned int)pthread_kern->proc_get_wqthread(p), | |
2954 | .eax = /* arg0 */ (unsigned int)pthread_self_addr, | |
2955 | .ebx = /* arg1 */ (unsigned int)tl->th_thport, | |
2956 | .ecx = /* arg2 */ (unsigned int)stack_bottom_addr, | |
2957 | .edx = /* arg3 */ (unsigned int)kevent_list, | |
2958 | .edi = /* arg4 */ (unsigned int)flags, | |
2959 | .esi = /* arg5 */ (unsigned int)kevent_count, | |
2960 | ||
2961 | .esp = (int)((vm_offset_t)stack_top_addr), | |
2962 | }; | |
2963 | ||
2964 | (void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state); | |
2965 | } else { | |
2966 | x86_thread_state64_t state64 = { | |
2967 | // x86-64 already passes all the arguments in registers, so we just put them in their final place here | |
2968 | .rip = (uint64_t)pthread_kern->proc_get_wqthread(p), | |
2969 | .rdi = (uint64_t)pthread_self_addr, | |
2970 | .rsi = (uint64_t)tl->th_thport, | |
2971 | .rdx = (uint64_t)stack_bottom_addr, | |
2972 | .rcx = (uint64_t)kevent_list, | |
2973 | .r8 = (uint64_t)flags, | |
2974 | .r9 = (uint64_t)kevent_count, | |
2975 | ||
2976 | .rsp = (uint64_t)(stack_top_addr) | |
2977 | }; | |
2978 | ||
2979 | error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64); | |
2980 | if (error != KERN_SUCCESS) { | |
2981 | error = EINVAL; | |
2982 | } | |
2983 | } | |
2984 | #else | |
2985 | #error setup_wqthread not defined for this architecture | |
2986 | #endif | |
2987 | ||
2988 | return error; | |
2989 | } | |
2990 | ||
2991 | #if DEBUG | |
2992 | static int wq_kevent_test SYSCTL_HANDLER_ARGS { | |
2993 | //(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) | |
2994 | #pragma unused(oidp, arg1, arg2) | |
2995 | int error; | |
2996 | struct workq_reqthreads_req_s requests[64] = {}; | |
2997 | ||
2998 | if (req->newlen > sizeof(requests) || req->newlen < sizeof(struct workq_reqthreads_req_s)) | |
2999 | return EINVAL; | |
3000 | ||
3001 | error = copyin(req->newptr, requests, req->newlen); | |
3002 | if (error) return error; | |
3003 | ||
3004 | _workq_reqthreads(req->p, (int)(req->newlen / sizeof(struct workq_reqthreads_req_s)), requests); | |
3005 | ||
3006 | return 0; | |
3007 | } | |
3008 | #endif // DEBUG | |
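/*
 * Hypothetical userspace exercise of the handler above, assuming a DEBUG
 * kernel and that the OID registered in _pthread_init() surfaces as
 * "debug.wq_kevent_test" (the request layout is the kernel-private
 * struct workq_reqthreads_req_s, so field initialization is elided here):
 *
 *     struct workq_reqthreads_req_s reqs[2] = { };   // fill in per-QoS requests
 *     if (sysctlbyname("debug.wq_kevent_test", NULL, NULL, reqs, sizeof(reqs)) != 0)
 *         perror("sysctlbyname");
 */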
3009 | ||
3010 | #pragma mark - Misc | |
3011 | ||
3012 | int | |
3013 | _fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) | |
3014 | { | |
3015 | struct workqueue * wq; | |
3016 | int error = 0; | |
3017 | int activecount; | |
3018 | uint32_t pri; | |
3019 | ||
3020 | workqueue_lock_spin(p); | |
3021 | if ((wq = pthread_kern->proc_get_wqptr(p)) == NULL) { | |
3022 | error = EINVAL; | |
3023 | goto out; | |
3024 | } | |
3025 | activecount = 0; | |
3026 | ||
3027 | for (pri = 0; pri < WORKQUEUE_NUM_BUCKETS; pri++) { | |
3028 | activecount += wq->wq_thactive_count[pri]; | |
3029 | } | |
3030 | pwqinfo->pwq_nthreads = wq->wq_nthreads; | |
3031 | pwqinfo->pwq_runthreads = activecount; | |
3032 | pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount; | |
3033 | pwqinfo->pwq_state = 0; | |
3034 | ||
3035 | if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT) { | |
3036 | pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT; | |
3037 | } | |
3038 | ||
3039 | if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT) { | |
3040 | pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT; | |
3041 | } | |
3042 | ||
3043 | out: | |
3044 | workqueue_unlock(p); | |
3045 | return(error); | |
3046 | } | |
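/*
 * Typical consumer sketch (editorial; assumes the standard libproc wrapper
 * reaches this function via the PROC_PIDWORKQUEUEINFO flavor of proc_pidinfo):
 *
 *     #include <libproc.h>
 *     #include <sys/proc_info.h>
 *
 *     struct proc_workqueueinfo pwqinfo;
 *     int ret = proc_pidinfo(pid, PROC_PIDWORKQUEUEINFO, 0,
 *                            &pwqinfo, sizeof(pwqinfo));
 *     if (ret == sizeof(pwqinfo))
 *         printf("%u workqueue threads, %u running\n",
 *                pwqinfo.pwq_nthreads, pwqinfo.pwq_runthreads);
 */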
3047 | ||
3048 | int | |
3049 | _thread_selfid(__unused struct proc *p, uint64_t *retval) | |
3050 | { | |
3051 | thread_t thread = current_thread(); | |
3052 | *retval = thread_tid(thread); | |
3053 | return KERN_SUCCESS; | |
3054 | } | |
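/*
 * Userspace counterpart, for illustration (assumption: this handler backs
 * the thread-ID fetch exposed as pthread_threadid_np()):
 *
 *     uint64_t tid = 0;
 *     pthread_threadid_np(NULL, &tid);   // NULL means "the calling thread"
 *     // tid now holds the same value thread_tid() reports above
 */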
3055 | ||
3056 | void | |
3057 | _pthread_init(void) | |
3058 | { | |
3059 | pthread_lck_grp_attr = lck_grp_attr_alloc_init(); | |
3060 | pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr); | |
3061 | ||
3062 | /* | |
3063 | * allocate the lock attribute for pthread synchronizers | |
3064 | */ | |
3065 | pthread_lck_attr = lck_attr_alloc_init(); | |
3066 | ||
3067 | _workqueue_init_lock((proc_t)get_bsdtask_info(kernel_task)); | |
3068 | pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr); | |
3069 | ||
3070 | pth_global_hashinit(); | |
3071 | psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL); | |
3072 | psynch_zoneinit(); | |
3073 | ||
3074 | /* | |
3075 | * register sysctls | |
3076 | */ | |
3077 | sysctl_register_oid(&sysctl__kern_wq_yielded_threshold); | |
3078 | sysctl_register_oid(&sysctl__kern_wq_yielded_window_usecs); | |
3079 | sysctl_register_oid(&sysctl__kern_wq_stalled_window_usecs); | |
3080 | sysctl_register_oid(&sysctl__kern_wq_reduce_pool_window_usecs); | |
3081 | sysctl_register_oid(&sysctl__kern_wq_max_timer_interval_usecs); | |
3082 | sysctl_register_oid(&sysctl__kern_wq_max_threads); | |
3083 | sysctl_register_oid(&sysctl__kern_wq_max_constrained_threads); | |
3084 | sysctl_register_oid(&sysctl__kern_pthread_debug_tracing); | |
3085 | ||
3086 | #if DEBUG | |
3087 | sysctl_register_oid(&sysctl__kern_wq_max_concurrency); | |
3088 | sysctl_register_oid(&sysctl__debug_wq_kevent_test); | |
3089 | #endif | |
3090 | ||
3091 | wq_max_concurrency = pthread_kern->ml_get_max_cpus(); | |
3092 | ||
3093 | } |