1/*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68/*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75#include <sys/param.h>
76#include <sys/systm.h>
77#include <sys/filedesc.h>
78#include <sys/ioctl.h>
79#include <sys/file_internal.h>
80#include <sys/proc_internal.h>
81#include <sys/socketvar.h>
82#include <sys/uio_internal.h>
83#include <sys/kernel.h>
84#include <sys/guarded.h>
85#include <sys/stat.h>
86#include <sys/malloc.h>
87#include <sys/sysproto.h>
88
89#include <sys/mount_internal.h>
90#include <sys/protosw.h>
91#include <sys/ev.h>
92#include <sys/user.h>
93#include <sys/kdebug.h>
94#include <sys/poll.h>
95#include <sys/event.h>
96#include <sys/eventvar.h>
97#include <sys/proc.h>
98#include <sys/kauth.h>
99
100#include <machine/smp.h>
101#include <mach/mach_types.h>
102#include <kern/kern_types.h>
103#include <kern/assert.h>
104#include <kern/kalloc.h>
105#include <kern/thread.h>
106#include <kern/clock.h>
107#include <kern/ledger.h>
108#include <kern/task.h>
109#include <kern/telemetry.h>
110#include <kern/waitq.h>
111#include <kern/sched_prim.h>
112#include <kern/mpsc_queue.h>
113#include <kern/debug.h>
114
115#include <sys/mbuf.h>
116#include <sys/domain.h>
117#include <sys/socket.h>
118#include <sys/socketvar.h>
119#include <sys/errno.h>
120#include <sys/syscall.h>
121#include <sys/pipe.h>
122
123#include <security/audit/audit.h>
124
125#include <net/if.h>
126#include <net/route.h>
127
128#include <netinet/in.h>
129#include <netinet/in_systm.h>
130#include <netinet/ip.h>
131#include <netinet/in_pcb.h>
132#include <netinet/ip_var.h>
133#include <netinet/ip6.h>
134#include <netinet/tcp.h>
135#include <netinet/tcp_fsm.h>
136#include <netinet/tcp_seq.h>
137#include <netinet/tcp_timer.h>
138#include <netinet/tcp_var.h>
139#include <netinet/tcpip.h>
140#include <netinet/tcp_debug.h>
141/* for wait queue based select */
142#include <kern/waitq.h>
143#include <sys/vnode_internal.h>
144/* for remote time api */
145#include <kern/remote_time.h>
146#include <os/log.h>
147#include <sys/log_data.h>
148
149#if CONFIG_MACF
150#include <security/mac_framework.h>
151#endif
152
153/* for entitlement check */
154#include <IOKit/IOBSD.h>
155/*
156 * If you need accounting for KM_SELECT consider using
157 * KALLOC_HEAP_DEFINE to define a view.
158 */
159#define KM_SELECT KHEAP_DEFAULT
160
161/* XXX should be in a header file somewhere */
162extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
163
164int rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval);
165int wr_uio(struct proc *p, int fdes, uio_t uio, int is_pwritev, user_ssize_t *retval);
166int do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval);
167
168__private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
169 user_addr_t bufp, user_size_t nbyte,
170 off_t offset, int flags, user_ssize_t *retval);
171__private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
172 user_addr_t bufp, user_size_t nbyte,
173 off_t offset, int flags, user_ssize_t *retval);
174static int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
175
176/* Conflict wait queue for when selects collide (opaque type) */
177struct waitq select_conflict_queue;
178
179/*
180 * Init routine called from bsd_init.c
181 */
182void select_waitq_init(void);
183void
184select_waitq_init(void)
185{
186 waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
187}
188
189#define f_flag fp_glob->fg_flag
190#define f_type fp_glob->fg_ops->fo_type
191#define f_cred fp_glob->fg_cred
192#define f_ops fp_glob->fg_ops
193#define f_data fp_glob->fg_data
194
195/*
196 * Read system call.
197 *
198 * Returns: 0 Success
199 * preparefileread:EBADF
200 * preparefileread:ESPIPE
201 * preparefileread:ENXIO
202 * preparefileread:EBADF
203 * dofileread:???
204 */
205int
206read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
207{
208 __pthread_testcancel(1);
209 return read_nocancel(p, (struct read_nocancel_args *)uap, retval);
210}
211
212int
213read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
214{
215 struct fileproc *fp;
216 int error;
217 int fd = uap->fd;
218 struct vfs_context context;
219
220 if ((error = preparefileread(p, &fp, fd, 0))) {
221 return error;
222 }
223
224 context = *(vfs_context_current());
225 context.vc_ucred = fp->fp_glob->fg_cred;
226
227 error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
228 (off_t)-1, 0, retval);
229
230 fp_drop(p, fd, fp, 0);
231
232 return error;
233}
234
235/*
236 * Pread system call
237 *
238 * Returns: 0 Success
239 * preparefileread:EBADF
240 * preparefileread:ESPIPE
241 * preparefileread:ENXIO
242 * preparefileread:EBADF
243 * dofileread:???
244 */
245int
246pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
247{
248 __pthread_testcancel(1);
249 return pread_nocancel(p, (struct pread_nocancel_args *)uap, retval);
250}
251
252int
253pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
254{
255 struct fileproc *fp = NULL; /* fp set by preparefileread() */
256 int fd = uap->fd;
257 int error;
258 struct vfs_context context;
259
260 if ((error = preparefileread(p, &fp, fd, 1))) {
261 goto out;
262 }
263
264 context = *(vfs_context_current());
265 context.vc_ucred = fp->fp_glob->fg_cred;
266
267 error = dofileread(&context, fp, uap->buf, uap->nbyte,
268 uap->offset, FOF_OFFSET, retval);
269
270 fp_drop(p, fd, fp, 0);
271
272 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
273 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
274
275out:
276 return error;
277}
278
279/*
280 * Code common for read and pread
281 */
282
283/*
284 * Returns: 0 Success
285 * EBADF
286 * ESPIPE
287 * ENXIO
288 * fp_lookup:EBADF
289 */
290static int
291preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
292{
293 vnode_t vp;
294 int error;
295 struct fileproc *fp;
296
297 AUDIT_ARG(fd, fd);
298
299 proc_fdlock_spin(p);
300
301 error = fp_lookup(p, fd, &fp, 1);
302
303 if (error) {
304 proc_fdunlock(p);
305 return error;
306 }
307 if ((fp->f_flag & FREAD) == 0) {
308 error = EBADF;
309 goto out;
310 }
311 if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
312 error = ESPIPE;
313 goto out;
314 }
315 if (fp->f_type == DTYPE_VNODE) {
316 vp = (struct vnode *)fp->fp_glob->fg_data;
317
318 if (check_for_pread && (vnode_isfifo(vp))) {
319 error = ESPIPE;
320 goto out;
321 }
322 if (check_for_pread && (vp->v_flag & VISTTY)) {
323 error = ENXIO;
324 goto out;
325 }
326 }
327
328 *fp_ret = fp;
329
330 proc_fdunlock(p);
331 return 0;
332
333out:
334 fp_drop(p, fd, fp, 1);
335 proc_fdunlock(p);
336 return error;
337}
338
339
340/*
341 * Returns: 0 Success
342 * EINVAL
343 * fo_read:???
344 */
345__private_extern__ int
346dofileread(vfs_context_t ctx, struct fileproc *fp,
347 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
348 user_ssize_t *retval)
349{
350 uio_t auio;
351 user_ssize_t bytecnt;
352 int error = 0;
353 char uio_buf[UIO_SIZEOF(1)];
354
355 if (nbyte > INT_MAX) {
356 return EINVAL;
357 }
358
359 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
360 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
361 &uio_buf[0], sizeof(uio_buf));
362 } else {
363 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
364 &uio_buf[0], sizeof(uio_buf));
365 }
366 if (uio_addiov(auio, bufp, nbyte) != 0) {
367 *retval = 0;
368 return EINVAL;
369 }
370
371 bytecnt = nbyte;
372
373 if ((error = fo_read(fp, auio, flags, ctx))) {
374 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
375 error == EINTR || error == EWOULDBLOCK)) {
376 error = 0;
377 }
378 }
379 bytecnt -= uio_resid(auio);
380
381 *retval = bytecnt;
382
383 return error;
384}
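/*
 * Illustrative note (editorial addition, not part of the original source):
 * dofileread() above suppresses ERESTART/EINTR/EWOULDBLOCK when some bytes
 * were already transferred, so user space sees a short read rather than an
 * error. A minimal user-space sketch of the retry loop a caller might use;
 * the function and variable names here are hypothetical.
 */
#if 0
#include <unistd.h>
#include <errno.h>

static ssize_t
read_fully(int fd, char *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = read(fd, buf + done, len - done);
		if (n == 0) {
			break;                  /* EOF */
		}
		if (n < 0) {
			if (errno == EINTR) {
				continue;       /* interrupted before any bytes moved */
			}
			return -1;
		}
		done += (size_t)n;              /* short read: partial count returned */
	}
	return (ssize_t)done;
}
#endif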
385
386/*
387 * Vector read.
388 *
389 * Returns: 0 Success
390 * EINVAL
391 * ENOMEM
392 * preparefileread:EBADF
393 * preparefileread:ESPIPE
394 * preparefileread:ENXIO
395 * preparefileread:EBADF
396 * copyin:EFAULT
397 * rd_uio:???
398 */
399static int
400readv_preadv_uio(struct proc *p, int fdes,
401 user_addr_t user_iovp, int iovcnt, off_t offset, int is_preadv,
402 user_ssize_t *retval)
403{
404 uio_t auio = NULL;
405 int error;
406 struct user_iovec *iovp;
407
408 /* Verify range before calling uio_create() */
409 if (iovcnt <= 0 || iovcnt > UIO_MAXIOV) {
410 return EINVAL;
411 }
412
413 /* allocate a uio large enough to hold the number of iovecs passed */
414 auio = uio_create(iovcnt, offset,
415 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
416 UIO_READ);
417
418 /* get location of iovecs within the uio. then copyin the iovecs from
419 * user space.
420 */
421 iovp = uio_iovsaddr(auio);
422 if (iovp == NULL) {
423 error = ENOMEM;
424 goto ExitThisRoutine;
425 }
426 error = copyin_user_iovec_array(user_iovp,
427 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
428 iovcnt, iovp);
429 if (error) {
430 goto ExitThisRoutine;
431 }
432
433 /* finalize uio_t for use and do the IO
434 */
435 error = uio_calculateresid(auio);
436 if (error) {
437 goto ExitThisRoutine;
438 }
439 error = rd_uio(p, fdes, auio, is_preadv, retval);
440
441ExitThisRoutine:
442 if (auio != NULL) {
443 uio_free(auio);
444 }
445 return error;
446}
447
448/*
449 * Scatter read system call.
450 */
451int
452readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
453{
454 __pthread_testcancel(1);
455 return readv_nocancel(p, (struct readv_nocancel_args *)uap, retval);
456}
457
458int
459readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
460{
461 return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
462}
463
464/*
465 * Preadv system call
466 */
467int
468sys_preadv(struct proc *p, struct preadv_args *uap, user_ssize_t *retval)
469{
470 __pthread_testcancel(1);
471 return sys_preadv_nocancel(p, (struct preadv_nocancel_args *)uap, retval);
472}
473
474int
475sys_preadv_nocancel(struct proc *p, struct preadv_nocancel_args *uap, user_ssize_t *retval)
476{
477 return readv_preadv_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
478}
479
480/*
481 * Write system call
482 *
483 * Returns: 0 Success
484 * EBADF
485 * fp_lookup:EBADF
486 * dofilewrite:???
487 */
488int
489write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
490{
491 __pthread_testcancel(1);
492 return write_nocancel(p, (struct write_nocancel_args *)uap, retval);
493}
494
495int
496write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
497{
498 struct fileproc *fp;
499 int error;
500 int fd = uap->fd;
501
502 AUDIT_ARG(fd, fd);
503
504 error = fp_lookup(p, fd, &fp, 0);
505 if (error) {
506 return error;
507 }
508 if ((fp->f_flag & FWRITE) == 0) {
509 error = EBADF;
510 } else if (fp_isguarded(fp, GUARD_WRITE)) {
511 proc_fdlock(p);
512 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
513 proc_fdunlock(p);
514 } else {
515 struct vfs_context context = *(vfs_context_current());
516 context.vc_ucred = fp->fp_glob->fg_cred;
517
518 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
519 (off_t)-1, 0, retval);
520 }
521 fp_drop(p, fd, fp, 0);
522 return error;
523}
524
525/*
526 * pwrite system call
527 *
528 * Returns: 0 Success
529 * EBADF
530 * ESPIPE
531 * ENXIO
532 * EINVAL
533 * fp_lookup:EBADF
534 * dofilewrite:???
535 */
536int
537pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
538{
539 __pthread_testcancel(1);
540 return pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval);
541}
542
543int
544pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
545{
546 struct fileproc *fp;
547 int error;
548 int fd = uap->fd;
549 vnode_t vp = (vnode_t)0;
550
551 AUDIT_ARG(fd, fd);
552
553 error = fp_get_ftype(p, fd, DTYPE_VNODE, ESPIPE, &fp);
554 if (error) {
555 return error;
556 }
557
558 if ((fp->f_flag & FWRITE) == 0) {
559 error = EBADF;
560 } else if (fp_isguarded(fp, GUARD_WRITE)) {
561 proc_fdlock(p);
562 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
563 proc_fdunlock(p);
564 } else {
565 struct vfs_context context = *vfs_context_current();
566 context.vc_ucred = fp->fp_glob->fg_cred;
567
568 vp = (vnode_t)fp->fp_glob->fg_data;
569 if (vnode_isfifo(vp)) {
570 error = ESPIPE;
571 goto errout;
572 }
573 if ((vp->v_flag & VISTTY)) {
574 error = ENXIO;
575 goto errout;
576 }
577 if (uap->offset == (off_t)-1) {
578 error = EINVAL;
579 goto errout;
580 }
581
582 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
583 uap->offset, FOF_OFFSET, retval);
584 }
585errout:
586 fp_drop(p, fd, fp, 0);
587
588 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
589 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
590
591 return error;
592}
593
594/*
595 * Returns: 0 Success
596 * EINVAL
597 * <fo_write>:EPIPE
598 * <fo_write>:??? [indirect through struct fileops]
599 */
600__private_extern__ int
601dofilewrite(vfs_context_t ctx, struct fileproc *fp,
602 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
603 user_ssize_t *retval)
604{
605 uio_t auio;
606 int error = 0;
607 user_ssize_t bytecnt;
608 char uio_buf[UIO_SIZEOF(1)];
609
610 if (nbyte > INT_MAX) {
611 *retval = 0;
612 return EINVAL;
613 }
614
615 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
616 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
617 &uio_buf[0], sizeof(uio_buf));
618 } else {
619 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
620 &uio_buf[0], sizeof(uio_buf));
621 }
622 if (uio_addiov(auio, bufp, nbyte) != 0) {
623 *retval = 0;
624 return EINVAL;
625 }
626
627 bytecnt = nbyte;
628 if ((error = fo_write(fp, auio, flags, ctx))) {
629 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
630 error == EINTR || error == EWOULDBLOCK)) {
631 error = 0;
632 }
633 /* The socket layer handles SIGPIPE */
634 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
635 (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
636 /* XXX Raise the signal on the thread? */
637 psignal(vfs_context_proc(ctx), SIGPIPE);
638 }
639 }
640 bytecnt -= uio_resid(auio);
641 if (bytecnt) {
642 os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
643 }
644 *retval = bytecnt;
645
646 return error;
647}
648
649/*
650 * Returns: 0 Success
651 * EBADF
652 * ESPIPE
653 * ENXIO
654 * fp_lookup:EBADF
655 * fp_guard_exception:???
656 */
657static int
658preparefilewrite(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pwrite)
659{
660 vnode_t vp;
661 int error;
662 struct fileproc *fp;
663
664 AUDIT_ARG(fd, fd);
665
666 proc_fdlock_spin(p);
667
668 error = fp_lookup(p, fd, &fp, 1);
669
670 if (error) {
671 proc_fdunlock(p);
672 return error;
673 }
674 if ((fp->f_flag & FWRITE) == 0) {
675 error = EBADF;
676 goto ExitThisRoutine;
677 }
678 if (fp_isguarded(fp, GUARD_WRITE)) {
679 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
680 goto ExitThisRoutine;
681 }
682 if (check_for_pwrite) {
683 if (fp->f_type != DTYPE_VNODE) {
684 error = ESPIPE;
685 goto ExitThisRoutine;
686 }
687
688 vp = (vnode_t)fp->fp_glob->fg_data;
689 if (vnode_isfifo(vp)) {
690 error = ESPIPE;
691 goto ExitThisRoutine;
692 }
693 if ((vp->v_flag & VISTTY)) {
694 error = ENXIO;
695 goto ExitThisRoutine;
696 }
697 }
698
699 *fp_ret = fp;
700
701 proc_fdunlock(p);
702 return 0;
703
704ExitThisRoutine:
705 fp_drop(p, fd, fp, 1);
706 proc_fdunlock(p);
707 return error;
708}
709
710static int
711writev_prwritev_uio(struct proc *p, int fd,
712 user_addr_t user_iovp, int iovcnt, off_t offset, int is_pwritev,
713 user_ssize_t *retval)
714{
715 uio_t auio = NULL;
716 int error;
717 struct user_iovec *iovp;
718
719 /* Verify range before calling uio_create() */
720 if (iovcnt <= 0 || iovcnt > UIO_MAXIOV || offset < 0) {
721 return EINVAL;
722 }
723
724 /* allocate a uio large enough to hold the number of iovecs passed */
725 auio = uio_create(iovcnt, offset,
726 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
727 UIO_WRITE);
728
729 /* get location of iovecs within the uio. then copyin the iovecs from
730 * user space.
731 */
732 iovp = uio_iovsaddr(auio);
733 if (iovp == NULL) {
734 error = ENOMEM;
735 goto ExitThisRoutine;
736 }
737 error = copyin_user_iovec_array(user_iovp,
738 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
739 iovcnt, iovp);
740 if (error) {
741 goto ExitThisRoutine;
742 }
743
744 /* finalize uio_t for use and do the IO
745 */
746 error = uio_calculateresid(auio);
747 if (error) {
748 goto ExitThisRoutine;
749 }
750
751 error = wr_uio(p, fd, auio, is_pwritev, retval);
752
753ExitThisRoutine:
754 if (auio != NULL) {
755 uio_free(auio);
756 }
757 return error;
758}
759
760/*
761 * Gather write system call
762 */
763int
764writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
765{
766 __pthread_testcancel(1);
767 return writev_nocancel(p, (struct writev_nocancel_args *)uap, retval);
768}
769
770int
771writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
772{
773 return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, 0, 0, retval);
774}
775
776/*
777 * Pwritev system call
778 */
779int
780sys_pwritev(struct proc *p, struct pwritev_args *uap, user_ssize_t *retval)
781{
782 __pthread_testcancel(1);
783 return sys_pwritev_nocancel(p, (struct pwritev_nocancel_args *)uap, retval);
784}
785
786int
787sys_pwritev_nocancel(struct proc *p, struct pwritev_nocancel_args *uap, user_ssize_t *retval)
788{
789 return writev_prwritev_uio(p, uap->fd, uap->iovp, uap->iovcnt, uap->offset, 1, retval);
790}
791
792/*
793 * Returns: 0 Success
794 * preparefileread:EBADF
795 * preparefileread:ESPIPE
796 * preparefileread:ENXIO
797 * preparefileread:???
798 * fo_write:???
799 */
800int
801wr_uio(struct proc *p, int fd, uio_t uio, int is_pwritev, user_ssize_t *retval)
802{
803 struct fileproc *fp;
804 int error;
805 int flags;
806
807 if ((error = preparefilewrite(p, &fp, fd, is_pwritev))) {
808 return error;
809 }
810
811 flags = is_pwritev ? FOF_OFFSET : 0;
812 error = do_uiowrite(p, fp, uio, flags, retval);
813
814 fp_drop(p, fd, fp, 0);
815
816 return error;
817}
818
819int
820do_uiowrite(struct proc *p, struct fileproc *fp, uio_t uio, int flags, user_ssize_t *retval)
821{
822 int error;
823 user_ssize_t count;
824 struct vfs_context context = *vfs_context_current();
825
826 count = uio_resid(uio);
827
828 context.vc_ucred = fp->f_cred;
829 error = fo_write(fp, uio, flags, &context);
830 if (error) {
831 if (uio_resid(uio) != count && (error == ERESTART ||
832 error == EINTR || error == EWOULDBLOCK)) {
833 error = 0;
834 }
835 /* The socket layer handles SIGPIPE */
836 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
837 (fp->fp_glob->fg_lflags & FG_NOSIGPIPE) == 0) {
838 psignal(p, SIGPIPE);
839 }
840 }
841 count -= uio_resid(uio);
842 if (count) {
843 os_atomic_or(&fp->fp_glob->fg_flag, FWASWRITTEN, relaxed);
844 }
845 *retval = count;
846
847 return error;
848}
849
850/*
851 * Returns: 0 Success
852 * preparefileread:EBADF
853 * preparefileread:ESPIPE
854 * preparefileread:ENXIO
855 * fo_read:???
856 */
857int
858rd_uio(struct proc *p, int fdes, uio_t uio, int is_preadv, user_ssize_t *retval)
859{
860 struct fileproc *fp;
861 int error;
862 user_ssize_t count;
863 struct vfs_context context = *vfs_context_current();
864
865 if ((error = preparefileread(p, &fp, fdes, is_preadv))) {
866 return error;
867 }
868
869 count = uio_resid(uio);
870
871 context.vc_ucred = fp->f_cred;
872
873 int flags = is_preadv ? FOF_OFFSET : 0;
874 error = fo_read(fp, uio, flags, &context);
875
876 if (error) {
877 if (uio_resid(uio) != count && (error == ERESTART ||
878 error == EINTR || error == EWOULDBLOCK)) {
879 error = 0;
880 }
881 }
882 *retval = count - uio_resid(uio);
883
884 fp_drop(p, fdes, fp, 0);
885
886 return error;
887}
888
889/*
890 * Ioctl system call
891 *
892 * Returns: 0 Success
893 * EBADF
894 * ENOTTY
895 * ENOMEM
896 * ESRCH
897 * copyin:EFAULT
 898 * copyout:EFAULT
899 * fp_lookup:EBADF Bad file descriptor
900 * fo_ioctl:???
901 */
902int
903ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
904{
905 struct fileproc *fp = NULL;
906 int error = 0;
907 u_int size = 0;
908 caddr_t datap = NULL, memp = NULL;
909 boolean_t is64bit = FALSE;
910 int tmp = 0;
911#define STK_PARAMS 128
912 char stkbuf[STK_PARAMS] = {};
913 int fd = uap->fd;
914 u_long com = uap->com;
915 struct vfs_context context = *vfs_context_current();
916
917 AUDIT_ARG(fd, uap->fd);
918 AUDIT_ARG(addr, uap->data);
919
920 is64bit = proc_is64bit(p);
921#if CONFIG_AUDIT
922 if (is64bit) {
923 AUDIT_ARG(value64, com);
924 } else {
925 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
926 }
927#endif /* CONFIG_AUDIT */
928
929 /*
930 * Interpret high order word to find amount of data to be
931 * copied to/from the user's address space.
932 */
933 size = IOCPARM_LEN(com);
934 if (size > IOCPARM_MAX) {
935 return ENOTTY;
936 }
937 if (size > sizeof(stkbuf)) {
938 memp = (caddr_t)kheap_alloc(KHEAP_TEMP, size, Z_WAITOK);
939 if (memp == 0) {
940 return ENOMEM;
941 }
942 datap = memp;
943 } else {
944 datap = &stkbuf[0];
945 }
946 if (com & IOC_IN) {
947 if (size) {
948 error = copyin(uap->data, datap, size);
949 if (error) {
950 goto out_nofp;
951 }
952 } else {
 953 /* XXX - IOC_IN and no size? we should probably return an error here!! */
954 if (is64bit) {
955 *(user_addr_t *)datap = uap->data;
956 } else {
957 *(uint32_t *)datap = (uint32_t)uap->data;
958 }
959 }
960 } else if ((com & IOC_OUT) && size) {
961 /*
962 * Zero the buffer so the user always
963 * gets back something deterministic.
964 */
965 bzero(datap, size);
966 } else if (com & IOC_VOID) {
967 /* XXX - this is odd since IOC_VOID means no parameters */
968 if (is64bit) {
969 *(user_addr_t *)datap = uap->data;
970 } else {
971 *(uint32_t *)datap = (uint32_t)uap->data;
972 }
973 }
974
975 proc_fdlock(p);
976 error = fp_lookup(p, fd, &fp, 1);
977 if (error) {
978 proc_fdunlock(p);
979 goto out_nofp;
980 }
981
982 AUDIT_ARG(file, p, fp);
983
984 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
985 error = EBADF;
986 goto out;
987 }
988
989 context.vc_ucred = fp->fp_glob->fg_cred;
990
991#if CONFIG_MACF
992 error = mac_file_check_ioctl(context.vc_ucred, fp->fp_glob, com);
993 if (error) {
994 goto out;
995 }
996#endif
997
998 switch (com) {
999 case FIONCLEX:
1000 *fdflags(p, fd) &= ~UF_EXCLOSE;
1001 break;
1002
1003 case FIOCLEX:
1004 *fdflags(p, fd) |= UF_EXCLOSE;
1005 break;
1006
1007 case FIONBIO:
1008 // FIXME (rdar://54898652)
1009 //
1010 // this code is broken if fcntl(F_SETFL) and ioctl() are
1011 // called concurrently for the same fileglob.
1012 if ((tmp = *(int *)datap)) {
1013 os_atomic_or(&fp->f_flag, FNONBLOCK, relaxed);
1014 } else {
1015 os_atomic_andnot(&fp->f_flag, FNONBLOCK, relaxed);
1016 }
1017 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
1018 break;
1019
1020 case FIOASYNC:
1021 // FIXME (rdar://54898652)
1022 //
1023 // this code is broken if fcntl(F_SETFL) and ioctl() are
1024 // called concurrently for the same fileglob.
1025 if ((tmp = *(int *)datap)) {
1026 os_atomic_or(&fp->f_flag, FASYNC, relaxed);
1027 } else {
1028 os_atomic_andnot(&fp->f_flag, FASYNC, relaxed);
1029 }
1030 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
1031 break;
1032
1033 case FIOSETOWN:
1034 tmp = *(int *)datap;
1035 if (fp->f_type == DTYPE_SOCKET) {
1036 ((struct socket *)fp->f_data)->so_pgid = tmp;
1037 break;
1038 }
1039 if (fp->f_type == DTYPE_PIPE) {
1040 error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
1041 break;
1042 }
1043 if (tmp <= 0) {
1044 tmp = -tmp;
1045 } else {
1046 struct proc *p1 = proc_find(tmp);
1047 if (p1 == 0) {
1048 error = ESRCH;
1049 break;
1050 }
1051 tmp = p1->p_pgrpid;
1052 proc_rele(p1);
1053 }
1054 error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
1055 break;
1056
1057 case FIOGETOWN:
1058 if (fp->f_type == DTYPE_SOCKET) {
1059 *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
1060 break;
1061 }
1062 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
1063 *(int *)datap = -*(int *)datap;
1064 break;
1065
1066 default:
1067 error = fo_ioctl(fp, com, datap, &context);
1068 /*
1069 * Copy any data to user, size was
1070 * already set and checked above.
1071 */
1072 if (error == 0 && (com & IOC_OUT) && size) {
1073 error = copyout(datap, uap->data, (u_int)size);
1074 }
1075 break;
1076 }
1077out:
1078 fp_drop(p, fd, fp, 1);
1079 proc_fdunlock(p);
1080
1081out_nofp:
1082 if (memp) {
1083 kheap_free(KHEAP_TEMP, memp, size);
1084 }
1085 return error;
1086}
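/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * how the command word decoded by ioctl() above is constructed. _IOR/_IOW
 * from <sys/ioccom.h> pack the direction bits (IOC_IN/IOC_OUT) and the
 * argument size into the high bits of 'com', which is exactly what
 * IOCPARM_LEN() and the IOC_IN/IOC_OUT tests rely on. The 'example'
 * group letter, numbers, and struct are hypothetical.
 */
#if 0
#include <sys/ioccom.h>

struct example_arg {
	int value;
};

#define EXAMPLE_SET	_IOW('e', 1, struct example_arg)	/* IOC_IN: copyin before fo_ioctl */
#define EXAMPLE_GET	_IOR('e', 2, struct example_arg)	/* IOC_OUT: copyout after fo_ioctl */

/* IOCPARM_LEN(EXAMPLE_SET) == sizeof(struct example_arg)           */
/* (EXAMPLE_SET & IOC_IN) != 0 and (EXAMPLE_GET & IOC_OUT) != 0     */
#endif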
1087
1088int selwait, nselcoll;
1089#define SEL_FIRSTPASS 1
1090#define SEL_SECONDPASS 2
1091extern int selcontinue(int error);
1092extern int selprocess(int error, int sel_pass);
1093static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
1094 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
1095static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
1096static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup);
1097static int seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim);
1098static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
1099
1100/*
1101 * Select system call.
1102 *
1103 * Returns: 0 Success
1104 * EINVAL Invalid argument
1105 * EAGAIN Nonconformant error if allocation fails
1106 */
1107int
1108select(struct proc *p, struct select_args *uap, int32_t *retval)
1109{
1110 __pthread_testcancel(1);
1111 return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
1112}
1113
1114int
1115select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
1116{
1117 uint64_t timeout = 0;
1118
1119 if (uap->tv) {
1120 int err;
1121 struct timeval atv;
1122 if (IS_64BIT_PROCESS(p)) {
1123 struct user64_timeval atv64;
1124 err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1125 /* Loses resolution - assume timeout < 68 years */
1126 atv.tv_sec = (__darwin_time_t)atv64.tv_sec;
1127 atv.tv_usec = atv64.tv_usec;
1128 } else {
1129 struct user32_timeval atv32;
1130 err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
1131 atv.tv_sec = atv32.tv_sec;
1132 atv.tv_usec = atv32.tv_usec;
1133 }
1134 if (err) {
1135 return err;
1136 }
1137
1138 if (itimerfix(&atv)) {
1139 err = EINVAL;
1140 return err;
1141 }
1142
1143 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
1144 }
1145
1146 return select_internal(p, uap, timeout, retval);
1147}
1148
1149int
1150pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
1151{
1152 __pthread_testcancel(1);
1153 return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
1154}
1155
1156int
1157pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
1158{
1159 int err;
1160 struct uthread *ut;
1161 uint64_t timeout = 0;
1162
1163 if (uap->ts) {
1164 struct timespec ts;
1165
1166 if (IS_64BIT_PROCESS(p)) {
1167 struct user64_timespec ts64;
1168 err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
1169 ts.tv_sec = (__darwin_time_t)ts64.tv_sec;
1170 ts.tv_nsec = (long)ts64.tv_nsec;
1171 } else {
1172 struct user32_timespec ts32;
1173 err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
1174 ts.tv_sec = ts32.tv_sec;
1175 ts.tv_nsec = ts32.tv_nsec;
1176 }
1177 if (err) {
1178 return err;
1179 }
1180
1181 if (!timespec_is_valid(&ts)) {
1182 return EINVAL;
1183 }
1184 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
1185 }
1186
1187 ut = get_bsdthread_info(current_thread());
1188
1189 if (uap->mask != USER_ADDR_NULL) {
1190 /* save current mask, then copyin and set new mask */
1191 sigset_t newset;
1192 err = copyin(uap->mask, &newset, sizeof(sigset_t));
1193 if (err) {
1194 return err;
1195 }
1196 ut->uu_oldmask = ut->uu_sigmask;
1197 ut->uu_flag |= UT_SAS_OLDMASK;
1198 ut->uu_sigmask = (newset & ~sigcantmask);
1199 }
1200
1201 err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
1202
1203 if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
1204 /*
1205 * Restore old mask (direct return case). NOTE: EINTR can also be returned
1206 * if the thread is cancelled. In that case, we don't reset the signal
1207 * mask to its original value (which usually happens in the signal
1208 * delivery path). This behavior is permitted by POSIX.
1209 */
1210 ut->uu_sigmask = ut->uu_oldmask;
1211 ut->uu_oldmask = 0;
1212 ut->uu_flag &= ~UT_SAS_OLDMASK;
1213 }
1214
1215 return err;
1216}
1217
1218void
1219select_cleanup_uthread(struct _select *sel)
1220{
1221 kheap_free(KHEAP_DATA_BUFFERS, sel->ibits, 2 * sel->nbytes);
1222 sel->ibits = sel->obits = NULL;
1223 sel->nbytes = 0;
1224}
1225
1226static int
1227select_grow_uthread_cache(struct _select *sel, uint32_t nbytes)
1228{
1229 uint32_t *buf;
1230
1231 buf = kheap_alloc(KHEAP_DATA_BUFFERS, 2 * nbytes, Z_WAITOK | Z_ZERO);
1232 if (buf) {
1233 select_cleanup_uthread(sel);
1234 sel->ibits = buf;
1235 sel->obits = buf + nbytes / sizeof(uint32_t);
1236 sel->nbytes = nbytes;
1237 return true;
1238 }
1239 return false;
1240}
1241
1242static void
1243select_bzero_uthread_cache(struct _select *sel)
1244{
1245 bzero(sel->ibits, sel->nbytes * 2);
1246}
1247
1248/*
1249 * Generic implementation of {,p}select. Care: we type-pun uap across the two
1250 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1251 * are identical. The 5th (timeout) argument points to different types, so we
1252 * unpack in the syscall-specific code, but the generic code still does a null
1253 * check on this argument to determine if a timeout was specified.
1254 */
1255static int
1256select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
1257{
1258 int error = 0;
1259 u_int ni, nw;
1260 thread_t th_act;
1261 struct uthread *uth;
1262 struct _select *sel;
1263 struct _select_data *seldata;
1264 int count = 0;
1265 size_t sz = 0;
1266
1267 th_act = current_thread();
1268 uth = get_bsdthread_info(th_act);
1269 sel = &uth->uu_select;
1270 seldata = &uth->uu_save.uus_select_data;
1271 *retval = 0;
1272
1273 seldata->args = uap;
1274 seldata->retval = retval;
1275 seldata->wqp = NULL;
1276 seldata->count = 0;
1277
1278 if (uap->nd < 0) {
1279 return EINVAL;
1280 }
1281
1282 /* select on thread of process that already called proc_exit() */
1283 if (p->p_fd == NULL) {
1284 return EBADF;
1285 }
1286
1287 if (uap->nd > p->p_fd->fd_nfiles) {
1288 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
1289 }
1290 nw = howmany(uap->nd, NFDBITS);
1291 ni = nw * sizeof(fd_mask);
1292
1293 /*
1294 * if the previously allocated space for the bits is smaller than
1295 * what is requested or no space has yet been allocated for this
1296 * thread, allocate enough space now.
1297 *
1298 * Note: If this allocation fails, select() will return EAGAIN; this
1299 * is the same thing poll() returns in a no-memory situation, but
1300 * it is not a POSIX-compliant error code for select().
1301 */
1302 if (sel->nbytes < (3 * ni)) {
1303 if (!select_grow_uthread_cache(sel, 3 * ni)) {
1304 return EAGAIN;
1305 }
1306 } else {
1307 select_bzero_uthread_cache(sel);
1308 }
1309
1310 /*
1311 * get the bits from the user address space
1312 */
1313#define getbits(name, x) \
1314 do { \
1315 if (uap->name && (error = copyin(uap->name, \
1316 (caddr_t)&sel->ibits[(x) * nw], ni))) \
1317 goto continuation; \
1318 } while (0)
1319
1320 getbits(in, 0);
1321 getbits(ou, 1);
1322 getbits(ex, 2);
1323#undef getbits
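	/*
	 * Worked example (editorial note, not part of the original source):
	 * ibits holds the three fd_sets back to back, each 'nw' words long.
	 * For uap->nd = 70 descriptors, nw = howmany(70, NFDBITS) = 3 and
	 * ni = 3 * sizeof(fd_mask) = 12 bytes per set. A descriptor such as
	 * fd 35 then lives in word 35 / NFDBITS = 1, bit 35 % NFDBITS = 3 of
	 * the relevant set, which is the same indexing selscan() uses below
	 * when it walks iptr/optr.
	 */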
1324
1325 seldata->abstime = timeout;
1326
1327 if ((error = selcount(p, sel->ibits, uap->nd, &count))) {
1328 goto continuation;
1329 }
1330
1331 /*
1332 * We need an array of waitq pointers. This is due to the new way
1333 * in which waitqs are linked to sets. When a thread selects on a
1334 * file descriptor, a waitq (embedded in a selinfo structure) is
1335 * added to the thread's local waitq set. There is no longer any
1336 * way to directly iterate over all members of a given waitq set.
1337 * The process of linking a waitq into a set may allocate a link
1338 * table object. Because we can't iterate over all the waitqs to
1339 * which our thread waitq set belongs, we need a way of removing
1340 * this link object!
1341 *
1342 * Thus we need a buffer which will hold one waitq pointer
1343 * per FD being selected. During the tear-down phase we can use
1344 * these pointers to dis-associate the underlying selinfo's waitq
1345 * from our thread's waitq set.
1346 *
1347 * Because we also need to allocate a waitq set for this thread,
1348 * we use a bare buffer pointer to hold all the memory. Note that
1349 * this memory is cached in the thread pointer and not reaped until
1350 * the thread exits. This is generally OK because threads that
1351 * call select tend to keep calling select repeatedly.
1352 */
1353 sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
1354 if (sz > uth->uu_wqstate_sz) {
1355 /* (re)allocate a buffer to hold waitq pointers */
1356 if (uth->uu_wqset) {
1357 if (waitq_set_is_valid(uth->uu_wqset)) {
1358 waitq_set_deinit(uth->uu_wqset);
1359 }
1360 kheap_free(KM_SELECT, uth->uu_wqset, uth->uu_wqstate_sz);
1361 } else if (uth->uu_wqstate_sz && !uth->uu_wqset) {
1362 panic("select: thread structure corrupt! "
1363 "uu_wqstate_sz:%ld, wqstate_buf == NULL",
1364 uth->uu_wqstate_sz);
1365 }
1366 uth->uu_wqstate_sz = sz;
1367 uth->uu_wqset = kheap_alloc(KM_SELECT, sz, Z_WAITOK);
1368 if (!uth->uu_wqset) {
1369 panic("can't allocate %ld bytes for wqstate buffer",
1370 uth->uu_wqstate_sz);
1371 }
1372 waitq_set_init(uth->uu_wqset,
1373 SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST, NULL, NULL);
1374 }
1375
1376 if (!waitq_set_is_valid(uth->uu_wqset)) {
1377 waitq_set_init(uth->uu_wqset,
1378 SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST, NULL, NULL);
1379 }
1380
1381 /* the last chunk of our buffer is an array of waitq pointers */
1382 seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
1383 bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
1384
1385 seldata->count = count;
1386
1387continuation:
1388
1389 if (error) {
1390 /*
1391 * We have already cleaned up any state we established,
1392 * either locally or as a result of selcount(). We don't
1393 * need to wait_subqueue_unlink_all(), since we haven't set
1394 * anything at this point.
1395 */
1396 return error;
1397 }
1398
1399 return selprocess(0, SEL_FIRSTPASS);
1400}
1401
1402int
1403selcontinue(int error)
1404{
1405 return selprocess(error, SEL_SECONDPASS);
1406}
1407
1408
1409/*
1410 * selprocess
1411 *
1412 * Parameters: error The error code from our caller
1413 * sel_pass The pass we are on
1414 */
1415int
1416selprocess(int error, int sel_pass)
1417{
1418 int ncoll;
1419 u_int ni, nw;
1420 thread_t th_act;
1421 struct uthread *uth;
1422 struct proc *p;
1423 struct select_nocancel_args *uap;
1424 int *retval;
1425 struct _select *sel;
1426 struct _select_data *seldata;
1427 int unwind = 1;
1428 int prepost = 0;
1429 int somewakeup = 0;
1430 int doretry = 0;
1431 wait_result_t wait_result;
1432
1433 p = current_proc();
1434 th_act = current_thread();
1435 uth = get_bsdthread_info(th_act);
1436 sel = &uth->uu_select;
1437 seldata = &uth->uu_save.uus_select_data;
1438 uap = seldata->args;
1439 retval = seldata->retval;
1440
1441 if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) {
1442 unwind = 0;
1443 }
1444 if (seldata->count == 0) {
1445 unwind = 0;
1446 }
1447retry:
1448 if (error != 0) {
1449 goto done;
1450 }
1451
1452 ncoll = nselcoll;
1453 OSBitOrAtomic(P_SELECT, &p->p_flag);
1454
1455 /* skip scans if the select is just for timeouts */
1456 if (seldata->count) {
1457 error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
1458 if (error || *retval) {
1459 goto done;
1460 }
1461 if (prepost || somewakeup) {
1462 /*
1463 * if the select was woken up, someone else may
1464 * already have read the data; go to select again
1465 * if time permits
1466 */
1467 prepost = 0;
1468 somewakeup = 0;
1469 doretry = 1;
1470 }
1471 }
1472
1473 if (uap->tv) {
1474 uint64_t now;
1475
1476 clock_get_uptime(&now);
1477 if (now >= seldata->abstime) {
1478 goto done;
1479 }
1480 }
1481
1482 if (doretry) {
1483 /* cleanup obits and try again */
1484 doretry = 0;
1485 sel_pass = SEL_FIRSTPASS;
1486 goto retry;
1487 }
1488
1489 /*
1490 * To effect a poll, the timeout argument should be
1491 * non-nil, pointing to a zero-valued timeval structure.
1492 */
1493 if (uap->tv && seldata->abstime == 0) {
1494 goto done;
1495 }
1496
1497 /* No spurious wakeups due to collisions, no need to check for them */
1498 if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1499 sel_pass = SEL_FIRSTPASS;
1500 goto retry;
1501 }
1502
1503 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1504
1505 /* if the select is just for timeout skip check */
1506 if (seldata->count && (sel_pass == SEL_SECONDPASS)) {
1507 panic("selprocess: 2nd pass assertwaiting");
1508 }
1509
1510 /* waitq_set has waitqueue as first element */
1511 wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
1512 NO_EVENT64, THREAD_ABORTSAFE,
1513 TIMEOUT_URGENCY_USER_NORMAL,
1514 seldata->abstime,
1515 TIMEOUT_NO_LEEWAY);
1516 if (wait_result != THREAD_AWAKENED) {
1517 /* there are no preposted events */
1518 error = tsleep1(NULL, PSOCK | PCATCH,
1519 "select", 0, selcontinue);
1520 } else {
1521 prepost = 1;
1522 error = 0;
1523 }
1524
1525 if (error == 0) {
1526 sel_pass = SEL_SECONDPASS;
1527 if (!prepost) {
1528 somewakeup = 1;
1529 }
1530 goto retry;
1531 }
1532done:
1533 if (unwind) {
1534 seldrop(p, sel->ibits, uap->nd, seldata->count);
1535 waitq_set_deinit(uth->uu_wqset);
1536 /*
1537 * zero out the waitq pointer array to avoid use-after free
1538 * errors in the selcount error path (seldrop_locked) if/when
1539 * the thread re-calls select().
1540 */
1541 bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
1542 }
1543 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1544 /* select is not restarted after signals... */
1545 if (error == ERESTART) {
1546 error = EINTR;
1547 }
1548 if (error == EWOULDBLOCK) {
1549 error = 0;
1550 }
1551 nw = howmany(uap->nd, NFDBITS);
1552 ni = nw * sizeof(fd_mask);
1553
1554#define putbits(name, x) \
1555 do { \
1556 if (uap->name && (error2 = \
1557 copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1558 error = error2; \
1559 } while (0)
1560
1561 if (error == 0) {
1562 int error2;
1563
1564 putbits(in, 0);
1565 putbits(ou, 1);
1566 putbits(ex, 2);
1567#undef putbits
1568 }
1569
1570 if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
1571 /* restore signal mask - continuation case */
1572 uth->uu_sigmask = uth->uu_oldmask;
1573 uth->uu_oldmask = 0;
1574 uth->uu_flag &= ~UT_SAS_OLDMASK;
1575 }
1576
1577 return error;
1578}
1579
1580
1581/**
1582 * remove the fileproc's underlying waitq from the supplied waitq set;
1583 * clear FP_INSELECT when appropriate
1584 *
1585 * Parameters:
1586 * fp File proc that is potentially currently in select
1587 * wqset Waitq set to which the fileproc may belong
1588 * (usually this is the thread's private waitq set)
1589 * Conditions:
1590 * proc_fdlock is held
1591 */
1592static void
1593selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
1594{
1595 int valid_set = waitq_set_is_valid(wqset);
1596 int valid_q = !!wqp_id;
1597
1598 /*
1599 * This could be called (from selcount error path) before we setup
1600 * the thread's wqset. Check the wqset passed in, and only unlink if
1601 * the set is valid.
1602 */
1603
1604 /* unlink the underlying waitq from the input set (thread waitq set) */
1605 if (valid_q && valid_set) {
1606 waitq_unlink_by_prepost_id(wqp_id, wqset);
1607 }
1608
1609 /* allow passing an invalid fp for seldrop unwind */
1610 if (!(fp->fp_flags & (FP_INSELECT | FP_SELCONFLICT))) {
1611 return;
1612 }
1613
1614 /*
1615 * We can always remove the conflict queue from our thread's set: this
1616 * will not affect other threads that potentially need to be awoken on
1617 * the conflict queue during a fileproc_drain - those sets will still
1618 * be linked with the global conflict queue, and the last waiter
1619 * on the fp clears the CONFLICT marker.
1620 */
1621 if (valid_set && (fp->fp_flags & FP_SELCONFLICT)) {
1622 waitq_unlink(&select_conflict_queue, wqset);
1623 }
1624
1625 /* jca: TODO:
1626 * This isn't quite right - we don't actually know if this
1627 * fileproc is in another select or not! Here we just assume
1628 * that if we were the first thread to select on the FD, then
1629 * we'll be the one to clear this flag...
1630 */
1631 if (valid_set && fp->fp_wset == (void *)wqset) {
1632 fp->fp_flags &= ~FP_INSELECT;
1633 fp->fp_wset = NULL;
1634 }
1635}
1636
1637/**
1638 * connect a fileproc to the given wqset, potentially bridging to a waitq
1639 * pointed to indirectly by wq_data
1640 *
1641 * Parameters:
1642 * fp File proc potentially currently in select
1643 * wq_data Pointer to a pointer to a waitq (could be NULL)
1644 * wqset Waitq set to which the fileproc should now belong
1645 * (usually this is the thread's private waitq set)
1646 *
1647 * Conditions:
1648 * proc_fdlock is held
1649 */
1650static uint64_t
1651sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
1652{
1653 struct waitq *f_wq = NULL;
1654
1655 if ((fp->fp_flags & FP_INSELECT) != FP_INSELECT) {
1656 if (wq_data) {
1657 panic("non-null data:%p on fp:%p not in select?!"
1658 "(wqset:%p)", wq_data, fp, wqset);
1659 }
1660 return 0;
1661 }
1662
1663 if ((fp->fp_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
1664 waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
1665 }
1666
1667 /*
1668 * The wq_data parameter has potentially been set by selrecord called
1669 * from a subsystem's fo_select() function. If the subsystem does not
1670 * call selrecord, then wq_data will be NULL
1671 *
1672 * Use memcpy to get the value into a proper pointer because
1673 * wq_data most likely points to a stack variable that could be
1674 * unaligned on 32-bit systems.
1675 */
1676 if (wq_data) {
1677 memcpy(&f_wq, wq_data, sizeof(f_wq));
1678 if (!waitq_is_valid(f_wq)) {
1679 f_wq = NULL;
1680 }
1681 }
1682
1683 /* record the first thread's wqset in the fileproc structure */
1684 if (!fp->fp_wset) {
1685 fp->fp_wset = (void *)wqset;
1686 }
1687
1688 /* handles NULL f_wq */
1689 return waitq_get_prepost_id(f_wq);
1690}
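/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * the wq_data consumed by sellinkfp() above is the 'wql' cookie a driver
 * passes to selrecord() from its select entry point. A minimal, hypothetical
 * character-device select routine might look like this; names such as
 * example_softc, sc_rsel, data_ready and example_softc_for() are invented
 * for illustration.
 */
#if 0
#include <sys/select.h>

struct example_softc {
	struct selinfo	sc_rsel;	/* readers waiting in select/poll */
	int		data_ready;
};

static int
example_select(dev_t dev, int which, void *wql, struct proc *p)
{
	struct example_softc *sc = example_softc_for(dev);	/* hypothetical lookup */

	if ((which == FREAD) && sc->data_ready) {
		return 1;			/* selscan() sets the output bit */
	}
	/* Not ready: link this selinfo's waitq to the selecting thread. */
	selrecord(p, &sc->sc_rsel, wql);
	return 0;
}
/* When data later arrives, the driver would call selwakeup(&sc->sc_rsel). */
#endif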
1691
1692
1693/*
1694 * selscan
1695 *
1696 * Parameters: p Process performing the select
1697 * sel The per-thread select context structure
1698 * nfd The number of file descriptors to scan
1699 * retval The per thread system call return area
1700 * sel_pass Which pass this is; allowed values are
1701 * SEL_FIRSTPASS and SEL_SECONDPASS
1702 * wqset The per thread wait queue set
1703 *
1704 * Returns: 0 Success
1705 * EIO Invalid p->p_fd field XXX Obsolete?
1706 * EBADF One of the files in the bit vector is
1707 * invalid.
1708 */
1709static int
1710selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1711 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1712{
1713 struct filedesc *fdp = p->p_fd;
1714 int msk, i, j, fd;
1715 u_int32_t bits;
1716 struct fileproc *fp;
1717 int n = 0; /* count of bits */
1718 int nc = 0; /* bit vector offset (nc'th bit) */
1719 static int flag[3] = { FREAD, FWRITE, 0 };
1720 u_int32_t *iptr, *optr;
1721 u_int nw;
1722 u_int32_t *ibits, *obits;
1723 uint64_t reserved_link, *rl_ptr = NULL;
1724 int count;
1725 struct vfs_context context = *vfs_context_current();
1726
1727 /*
1728 * Problems seen at reboot, due to Mac OS X signal problems
1729 * in Beaker1C; verify that p->p_fd is valid
1730 */
1731 if (fdp == NULL) {
1732 *retval = 0;
1733 return EIO;
1734 }
1735 ibits = sel->ibits;
1736 obits = sel->obits;
1737
1738 nw = howmany(nfd, NFDBITS);
1739
1740 count = seldata->count;
1741
1742 nc = 0;
1743 if (!count) {
1744 *retval = 0;
1745 return 0;
1746 }
1747
1748 proc_fdlock(p);
1749 for (msk = 0; msk < 3; msk++) {
1750 iptr = (u_int32_t *)&ibits[msk * nw];
1751 optr = (u_int32_t *)&obits[msk * nw];
1752
1753 for (i = 0; i < nfd; i += NFDBITS) {
1754 bits = iptr[i / NFDBITS];
1755
1756 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1757 bits &= ~(1U << j);
1758
1759 fp = fp_get_noref_locked(p, fd);
1760 if (fp == NULL) {
1761 /*
1762 * If we abort because of a bad
1763 * fd, let the caller unwind...
1764 */
1765 proc_fdunlock(p);
1766 return EBADF;
1767 }
1768 if (sel_pass == SEL_SECONDPASS) {
1769 reserved_link = 0;
1770 rl_ptr = NULL;
1771 selunlinkfp(fp, seldata->wqp[nc], wqset);
1772 } else {
1773 reserved_link = waitq_link_reserve((struct waitq *)wqset);
1774 rl_ptr = &reserved_link;
1775 if (fp->fp_flags & FP_INSELECT) {
1776 /* someone is already in select on this fp */
1777 fp->fp_flags |= FP_SELCONFLICT;
1778 } else {
1779 fp->fp_flags |= FP_INSELECT;
1780 }
1781
1782 waitq_set_lazy_init_link(wqset);
1783 }
1784
1785 context.vc_ucred = fp->f_cred;
1786
1787 /*
1788 * stash this value b/c fo_select may replace
1789 * reserved_link with a pointer to a waitq object
1790 */
1791 uint64_t rsvd = reserved_link;
1792
1793 /* The select; set the bit, if true */
1794 if (fp->f_ops && fp->f_type
1795 && fo_select(fp, flag[msk], rl_ptr, &context)) {
1796 optr[fd / NFDBITS] |= (1U << (fd % NFDBITS));
1797 n++;
1798 }
1799 if (sel_pass == SEL_FIRSTPASS) {
1800 waitq_link_release(rsvd);
1801 /*
1802 * If the fp's supporting selinfo structure was linked
1803 * to this thread's waitq set, then 'reserved_link'
1804 * will have been updated by selrecord to be a pointer
1805 * to the selinfo's waitq.
1806 */
1807 if (reserved_link == rsvd) {
1808 rl_ptr = NULL; /* fo_select never called selrecord() */
1809 }
1810 /*
1811 * Hook up the thread's waitq set either to
1812 * the fileproc structure, or to the global
1813 * conflict queue: but only on the first
1814 * select pass.
1815 */
1816 seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1817 }
1818 nc++;
1819 }
1820 }
1821 }
1822 proc_fdunlock(p);
1823
1824 *retval = n;
1825 return 0;
1826}
1827
1828static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
1829
1830int
1831poll(struct proc *p, struct poll_args *uap, int32_t *retval)
1832{
1833 __pthread_testcancel(1);
1834 return poll_nocancel(p, (struct poll_nocancel_args *)uap, retval);
1835}
1836
1837
1838int
1839poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1840{
1841 struct pollfd *fds = NULL;
1842 struct kqueue *kq = NULL;
1843 int ncoll, error = 0;
1844 u_int nfds = uap->nfds;
1845 u_int rfds = 0;
1846 rlim_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE);
1847 size_t ni = nfds * sizeof(struct pollfd);
1848
1849 /*
1850 * This is kinda bogus. We have fd limits, but that is not
1851 * really related to the size of the pollfd array. Make sure
1852 * we let the process use at least FD_SETSIZE entries and at
1853 * least enough for the current limits. We want to be reasonably
1854 * safe, but not overly restrictive.
1855 */
1856 if (nfds > OPEN_MAX ||
1857 (nfds > nofile && (proc_suser(p) || nfds > FD_SETSIZE))) {
1858 return EINVAL;
1859 }
1860
1861 kq = kqueue_alloc(p);
1862 if (kq == NULL) {
1863 return EAGAIN;
1864 }
1865
1866 if (nfds) {
1867 fds = kheap_alloc(KHEAP_TEMP, ni, Z_WAITOK);
1868 if (NULL == fds) {
1869 error = EAGAIN;
1870 goto out;
1871 }
1872
1873 error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1874 if (error) {
1875 goto out;
1876 }
1877 }
1878
1879 /* JMM - all this P_SELECT stuff is bogus */
1880 ncoll = nselcoll;
1881 OSBitOrAtomic(P_SELECT, &p->p_flag);
1882 for (u_int i = 0; i < nfds; i++) {
1883 short events = fds[i].events;
1884 __assert_only int rc;
1885
1886 /* per spec, ignore fd values below zero */
1887 if (fds[i].fd < 0) {
1888 fds[i].revents = 0;
1889 continue;
1890 }
1891
1892 /* convert the poll event into a kqueue kevent */
1893 struct kevent_qos_s kev = {
1894 .ident = fds[i].fd,
1895 .flags = EV_ADD | EV_ONESHOT | EV_POLL,
1896 .udata = CAST_USER_ADDR_T(&fds[i])
1897 };
1898
1899 /* Handle input events */
1900 if (events & (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP)) {
1901 kev.filter = EVFILT_READ;
1902 if (events & (POLLPRI | POLLRDBAND)) {
1903 kev.flags |= EV_OOBAND;
1904 }
1905 rc = kevent_register(kq, &kev, NULL);
1906 assert((rc & FILTER_REGISTER_WAIT) == 0);
1907 }
1908
1909 /* Handle output events */
1910 if ((kev.flags & EV_ERROR) == 0 &&
1911 (events & (POLLOUT | POLLWRNORM | POLLWRBAND))) {
1912 kev.filter = EVFILT_WRITE;
1913 rc = kevent_register(kq, &kev, NULL);
1914 assert((rc & FILTER_REGISTER_WAIT) == 0);
1915 }
1916
1917 /* Handle BSD extension vnode events */
1918 if ((kev.flags & EV_ERROR) == 0 &&
1919 (events & (POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE))) {
1920 kev.filter = EVFILT_VNODE;
1921 kev.fflags = 0;
1922 if (events & POLLEXTEND) {
1923 kev.fflags |= NOTE_EXTEND;
1924 }
1925 if (events & POLLATTRIB) {
1926 kev.fflags |= NOTE_ATTRIB;
1927 }
1928 if (events & POLLNLINK) {
1929 kev.fflags |= NOTE_LINK;
1930 }
1931 if (events & POLLWRITE) {
1932 kev.fflags |= NOTE_WRITE;
1933 }
1934 rc = kevent_register(kq, &kev, NULL);
1935 assert((rc & FILTER_REGISTER_WAIT) == 0);
1936 }
1937
1938 if (kev.flags & EV_ERROR) {
1939 fds[i].revents = POLLNVAL;
1940 rfds++;
1941 } else {
1942 fds[i].revents = 0;
1943 }
1944 }
1945
1946 /*
1947 * Did we have any trouble registering?
1948 * If user space passed 0 FDs, then respect any timeout value passed.
1949 * This is an extremely inefficient sleep. If user space passed one or
1950 * more FDs, and we had trouble registering _all_ of them, then bail
1951 * out. If a subset of the provided FDs failed to register, then we
1952 * will still call the kqueue_scan function.
1953 */
1954 if (nfds && (rfds == nfds)) {
1955 goto done;
1956 }
1957
1958 /* scan for, and possibly wait for, the kevents to trigger */
1959 kevent_ctx_t kectx = kevent_get_context(current_thread());
1960 *kectx = (struct kevent_ctx_s){
1961 .kec_process_noutputs = rfds,
1962 .kec_process_flags = KEVENT_FLAG_POLL,
1963 .kec_deadline = 0, /* wait forever */
1964 };
1965
1966 /*
1967 * If any events have trouble registering, an event has fired and we
1968 * shouldn't wait for events in kqueue_scan.
1969 */
1970 if (rfds) {
1971 kectx->kec_process_flags |= KEVENT_FLAG_IMMEDIATE;
1972 } else if (uap->timeout != -1) {
1973 clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
1974 &kectx->kec_deadline);
1975 }
1976
1977 error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
1978 rfds = kectx->kec_process_noutputs;
1979
1980done:
1981 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1982 /* poll is not restarted after signals... */
1983 if (error == ERESTART) {
1984 error = EINTR;
1985 }
1986 if (error == 0) {
1987 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1988 *retval = rfds;
1989 }
1990
1991out:
1992 kheap_free(KHEAP_TEMP, fds, ni);
1993
1994 kqueue_dealloc(kq);
1995 return error;
1996}
1997
1998static int
1999poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
2000{
2001 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
2002 short prev_revents = fds->revents;
2003 short mask = 0;
2004
2005 /* convert the results back into revents */
2006 if (kevp->flags & EV_EOF) {
2007 fds->revents |= POLLHUP;
2008 }
2009 if (kevp->flags & EV_ERROR) {
2010 fds->revents |= POLLERR;
2011 }
2012
2013 switch (kevp->filter) {
2014 case EVFILT_READ:
2015 if (fds->revents & POLLHUP) {
2016 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND);
2017 } else {
2018 mask = (POLLIN | POLLRDNORM);
2019 if (kevp->flags & EV_OOBAND) {
2020 mask |= (POLLPRI | POLLRDBAND);
2021 }
2022 }
2023 fds->revents |= (fds->events & mask);
2024 break;
2025
2026 case EVFILT_WRITE:
2027 if (!(fds->revents & POLLHUP)) {
2028 fds->revents |= (fds->events & (POLLOUT | POLLWRNORM | POLLWRBAND));
2029 }
2030 break;
2031
2032 case EVFILT_VNODE:
2033 if (kevp->fflags & NOTE_EXTEND) {
2034 fds->revents |= (fds->events & POLLEXTEND);
2035 }
2036 if (kevp->fflags & NOTE_ATTRIB) {
2037 fds->revents |= (fds->events & POLLATTRIB);
2038 }
2039 if (kevp->fflags & NOTE_LINK) {
2040 fds->revents |= (fds->events & POLLNLINK);
2041 }
2042 if (kevp->fflags & NOTE_WRITE) {
2043 fds->revents |= (fds->events & POLLWRITE);
2044 }
2045 break;
2046 }
2047
2048 if (fds->revents != 0 && prev_revents == 0) {
2049 kectx->kec_process_noutputs++;
2050 }
2051
2052 return 0;
2053}
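/*
 * Illustrative user-space sketch (editorial addition, not part of the
 * original source): the events requested here are translated into
 * EVFILT_READ/EVFILT_WRITE kevents by poll_nocancel() above, and
 * poll_callback() folds the kevent results back into revents. The fd and
 * timeout values below are hypothetical.
 */
#if 0
#include <poll.h>

static int
wait_readable(int fd)
{
	struct pollfd pfd = {
		.fd = fd,
		.events = POLLIN | POLLPRI,
	};
	int n = poll(&pfd, 1, 1000);		/* wait up to 1000 ms */

	if (n > 0 && (pfd.revents & (POLLIN | POLLPRI))) {
		return 1;			/* readable (or OOB data pending) */
	}
	if (n > 0 && (pfd.revents & (POLLHUP | POLLERR | POLLNVAL))) {
		return -1;			/* peer hung up / error / bad fd */
	}
	return 0;				/* timed out */
}
#endif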
2054
2055int
2056seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
2057{
2058 return 1;
2059}
2060
2061/*
2062 * selcount
2063 *
2064 * Count the number of bits set in the input bit vector, and establish an
2065 * outstanding fp->fp_iocount for each of the descriptors which will be in
2066 * use in the select operation.
2067 *
2068 * Parameters: p The process doing the select
2069 * ibits The input bit vector
2070 * nfd The number of fd's in the vector
2071 * countp Pointer to where to store the bit count
2072 *
2073 * Returns: 0 Success
2074 * EIO Bad per process open file table
2075 * EBADF One of the bits in the input bit vector
2076 * references an invalid fd
2077 *
2078 * Implicit: *countp (modified) Count of fd's
2079 *
2080 * Notes: This function is the first pass under the proc_fdlock() that
2081 * permits us to recognize invalid descriptors in the bit vector;
2082	 * they may, however, not remain valid through the drop and
2083 * later reacquisition of the proc_fdlock().
2084 */
2085static int
2086selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
2087{
2088 struct filedesc *fdp = p->p_fd;
2089 int msk, i, j, fd;
2090 u_int32_t bits;
2091 struct fileproc *fp;
2092 int n = 0;
2093 u_int32_t *iptr;
2094 u_int nw;
2095 int error = 0;
2096 int need_wakeup = 0;
2097
2098 /*
2099	 * Problems at reboot due to MacOS X signal problems
2100	 * in Beaker1C; verify that p->p_fd is valid
2101 */
2102 if (fdp == NULL) {
2103 *countp = 0;
2104 return EIO;
2105 }
2106 nw = howmany(nfd, NFDBITS);
2107
2108 proc_fdlock(p);
2109 for (msk = 0; msk < 3; msk++) {
2110 iptr = (u_int32_t *)&ibits[msk * nw];
2111 for (i = 0; i < nfd; i += NFDBITS) {
2112 bits = iptr[i / NFDBITS];
2113 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2114 bits &= ~(1U << j);
2115
2116 fp = fp_get_noref_locked(p, fd);
2117 if (fp == NULL) {
2118 *countp = 0;
2119 error = EBADF;
2120 goto bad;
2121 }
2122 os_ref_retain_locked(&fp->fp_iocount);
2123 n++;
2124 }
2125 }
2126 }
2127 proc_fdunlock(p);
2128
2129 *countp = n;
2130 return 0;
2131
2132bad:
2133 if (n == 0) {
2134 goto out;
2135 }
2136 /* Ignore error return; it's already EBADF */
2137 (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup);
2138
2139out:
2140 proc_fdunlock(p);
2141 if (need_wakeup) {
2142 wakeup(&p->p_fpdrainwait);
2143 }
2144 return error;
2145}
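/*
 * The loops above walk the read/write/except bit vectors with ffs(),
 * visiting only the descriptors whose bits are set.  A minimal
 * standalone sketch of the same idiom (illustrative only, using the
 * plain libc ffs() and 32-bit words):
 *
 *	#include <strings.h>
 *	#include <stdint.h>
 *
 *	static void
 *	for_each_set_fd(const uint32_t *bits, int nfd, void (*fn)(int fd))
 *	{
 *		for (int i = 0; i < nfd; i += 32) {
 *			uint32_t w = bits[i / 32];
 *			int j;
 *
 *			while ((j = ffs(w)) != 0 && (i + --j) < nfd) {
 *				w &= ~(1U << j);
 *				fn(i + j);
 *			}
 *		}
 *	}
 */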
2146
2147
2148/*
2149 * seldrop_locked
2150 *
2151 * Drop outstanding wait queue references set up during selscan(); drop the
2152 * outstanding per fileproc fp_iocount picked up during the selcount().
2153 *
2154 * Parameters: p Process performing the select
2155	 * ibits			Input bit vector of fd's
2156 * nfd Number of fd's
2157 * lim Limit to number of vector entries to
2158 * consider, or -1 for "all"
2159 * inselect True if
2160 * need_wakeup Pointer to flag to set to do a wakeup
2161	 *				if fp_iocount on any descriptor goes to 0
2162 *
2163 * Returns: 0 Success
2164 * EBADF One or more fds in the bit vector
2165 * were invalid, but the rest
2166 * were successfully dropped
2167 *
2168	 * Notes:	An fd may become bad while the proc_fdlock() is not held,
2169 * if a multithreaded application closes the fd out from under
2170 * the in progress select. In this case, we still have to
2171 * clean up after the set up on the remaining fds.
2172 */
2173static int
2174seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup)
2175{
2176 struct filedesc *fdp = p->p_fd;
2177 int msk, i, j, nc, fd;
2178 u_int32_t bits;
2179 struct fileproc *fp;
2180 u_int32_t *iptr;
2181 u_int nw;
2182 int error = 0;
2183 uthread_t uth = get_bsdthread_info(current_thread());
2184 struct _select_data *seldata;
2185
2186 *need_wakeup = 0;
2187
2188 /*
2189	 * Problems at reboot due to MacOS X signal problems
2190	 * in Beaker1C; verify that p->p_fd is valid
2191 */
2192 if (fdp == NULL) {
2193 return EIO;
2194 }
2195
2196 nw = howmany(nfd, NFDBITS);
2197 seldata = &uth->uu_save.uus_select_data;
2198
2199 nc = 0;
2200 for (msk = 0; msk < 3; msk++) {
2201 iptr = (u_int32_t *)&ibits[msk * nw];
2202 for (i = 0; i < nfd; i += NFDBITS) {
2203 bits = iptr[i / NFDBITS];
2204 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2205 bits &= ~(1U << j);
2206 /*
2207 * If we've already dropped as many as were
2208 * counted/scanned, then we are done.
2209 */
2210 if (nc >= lim) {
2211 goto done;
2212 }
2213
2214 /*
2215 * We took an I/O reference in selcount,
2216 * so the fp can't possibly be NULL.
2217 */
2218 fp = fp_get_noref_locked_with_iocount(p, fd);
2219 selunlinkfp(fp,
2220 seldata->wqp ? seldata->wqp[nc] : 0,
2221 uth->uu_wqset);
2222
2223 nc++;
2224
2225 const os_ref_count_t refc = os_ref_release_locked(&fp->fp_iocount);
2226 if (0 == refc) {
2227 panic("fp_iocount overdecrement!");
2228 }
2229
2230 if (1 == refc) {
2231 /*
2232 * The last iocount is responsible for clearing
2233	 * the FP_SELCONFLICT flag - even if we didn't set it -
2234 * and is also responsible for waking up anyone
2235 * waiting on iocounts to drain.
2236 */
2237 if (fp->fp_flags & FP_SELCONFLICT) {
2238 fp->fp_flags &= ~FP_SELCONFLICT;
2239 }
2240 if (p->p_fpdrainwait) {
2241 p->p_fpdrainwait = 0;
2242 *need_wakeup = 1;
2243 }
2244 }
2245 }
2246 }
2247 }
2248done:
2249 return error;
2250}
2251
2252
2253static int
2254seldrop(struct proc *p, u_int32_t *ibits, int nfd, int lim)
2255{
2256 int error;
2257 int need_wakeup = 0;
2258
2259 proc_fdlock(p);
2260 error = seldrop_locked(p, ibits, nfd, lim, &need_wakeup);
2261 proc_fdunlock(p);
2262 if (need_wakeup) {
2263 wakeup(&p->p_fpdrainwait);
2264 }
2265 return error;
2266}
2267
2268/*
2269 * Record a select request.
2270 */
2271void
2272selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2273{
2274 thread_t cur_act = current_thread();
2275 struct uthread * ut = get_bsdthread_info(cur_act);
2276 /* on input, s_data points to the 64-bit ID of a reserved link object */
2277 uint64_t *reserved_link = (uint64_t *)s_data;
2278
2279 /* need to look at collisions */
2280
2281	/* do not record if this is the second pass of select */
2282 if (!s_data) {
2283 return;
2284 }
2285
2286 if ((sip->si_flags & SI_INITED) == 0) {
2287 waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
2288 sip->si_flags |= SI_INITED;
2289 sip->si_flags &= ~SI_CLEAR;
2290 }
2291
2292 if (sip->si_flags & SI_RECORDED) {
2293 sip->si_flags |= SI_COLL;
2294 } else {
2295 sip->si_flags &= ~SI_COLL;
2296 }
2297
2298 sip->si_flags |= SI_RECORDED;
2299 /* note: this checks for pre-existing linkage */
2300 waitq_link(&sip->si_waitq, ut->uu_wqset,
2301 WAITQ_SHOULD_LOCK, reserved_link);
2302
2303 /*
2304 * Always consume the reserved link.
2305 * We can always call waitq_link_release() safely because if
2306 * waitq_link is successful, it consumes the link and resets the
2307 * value to 0, in which case our call to release becomes a no-op.
2308 * If waitq_link fails, then the following release call will actually
2309 * release the reserved link object.
2310 */
2311 waitq_link_release(*reserved_link);
2312 *reserved_link = 0;
2313
2314 /*
2315 * Use the s_data pointer as an output parameter as well
2316 * This avoids changing the prototype for this function which is
2317 * used by many kexts. We need to surface the waitq object
2318 * associated with the selinfo we just added to the thread's select
2319 * set. New waitq sets do not have back-pointers to set members, so
2320 * the only way to clear out set linkage objects is to go from the
2321 * waitq to the set. We use a memcpy because s_data could be
2322 * pointing to an unaligned value on the stack
2323 * (especially on 32-bit systems)
2324 */
2325 void *wqptr = (void *)&sip->si_waitq;
2326 memcpy((void *)s_data, (void *)&wqptr, sizeof(void *));
2327
2328 return;
2329}
2330
2331void
2332selwakeup(struct selinfo *sip)
2333{
2334 if ((sip->si_flags & SI_INITED) == 0) {
2335 return;
2336 }
2337
2338 if (sip->si_flags & SI_COLL) {
2339 nselcoll++;
2340 sip->si_flags &= ~SI_COLL;
2341#if 0
2342 /* will not support */
2343 //wakeup((caddr_t)&selwait);
2344#endif
2345 }
2346
2347 if (sip->si_flags & SI_RECORDED) {
2348 waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
2349 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2350 sip->si_flags &= ~SI_RECORDED;
2351 }
2352}
2353
2354void
2355selthreadclear(struct selinfo *sip)
2356{
2357 struct waitq *wq;
2358
2359 if ((sip->si_flags & SI_INITED) == 0) {
2360 return;
2361 }
2362 if (sip->si_flags & SI_RECORDED) {
2363 selwakeup(sip);
2364 sip->si_flags &= ~(SI_RECORDED | SI_COLL);
2365 }
2366 sip->si_flags |= SI_CLEAR;
2367 sip->si_flags &= ~SI_INITED;
2368
2369 wq = &sip->si_waitq;
2370
2371 /*
2372 * Higher level logic may have a handle on this waitq's prepost ID,
2373 * but that's OK because the waitq_deinit will remove/invalidate the
2374 * prepost object (as well as mark the waitq invalid). This de-couples
2375 * us from any callers that may have a handle to this waitq via the
2376 * prepost ID.
2377 */
2378 waitq_deinit(wq);
2379}
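/*
 * Hedged sketch of how a driver typically uses the three routines above
 * (selrecord / selwakeup / selthreadclear).  "mydev" and its softc are
 * hypothetical and only for illustration; the pattern is the standard
 * BSD one: record the selecting thread while no data is ready, wake it
 * from the data path, and clear the selinfo on teardown.
 *
 *	struct mydev_softc {
 *		struct selinfo	sc_rsel;
 *		int		sc_has_data;
 *	};
 *
 *	static int
 *	mydev_select(dev_t dev, int which, void *wql, struct proc *p)
 *	{
 *		struct mydev_softc *sc = mydev_softc_for(dev);	// hypothetical lookup
 *
 *		if (which != FREAD) {
 *			return 0;
 *		}
 *		if (sc->sc_has_data) {
 *			return 1;				// ready now
 *		}
 *		selrecord(p, &sc->sc_rsel, wql);		// caller will block
 *		return 0;
 *	}
 *
 *	static void
 *	mydev_data_arrived(struct mydev_softc *sc)
 *	{
 *		sc->sc_has_data = 1;
 *		selwakeup(&sc->sc_rsel);			// wake select()ors
 *	}
 *
 *	static void
 *	mydev_detach(struct mydev_softc *sc)
 *	{
 *		selthreadclear(&sc->sc_rsel);			// drop waitq linkages
 *	}
 */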
2380
2381
2382/*
2383 * gethostuuid
2384 *
2385 * Description: Get the host UUID from IOKit and return it to user space.
2386 *
2387 * Parameters: uuid_buf Pointer to buffer to receive UUID
2388	 *		timeout			Timespec for timeout
2389 *
2390 * Returns: 0 Success
2391 * EWOULDBLOCK Timeout is too short
2392 * copyout:EFAULT Bad user buffer
2393 * mac_system_check_info:EPERM Client not allowed to perform this operation
2394 *
2395 * Notes: A timeout seems redundant, since if it's tolerable to not
2396 * have a system UUID in hand, then why ask for one?
2397 */
2398int
2399gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
2400{
2401 kern_return_t kret;
2402 int error;
2403 mach_timespec_t mach_ts; /* for IOKit call */
2404 __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
2405
2406 /* Check entitlement */
2407 if (!IOTaskHasEntitlement(current_task(), "com.apple.private.getprivatesysid")) {
2408#if !defined(XNU_TARGET_OS_OSX)
2409#if CONFIG_MACF
2410 if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
2411 /* EPERM invokes userspace upcall if present */
2412 return error;
2413 }
2414#endif
2415#endif
2416 }
2417
2418 /* Convert the 32/64 bit timespec into a mach_timespec_t */
2419 if (proc_is64bit(p)) {
2420 struct user64_timespec ts;
2421 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2422 if (error) {
2423 return error;
2424 }
2425 mach_ts.tv_sec = (unsigned int)ts.tv_sec;
2426 mach_ts.tv_nsec = (clock_res_t)ts.tv_nsec;
2427 } else {
2428 struct user32_timespec ts;
2429 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2430 if (error) {
2431 return error;
2432 }
2433 mach_ts.tv_sec = ts.tv_sec;
2434 mach_ts.tv_nsec = ts.tv_nsec;
2435 }
2436
2437 /* Call IOKit with the stack buffer to get the UUID */
2438 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2439
2440 /*
2441 * If we get it, copy out the data to the user buffer; note that a
2442 * uuid_t is an array of characters, so this is size invariant for
2443 * 32 vs. 64 bit.
2444 */
2445 if (kret == KERN_SUCCESS) {
2446 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2447 } else {
2448 error = EWOULDBLOCK;
2449 }
2450
2451 return error;
2452}
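/*
 * Illustrative userspace sketch (not part of this file): calling the
 * syscall above through its libsyscall wrapper, assumed here to have the
 * usual `int gethostuuid(uuid_t, const struct timespec *)` shape;
 * uuid_unparse() and uuid_string_t come from <uuid/uuid.h>.
 *
 *	#include <uuid/uuid.h>
 *	#include <unistd.h>
 *	#include <time.h>
 *	#include <stdio.h>
 *
 *	int
 *	print_host_uuid(void)
 *	{
 *		uuid_t uu;
 *		uuid_string_t str;
 *		struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *		if (gethostuuid(uu, &timeout) != 0) {
 *			perror("gethostuuid");	// EWOULDBLOCK, EFAULT or EPERM
 *			return -1;
 *		}
 *		uuid_unparse(uu, str);
 *		printf("host UUID: %s\n", str);
 *		return 0;
 *	}
 */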
2453
2454/*
2455 * ledger
2456 *
2457 * Description: Omnibus system call for ledger operations
2458 */
2459int
2460ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
2461{
2462#if !CONFIG_MACF
2463#pragma unused(p)
2464#endif
2465 int rval, pid, len, error;
2466#ifdef LEDGER_DEBUG
2467 struct ledger_limit_args lla;
2468#endif
2469 task_t task;
2470 proc_t proc;
2471
2472 /* Finish copying in the necessary args before taking the proc lock */
2473 error = 0;
2474 len = 0;
2475 if (args->cmd == LEDGER_ENTRY_INFO) {
2476 error = copyin(args->arg3, (char *)&len, sizeof(len));
2477 } else if (args->cmd == LEDGER_TEMPLATE_INFO) {
2478 error = copyin(args->arg2, (char *)&len, sizeof(len));
2479 } else if (args->cmd == LEDGER_LIMIT)
2480#ifdef LEDGER_DEBUG
2481 { error = copyin(args->arg2, (char *)&lla, sizeof(lla));}
2482#else
2483 { return EINVAL; }
2484#endif
2485 else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD)) {
2486 return EINVAL;
2487 }
2488
2489 if (error) {
2490 return error;
2491 }
2492 if (len < 0) {
2493 return EINVAL;
2494 }
2495
2496 rval = 0;
2497 if (args->cmd != LEDGER_TEMPLATE_INFO) {
2498 pid = (int)args->arg1;
2499 proc = proc_find(pid);
2500 if (proc == NULL) {
2501 return ESRCH;
2502 }
2503
2504#if CONFIG_MACF
2505 error = mac_proc_check_ledger(p, proc, args->cmd);
2506 if (error) {
2507 proc_rele(proc);
2508 return error;
2509 }
2510#endif
2511
2512 task = proc->task;
2513 }
2514
2515 switch (args->cmd) {
2516#ifdef LEDGER_DEBUG
2517 case LEDGER_LIMIT: {
2518 if (!kauth_cred_issuser(kauth_cred_get())) {
2519 rval = EPERM;
2520 }
2521 rval = ledger_limit(task, &lla);
2522 proc_rele(proc);
2523 break;
2524 }
2525#endif
2526 case LEDGER_INFO: {
2527 struct ledger_info info = {};
2528
2529 rval = ledger_info(task, &info);
2530 proc_rele(proc);
2531 if (rval == 0) {
2532 rval = copyout(&info, args->arg2,
2533 sizeof(info));
2534 }
2535 break;
2536 }
2537
2538 case LEDGER_ENTRY_INFO: {
2539 void *buf;
2540 int sz;
2541
2542 rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
2543 proc_rele(proc);
2544 if ((rval == 0) && (len >= 0)) {
2545 sz = len * sizeof(struct ledger_entry_info);
2546 rval = copyout(buf, args->arg2, sz);
2547 kheap_free(KHEAP_DATA_BUFFERS, buf, sz);
2548 }
2549 if (rval == 0) {
2550 rval = copyout(&len, args->arg3, sizeof(len));
2551 }
2552 break;
2553 }
2554
2555 case LEDGER_TEMPLATE_INFO: {
2556 void *buf;
2557 int sz;
2558
2559 rval = ledger_template_info(&buf, &len);
2560 if ((rval == 0) && (len >= 0)) {
2561 sz = len * sizeof(struct ledger_template_info);
2562 rval = copyout(buf, args->arg1, sz);
2563 kheap_free(KHEAP_DATA_BUFFERS, buf, sz);
2564 }
2565 if (rval == 0) {
2566 rval = copyout(&len, args->arg2, sizeof(len));
2567 }
2568 break;
2569 }
2570
2571 default:
2572 panic("ledger syscall logic error -- command type %d", args->cmd);
2573 proc_rele(proc);
2574 rval = EINVAL;
2575 }
2576
2577 return rval;
2578}
2579
2580int
2581telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
2582{
2583 int error = 0;
2584
2585 switch (args->cmd) {
2586#if CONFIG_TELEMETRY
2587 case TELEMETRY_CMD_TIMER_EVENT:
2588 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
2589 break;
2590 case TELEMETRY_CMD_PMI_SETUP:
2591 error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
2592 break;
2593#endif /* CONFIG_TELEMETRY */
2594 case TELEMETRY_CMD_VOUCHER_NAME:
2595 if (thread_set_voucher_name((mach_port_name_t)args->deadline)) {
2596 error = EINVAL;
2597 }
2598 break;
2599
2600 default:
2601 error = EINVAL;
2602 break;
2603 }
2604
2605 return error;
2606}
2607
2608/*
2609 * Logging
2610 *
2611 * Description: syscall to access kernel logging from userspace
2612 *
2613 * Args:
2614 * tag - used for syncing with userspace on the version.
2615 * flags - flags used by the syscall.
2616 * buffer - userspace address of string to copy.
2617 * size - size of buffer.
2618 */
2619int
2620log_data(__unused struct proc *p, struct log_data_args *args, int *retval)
2621{
2622 unsigned int tag = args->tag;
2623 unsigned int flags = args->flags;
2624 user_addr_t buffer = args->buffer;
2625 unsigned int size = args->size;
2626 int ret = 0;
2627 char *log_msg = NULL;
2628 int error;
2629 *retval = 0;
2630
2631 /*
2632 * Tag synchronize the syscall version with userspace.
2633 * Tag == 0 => flags == OS_LOG_TYPE
2634 */
2635 if (tag != 0) {
2636 return EINVAL;
2637 }
2638
2639 /*
2640 * OS_LOG_TYPE are defined in libkern/os/log.h
2641 * In userspace they are defined in libtrace/os/log.h
2642 */
2643 if (flags != OS_LOG_TYPE_DEFAULT &&
2644 flags != OS_LOG_TYPE_INFO &&
2645 flags != OS_LOG_TYPE_DEBUG &&
2646 flags != OS_LOG_TYPE_ERROR &&
2647 flags != OS_LOG_TYPE_FAULT) {
2648 return EINVAL;
2649 }
2650
2651 if (size == 0) {
2652 return EINVAL;
2653 }
2654
2655 /* truncate to OS_LOG_DATA_MAX_SIZE */
2656 if (size > OS_LOG_DATA_MAX_SIZE) {
2657 printf("%s: WARNING msg is going to be truncated from %u to %u\n",
2658 __func__, size, OS_LOG_DATA_MAX_SIZE);
2659 size = OS_LOG_DATA_MAX_SIZE;
2660 }
2661
2662 log_msg = kheap_alloc(KHEAP_TEMP, size, Z_WAITOK);
2663 if (!log_msg) {
2664 return ENOMEM;
2665 }
2666
2667 error = copyin(buffer, log_msg, size);
2668 if (error) {
2669 ret = EFAULT;
2670 goto out;
2671 }
2672 log_msg[size - 1] = '\0';
2673
2674 /*
2675 * This will log to dmesg and logd.
2676 * The call will fail if the current
2677 * process is not a driverKit process.
2678 */
2679 os_log_driverKit(&ret, OS_LOG_DEFAULT, (os_log_type_t)flags, "%s", log_msg);
2680
2681out:
2682 if (log_msg != NULL) {
2683 kheap_free(KHEAP_TEMP, log_msg, size);
2684 }
2685
2686 return ret;
2687}
2688
2689#if DEVELOPMENT || DEBUG
2690#if CONFIG_WAITQ_DEBUG
2691static uint64_t g_wqset_num = 0;
2692struct g_wqset {
2693 queue_chain_t link;
2694 struct waitq_set *wqset;
2695};
2696
2697static queue_head_t g_wqset_list;
2698static struct waitq_set *g_waitq_set = NULL;
2699
2700static inline struct waitq_set *
2701sysctl_get_wqset(int idx)
2702{
2703 struct g_wqset *gwqs;
2704
2705 if (!g_wqset_num) {
2706 queue_init(&g_wqset_list);
2707 }
2708
2709 /* don't bother with locks: this is test-only code! */
2710 qe_foreach_element(gwqs, &g_wqset_list, link) {
2711 if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx) {
2712 return gwqs->wqset;
2713 }
2714 }
2715
2716 /* allocate a new one */
2717 ++g_wqset_num;
2718 gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs));
2719 assert(gwqs != NULL);
2720
2721 gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST, NULL);
2722 enqueue_tail(&g_wqset_list, &gwqs->link);
2723 printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
2724
2725 return gwqs->wqset;
2726}
2727
2728#define MAX_GLOBAL_TEST_QUEUES 64
2729static int g_wq_init = 0;
2730static struct waitq g_wq[MAX_GLOBAL_TEST_QUEUES];
2731
2732static inline struct waitq *
2733global_test_waitq(int idx)
2734{
2735 if (idx < 0) {
2736 return NULL;
2737 }
2738
2739 if (!g_wq_init) {
2740 g_wq_init = 1;
2741 for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) {
2742 waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
2743 }
2744 }
2745
2746 return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
2747}
2748
2749static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
2750{
2751#pragma unused(oidp, arg1, arg2)
2752 int error;
2753 int index;
2754 struct waitq *waitq;
2755 kern_return_t kr;
2756 int64_t event64 = 0;
2757
2758 error = SYSCTL_IN(req, &event64, sizeof(event64));
2759 if (error) {
2760 return error;
2761 }
2762
2763 if (!req->newptr) {
2764 return SYSCTL_OUT(req, &event64, sizeof(event64));
2765 }
2766
2767 if (event64 < 0) {
2768 index = (int)((-event64) & 0xffffffff);
2769 waitq = wqset_waitq(sysctl_get_wqset(index));
2770 index = -index;
2771 } else {
2772 index = (int)event64;
2773 waitq = global_test_waitq(index);
2774 }
2775
2776 event64 = 0;
2777
2778 printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
2779 index, event64);
2780 kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
2781 WAITQ_ALL_PRIORITIES);
2782 printf("[WQ]: \tkr=%d\n", kr);
2783
2784 return SYSCTL_OUT(req, &kr, sizeof(kr));
2785}
2786SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2787 0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
2788
2789
2790static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
2791{
2792#pragma unused(oidp, arg1, arg2)
2793 int error;
2794 int index;
2795 struct waitq *waitq;
2796 kern_return_t kr;
2797 int64_t event64 = 0;
2798
2799 error = SYSCTL_IN(req, &event64, sizeof(event64));
2800 if (error) {
2801 return error;
2802 }
2803
2804 if (!req->newptr) {
2805 return SYSCTL_OUT(req, &event64, sizeof(event64));
2806 }
2807
2808 if (event64 < 0) {
2809 index = (int)((-event64) & 0xffffffff);
2810 waitq = wqset_waitq(sysctl_get_wqset(index));
2811 index = -index;
2812 } else {
2813 index = (int)event64;
2814 waitq = global_test_waitq(index);
2815 }
2816
2817 event64 = 0;
2818
2819 printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
2820 index, event64);
2821 kr = waitq_wakeup64_all(waitq, (event64_t)event64,
2822 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2823 printf("[WQ]: \tkr=%d\n", kr);
2824
2825 return SYSCTL_OUT(req, &kr, sizeof(kr));
2826}
2827SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2828 0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
2829
2830
2831static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
2832{
2833#pragma unused(oidp, arg1, arg2)
2834 int error;
2835 int index;
2836 struct waitq *waitq;
2837 kern_return_t kr;
2838 int64_t event64 = 0;
2839
2840 error = SYSCTL_IN(req, &event64, sizeof(event64));
2841 if (error) {
2842 return error;
2843 }
2844
2845 if (!req->newptr) {
2846 return SYSCTL_OUT(req, &event64, sizeof(event64));
2847 }
2848
2849 if (event64 < 0) {
2850 index = (int)((-event64) & 0xffffffff);
2851 waitq = wqset_waitq(sysctl_get_wqset(index));
2852 index = -index;
2853 } else {
2854 index = (int)event64;
2855 waitq = global_test_waitq(index);
2856 }
2857
2858 event64 = 0;
2859
2860 printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
2861 index, event64);
2862 kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
2863 if (kr == THREAD_WAITING) {
2864 thread_block(THREAD_CONTINUE_NULL);
2865 }
2866 printf("[WQ]: \tWoke Up: kr=%d\n", kr);
2867
2868 return SYSCTL_OUT(req, &kr, sizeof(kr));
2869}
2870SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2871 0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
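/*
 * Hedged test-harness sketch (not part of this file): exercising the
 * DEVELOPMENT/DEBUG sysctls above from user space.  One thread blocks on
 * a global test waitq via kern.waitq_wait while another wakes it via
 * kern.waitq_wakeup_one; both take the waitq index as a signed 64-bit
 * value (negative values select a test waitq set, as handled above).
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *
 *	static int
 *	wq_sysctl(const char *name, int64_t index)
 *	{
 *		int kr = 0;
 *		size_t krlen = sizeof(kr);
 *
 *		return sysctlbyname(name, &kr, &krlen, &index, sizeof(index));
 *	}
 *
 *	// waiter thread:	wq_sysctl("kern.waitq_wait", 7);
 *	// waker thread:	wq_sysctl("kern.waitq_wakeup_one", 7);
 */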
2872
2873
2874static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
2875{
2876#pragma unused(oidp, arg1, arg2)
2877 int error;
2878 struct waitq_set *wqset;
2879 uint64_t event64 = 0;
2880
2881 error = SYSCTL_IN(req, &event64, sizeof(event64));
2882 if (error) {
2883 return error;
2884 }
2885
2886 if (!req->newptr) {
2887 goto out;
2888 }
2889
2890 wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
2891 g_waitq_set = wqset;
2892
2893 event64 = wqset_id(wqset);
2894 printf("[WQ]: selected wqset 0x%llx\n", event64);
2895
2896out:
2897 if (g_waitq_set) {
2898 event64 = wqset_id(g_waitq_set);
2899 } else {
2900 event64 = (uint64_t)(-1);
2901 }
2902
2903 return SYSCTL_OUT(req, &event64, sizeof(event64));
2904}
2905SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2906 0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
2907
2908
2909static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
2910{
2911#pragma unused(oidp, arg1, arg2)
2912 int error;
2913 int index;
2914 struct waitq *waitq;
2915 struct waitq_set *wqset;
2916 kern_return_t kr;
2917 uint64_t reserved_link = 0;
2918 int64_t event64 = 0;
2919
2920 error = SYSCTL_IN(req, &event64, sizeof(event64));
2921 if (error) {
2922 return error;
2923 }
2924
2925 if (!req->newptr) {
2926 return SYSCTL_OUT(req, &event64, sizeof(event64));
2927 }
2928
2929 if (!g_waitq_set) {
2930 g_waitq_set = sysctl_get_wqset(1);
2931 }
2932 wqset = g_waitq_set;
2933
2934 if (event64 < 0) {
2935 struct waitq_set *tmp;
2936 index = (int)((-event64) & 0xffffffff);
2937 tmp = sysctl_get_wqset(index);
2938 if (tmp == wqset) {
2939 goto out;
2940 }
2941 waitq = wqset_waitq(tmp);
2942 index = -index;
2943 } else {
2944 index = (int)event64;
2945 waitq = global_test_waitq(index);
2946 }
2947
2948 printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
2949 index, wqset_id(wqset));
2950 reserved_link = waitq_link_reserve(waitq);
2951 kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
2952 waitq_link_release(reserved_link);
2953
2954 printf("[WQ]: \tkr=%d\n", kr);
2955
2956out:
2957 return SYSCTL_OUT(req, &kr, sizeof(kr));
2958}
2959SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2960 0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
2961
2962
2963static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
2964{
2965#pragma unused(oidp, arg1, arg2)
2966 int error;
2967 int index;
2968 struct waitq *waitq;
2969 struct waitq_set *wqset;
2970 kern_return_t kr;
2971 uint64_t event64 = 0;
2972
2973 error = SYSCTL_IN(req, &event64, sizeof(event64));
2974 if (error) {
2975 return error;
2976 }
2977
2978 if (!req->newptr) {
2979 return SYSCTL_OUT(req, &event64, sizeof(event64));
2980 }
2981
2982 if (!g_waitq_set) {
2983 g_waitq_set = sysctl_get_wqset(1);
2984 }
2985 wqset = g_waitq_set;
2986
2987 index = (int)event64;
2988 waitq = global_test_waitq(index);
2989
2990 printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
2991 index, wqset_id(wqset));
2992
2993 kr = waitq_unlink(waitq, wqset);
2994 printf("[WQ]: \tkr=%d\n", kr);
2995
2996 return SYSCTL_OUT(req, &kr, sizeof(kr));
2997}
2998SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
2999 0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
3000
3001
3002static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
3003{
3004#pragma unused(oidp, arg1, arg2)
3005 struct waitq *waitq;
3006 uint64_t event64 = 0;
3007 int error, index;
3008
3009 error = SYSCTL_IN(req, &event64, sizeof(event64));
3010 if (error) {
3011 return error;
3012 }
3013
3014 if (!req->newptr) {
3015 return SYSCTL_OUT(req, &event64, sizeof(event64));
3016 }
3017
3018 index = (int)event64;
3019 waitq = global_test_waitq(index);
3020
3021 printf("[WQ]: clearing prepost on waitq [%d]\n", index);
3022 waitq_clear_prepost(waitq);
3023
3024 return SYSCTL_OUT(req, &event64, sizeof(event64));
3025}
3026SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3027 0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
3028
3029
3030static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
3031{
3032#pragma unused(oidp, arg1, arg2)
3033 int error;
3034 struct waitq_set *wqset;
3035 kern_return_t kr;
3036 uint64_t event64 = 0;
3037
3038 error = SYSCTL_IN(req, &event64, sizeof(event64));
3039 if (error) {
3040 return error;
3041 }
3042
3043 if (!req->newptr) {
3044 return SYSCTL_OUT(req, &event64, sizeof(event64));
3045 }
3046
3047 if (!g_waitq_set) {
3048 g_waitq_set = sysctl_get_wqset(1);
3049 }
3050 wqset = g_waitq_set;
3051
3052 printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
3053 wqset_id(wqset));
3054
3055 kr = waitq_set_unlink_all(wqset);
3056 printf("[WQ]: \tkr=%d\n", kr);
3057
3058 return SYSCTL_OUT(req, &kr, sizeof(kr));
3059}
3060SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3061 0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
3062
3063
3064static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
3065{
3066#pragma unused(oidp, arg1, arg2)
3067 struct waitq_set *wqset = NULL;
3068 uint64_t event64 = 0;
3069 int error, index;
3070
3071 error = SYSCTL_IN(req, &event64, sizeof(event64));
3072 if (error) {
3073 return error;
3074 }
3075
3076 if (!req->newptr) {
3077 goto out;
3078 }
3079
3080 index = (int)((event64) & 0xffffffff);
3081 wqset = sysctl_get_wqset(index);
3082 assert(wqset != NULL);
3083
3084 printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
3085 waitq_set_clear_preposts(wqset);
3086
3087out:
3088 if (wqset) {
3089 event64 = wqset_id(wqset);
3090 } else {
3091 event64 = (uint64_t)(-1);
3092 }
3093
3094 return SYSCTL_OUT(req, &event64, sizeof(event64));
3095}
3096SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3097 0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
3098
3099#endif /* CONFIG_WAITQ_DEBUG */
3100
3101static int
3102sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
3103{
3104#pragma unused(oidp, arg1, arg2)
3105 int nelem;
3106
3107 /* Read only */
3108 if (req->newptr != USER_ADDR_NULL) {
3109 return EPERM;
3110 }
3111
3112 nelem = sysctl_helper_waitq_set_nelem();
3113
3114 return SYSCTL_OUT(req, &nelem, sizeof(nelem));
3115}
3116
3117SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
3118	    0, 0, sysctl_waitq_set_nelem, "I", "ltable elements currently used");
3119
3120
3121static int
3122sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
3123{
3124#pragma unused(oidp, arg1, arg2)
3125 uint64_t value = 0;
3126 int error;
3127
3128 error = SYSCTL_IN(req, &value, sizeof(value));
3129 if (error) {
3130 return error;
3131 }
3132
3133 if (error == 0 && req->newptr) {
3134 error = mpsc_test_pingpong(value, &value);
3135 if (error == 0) {
3136 error = SYSCTL_OUT(req, &value, sizeof(value));
3137 }
3138 }
3139
3140 return error;
3141}
3142SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3143 0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
3144
3145#endif /* DEVELOPMENT || DEBUG */
3146
3147/*Remote Time api*/
3148SYSCTL_NODE(_machdep, OID_AUTO, remotetime, CTLFLAG_RD | CTLFLAG_LOCKED, 0, "Remote time api");
3149
3150#if DEVELOPMENT || DEBUG
3151#if CONFIG_MACH_BRIDGE_SEND_TIME
3152extern _Atomic uint32_t bt_init_flag;
3153extern uint32_t mach_bridge_timer_enable(uint32_t, int);
3154
3155SYSCTL_INT(_machdep_remotetime, OID_AUTO, bridge_timer_init_flag,
3156 CTLFLAG_RD | CTLFLAG_LOCKED, &bt_init_flag, 0, "");
3157
3158static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
3159{
3160#pragma unused(oidp, arg1, arg2)
3161 uint32_t value = 0;
3162 int error = 0;
3163 /* User is querying buffer size */
3164 if (req->oldptr == USER_ADDR_NULL && req->newptr == USER_ADDR_NULL) {
3165 req->oldidx = sizeof(value);
3166 return 0;
3167 }
3168 if (os_atomic_load(&bt_init_flag, acquire)) {
3169 if (req->newptr) {
3170 int new_value = 0;
3171 error = SYSCTL_IN(req, &new_value, sizeof(new_value));
3172 if (error) {
3173 return error;
3174 }
3175 if (new_value == 0 || new_value == 1) {
3176 value = mach_bridge_timer_enable(new_value, 1);
3177 } else {
3178 return EPERM;
3179 }
3180 } else {
3181 value = mach_bridge_timer_enable(0, 0);
3182 }
3183 }
3184 error = SYSCTL_OUT(req, &value, sizeof(value));
3185 return error;
3186}
3187
3188SYSCTL_PROC(_machdep_remotetime, OID_AUTO, bridge_timer_enable,
3189 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3190 0, 0, sysctl_mach_bridge_timer_enable, "I", "");
3191
3192#endif /* CONFIG_MACH_BRIDGE_SEND_TIME */
3193
3194static int sysctl_mach_bridge_remote_time SYSCTL_HANDLER_ARGS
3195{
3196#pragma unused(oidp, arg1, arg2)
3197 uint64_t ltime = 0, rtime = 0;
3198 if (req->oldptr == USER_ADDR_NULL) {
3199 req->oldidx = sizeof(rtime);
3200 return 0;
3201 }
3202 if (req->newptr) {
3203 int error = SYSCTL_IN(req, &ltime, sizeof(ltime));
3204 if (error) {
3205 return error;
3206 }
3207 }
3208 rtime = mach_bridge_remote_time(ltime);
3209 return SYSCTL_OUT(req, &rtime, sizeof(rtime));
3210}
3211SYSCTL_PROC(_machdep_remotetime, OID_AUTO, mach_bridge_remote_time,
3212 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3213 0, 0, sysctl_mach_bridge_remote_time, "Q", "");
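/*
 * Hedged sketch (not part of this file): converting a local timestamp to
 * the remote (bridge) timebase through the DEVELOPMENT/DEBUG sysctl
 * above.  Writing a local mach_absolute_time() value returns the
 * corresponding remote time.
 *
 *	#include <sys/sysctl.h>
 *	#include <mach/mach_time.h>
 *
 *	static uint64_t
 *	local_to_remote_time(void)
 *	{
 *		uint64_t ltime = mach_absolute_time();
 *		uint64_t rtime = 0;
 *		size_t rlen = sizeof(rtime);
 *
 *		if (sysctlbyname("machdep.remotetime.mach_bridge_remote_time",
 *		    &rtime, &rlen, &ltime, sizeof(ltime)) != 0) {
 *			return 0;
 *		}
 *		return rtime;
 *	}
 */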
3214
3215#endif /* DEVELOPMENT || DEBUG */
3216
3217#if CONFIG_MACH_BRIDGE_RECV_TIME
3218extern struct bt_params bt_params_get_latest(void);
3219
3220static int sysctl_mach_bridge_conversion_params SYSCTL_HANDLER_ARGS
3221{
3222#pragma unused(oidp, arg1, arg2)
3223 struct bt_params params = {};
3224 if (req->oldptr == USER_ADDR_NULL) {
3225 req->oldidx = sizeof(struct bt_params);
3226 return 0;
3227 }
3228 if (req->newptr) {
3229 return EPERM;
3230 }
3231 params = bt_params_get_latest();
3232 return SYSCTL_OUT(req, &params, MIN(sizeof(params), req->oldlen));
3233}
3234
3235SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
3236 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0,
3237 0, sysctl_mach_bridge_conversion_params, "S,bt_params", "");
3238
3239#endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
3240
3241#if DEVELOPMENT || DEBUG
3242
3243#include <pexpert/pexpert.h>
3244extern int32_t sysctl_get_bound_cpuid(void);
3245extern kern_return_t sysctl_thread_bind_cpuid(int32_t cpuid);
3246static int
3247sysctl_kern_sched_thread_bind_cpu SYSCTL_HANDLER_ARGS
3248{
3249#pragma unused(oidp, arg1, arg2)
3250
3251 /*
3252 * DO NOT remove this bootarg guard or make this non-development.
3253 * This kind of binding should only be used for tests and
3254 * experiments in a custom configuration, never shipping code.
3255 */
3256
3257 if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
3258 return ENOENT;
3259 }
3260
3261 int32_t cpuid = sysctl_get_bound_cpuid();
3262
3263 int32_t new_value;
3264 int changed;
3265 int error = sysctl_io_number(req, cpuid, sizeof cpuid, &new_value, &changed);
3266 if (error) {
3267 return error;
3268 }
3269
3270 if (changed) {
3271 kern_return_t kr = sysctl_thread_bind_cpuid(new_value);
3272
3273 if (kr == KERN_NOT_SUPPORTED) {
3274 return ENOTSUP;
3275 }
3276
3277 if (kr == KERN_INVALID_VALUE) {
3278 return ERANGE;
3279 }
3280 }
3281
3282 return error;
3283}
3284
3285SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cpu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3286 0, 0, sysctl_kern_sched_thread_bind_cpu, "I", "");
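/*
 * Hedged usage sketch (not part of this file): binding the calling
 * thread to a CPU through the sysctl above.  Only meaningful on a
 * DEVELOPMENT/DEBUG kernel booted with the enable_skstb boot-arg, as
 * enforced above; expect ENOENT without the boot-arg and ERANGE for an
 * invalid cpuid.
 *
 *	#include <sys/sysctl.h>
 *
 *	static int
 *	bind_self_to_cpu(int cpuid)
 *	{
 *		int old = -1;
 *		size_t oldlen = sizeof(old);
 *
 *		return sysctlbyname("kern.sched_thread_bind_cpu",
 *		    &old, &oldlen, &cpuid, sizeof(cpuid));
 *	}
 */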
3287
3288#if __AMP__
3289extern char sysctl_get_bound_cluster_type(void);
3290extern void sysctl_thread_bind_cluster_type(char cluster_type);
3291static int
3292sysctl_kern_sched_thread_bind_cluster_type SYSCTL_HANDLER_ARGS
3293{
3294#pragma unused(oidp, arg1, arg2)
3295 char buff[4];
3296
3297 if (!PE_parse_boot_argn("enable_skstb", NULL, 0)) {
3298 return ENOENT;
3299 }
3300
3301 int error = SYSCTL_IN(req, buff, 1);
3302 if (error) {
3303 return error;
3304 }
3305 char cluster_type = buff[0];
3306
3307 if (!req->newptr) {
3308 goto out;
3309 }
3310
3311 sysctl_thread_bind_cluster_type(cluster_type);
3312out:
3313 cluster_type = sysctl_get_bound_cluster_type();
3314 buff[0] = cluster_type;
3315
3316 return SYSCTL_OUT(req, buff, 1);
3317}
3318
3319SYSCTL_PROC(_kern, OID_AUTO, sched_thread_bind_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
3320 0, 0, sysctl_kern_sched_thread_bind_cluster_type, "A", "");
3321
3322extern char sysctl_get_task_cluster_type(void);
3323extern void sysctl_task_set_cluster_type(char cluster_type);
3324static int
3325sysctl_kern_sched_task_set_cluster_type SYSCTL_HANDLER_ARGS
3326{
3327#pragma unused(oidp, arg1, arg2)
3328 char buff[4];
3329
3330 if (!PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
3331 return ENOENT;
3332 }
3333
3334 int error = SYSCTL_IN(req, buff, 1);
3335 if (error) {
3336 return error;
3337 }
3338 char cluster_type = buff[0];
3339
3340 if (!req->newptr) {
3341 goto out;
3342 }
3343
3344 sysctl_task_set_cluster_type(cluster_type);
3345out:
3346 cluster_type = sysctl_get_task_cluster_type();
3347 buff[0] = cluster_type;
3348
3349 return SYSCTL_OUT(req, buff, 1);
3350}
3351
3352SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_cluster_type, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED,
3353 0, 0, sysctl_kern_sched_task_set_cluster_type, "A", "");
3354
3355#if CONFIG_SCHED_EDGE
3356
3357/*
3358 * Edge Scheduler Sysctls
3359 *
3360 * The Edge scheduler uses edge configurations to decide feasibility of
3361 * migrating threads across clusters. The sysctls allow dynamic configuration
3362 * of the edge properties and edge weights. This configuration is typically
3363 * updated via callouts from CLPC.
3364 *
3365 * <Edge Multi-cluster Support Needed>
3366 */
3367extern sched_clutch_edge sched_edge_config_e_to_p;
3368extern sched_clutch_edge sched_edge_config_p_to_e;
3369extern kern_return_t sched_edge_sysctl_configure_e_to_p(uint64_t);
3370extern kern_return_t sched_edge_sysctl_configure_p_to_e(uint64_t);
3371extern sched_clutch_edge sched_edge_e_to_p(void);
3372extern sched_clutch_edge sched_edge_p_to_e(void);
3373
3374static int sysctl_sched_edge_config_e_to_p SYSCTL_HANDLER_ARGS
3375{
3376#pragma unused(oidp, arg1, arg2)
3377 int error;
3378 kern_return_t kr;
3379 int64_t edge_config = 0;
3380
3381 error = SYSCTL_IN(req, &edge_config, sizeof(edge_config));
3382 if (error) {
3383 return error;
3384 }
3385
3386 if (!req->newptr) {
3387 edge_config = sched_edge_e_to_p().sce_edge_packed;
3388 return SYSCTL_OUT(req, &edge_config, sizeof(edge_config));
3389 }
3390
3391 kr = sched_edge_sysctl_configure_e_to_p(edge_config);
3392 return SYSCTL_OUT(req, &kr, sizeof(kr));
3393}
3394SYSCTL_PROC(_kern, OID_AUTO, sched_edge_config_e_to_p, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3395 0, 0, sysctl_sched_edge_config_e_to_p, "Q", "Edge Scheduler Config for E-to-P cluster");
3396
3397static int sysctl_sched_edge_config_p_to_e SYSCTL_HANDLER_ARGS
3398{
3399#pragma unused(oidp, arg1, arg2)
3400 int error;
3401 kern_return_t kr;
3402 int64_t edge_config = 0;
3403
3404 error = SYSCTL_IN(req, &edge_config, sizeof(edge_config));
3405 if (error) {
3406 return error;
3407 }
3408
3409 if (!req->newptr) {
3410 edge_config = sched_edge_p_to_e().sce_edge_packed;
3411 return SYSCTL_OUT(req, &edge_config, sizeof(edge_config));
3412 }
3413
3414 kr = sched_edge_sysctl_configure_p_to_e(edge_config);
3415 return SYSCTL_OUT(req, &kr, sizeof(kr));
3416}
3417SYSCTL_PROC(_kern, OID_AUTO, sched_edge_config_p_to_e, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3418 0, 0, sysctl_sched_edge_config_p_to_e, "Q", "Edge Scheduler Config for P-to-E cluster");
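/*
 * Hedged sketch (not part of this file): reading the packed edge
 * configuration exposed above.  The 64-bit value is the raw
 * sce_edge_packed representation; its field layout belongs to the
 * clutch/edge scheduler and is not decoded here.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *
 *	static int
 *	read_e_to_p_edge(uint64_t *out)
 *	{
 *		size_t len = sizeof(*out);
 *
 *		return sysctlbyname("kern.sched_edge_config_e_to_p",
 *		    out, &len, NULL, 0);
 *	}
 */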
3419
3420extern int sched_edge_restrict_ut;
3421SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_ut, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_ut, 0, "Edge Scheduler Restrict UT Threads");
3422extern int sched_edge_restrict_bg;
3423 SYSCTL_INT(_kern, OID_AUTO, sched_edge_restrict_bg, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_restrict_bg, 0, "Edge Scheduler Restrict BG Threads");
3424extern int sched_edge_migrate_ipi_immediate;
3425SYSCTL_INT(_kern, OID_AUTO, sched_edge_migrate_ipi_immediate, CTLFLAG_RW | CTLFLAG_LOCKED, &sched_edge_migrate_ipi_immediate, 0, "Edge Scheduler uses immediate IPIs for migration event based on execution latency");
3426
3427#endif /* CONFIG_SCHED_EDGE */
3428
3429#endif /* __AMP__ */
3430
3431/* used for testing by exception_tests */
3432extern uint32_t ipc_control_port_options;
3433SYSCTL_INT(_kern, OID_AUTO, ipc_control_port_options,
3434 CTLFLAG_RD | CTLFLAG_LOCKED, &ipc_control_port_options, 0, "");
3435
3436#endif /* DEVELOPMENT || DEBUG */
3437
3438extern uint32_t task_exc_guard_default;
3439
3440SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
3441 CTLFLAG_RD | CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
3442
3443
3444static int
3445sysctl_kern_tcsm_available SYSCTL_HANDLER_ARGS
3446{
3447#pragma unused(oidp, arg1, arg2)
3448 uint32_t value = machine_csv(CPUVN_CI) ? 1 : 0;
3449
3450 if (req->newptr) {
3451 return EINVAL;
3452 }
3453
3454 return SYSCTL_OUT(req, &value, sizeof(value));
3455}
3456SYSCTL_PROC(_kern, OID_AUTO, tcsm_available,
3457 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
3458 0, 0, sysctl_kern_tcsm_available, "I", "");
3459
3460
3461static int
3462sysctl_kern_tcsm_enable SYSCTL_HANDLER_ARGS
3463{
3464#pragma unused(oidp, arg1, arg2)
3465 uint32_t soflags = 0;
3466 uint32_t old_value = thread_get_no_smt() ? 1 : 0;
3467
3468 int error = SYSCTL_IN(req, &soflags, sizeof(soflags));
3469 if (error) {
3470 return error;
3471 }
3472
3473 if (soflags && machine_csv(CPUVN_CI)) {
3474 thread_set_no_smt(true);
3475 machine_tecs(current_thread());
3476 }
3477
3478 return SYSCTL_OUT(req, &old_value, sizeof(old_value));
3479}
3480SYSCTL_PROC(_kern, OID_AUTO, tcsm_enable,
3481 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
3482 0, 0, sysctl_kern_tcsm_enable, "I", "");
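/*
 * Hedged usage sketch (not part of this file): the opt-in flow a
 * security-sensitive process would follow with the two sysctls above:
 * check kern.tcsm_available, then write a non-zero value to
 * kern.tcsm_enable to mark the calling thread no-SMT and run the
 * cross-thread mitigation.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdint.h>
 *
 *	static int
 *	enable_tcsm_if_available(void)
 *	{
 *		uint32_t avail = 0, on = 1, old = 0;
 *		size_t len = sizeof(avail);
 *
 *		if (sysctlbyname("kern.tcsm_available", &avail, &len,
 *		    NULL, 0) != 0 || avail == 0) {
 *			return 0;	// mitigation not supported/needed
 *		}
 *
 *		len = sizeof(old);
 *		return sysctlbyname("kern.tcsm_enable", &old, &len,
 *		    &on, sizeof(on));
 *	}
 */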
3483
3484
3485#if DEVELOPMENT || DEBUG
3486extern void sysctl_task_set_no_smt(char no_smt);
3487extern char sysctl_task_get_no_smt(void);
3488
3489static int
3490sysctl_kern_sched_task_set_no_smt SYSCTL_HANDLER_ARGS
3491{
3492#pragma unused(oidp, arg1, arg2)
3493 char buff[4];
3494
3495 int error = SYSCTL_IN(req, buff, 1);
3496 if (error) {
3497 return error;
3498 }
3499 char no_smt = buff[0];
3500
3501 if (!req->newptr) {
3502 goto out;
3503 }
3504
3505 sysctl_task_set_no_smt(no_smt);
3506out:
3507 no_smt = sysctl_task_get_no_smt();
3508 buff[0] = no_smt;
3509
3510 return SYSCTL_OUT(req, buff, 1);
3511}
3512
3513SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_no_smt, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3514 0, 0, sysctl_kern_sched_task_set_no_smt, "A", "");
3515
3516static int
3517sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
3518{
3519 int new_value, changed;
3520 int old_value = thread_get_no_smt() ? 1 : 0;
3521 int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3522
3523 if (changed) {
3524 thread_set_no_smt(!!new_value);
3525 }
3526
3527 return error;
3528}
3529
3530SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
3531 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY,
3532 0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
3533
3534static int
3535sysctl_kern_debug_get_preoslog SYSCTL_HANDLER_ARGS
3536{
3537#pragma unused(oidp, arg1, arg2)
3538 static bool oneshot_executed = false;
3539 size_t preoslog_size = 0;
3540 const char *preoslog = NULL;
3541
3542	// DumpPanic passes a non-zero write value when it needs oneshot behaviour
3543 if (req->newptr) {
3544 uint8_t oneshot = 0;
3545 int error = SYSCTL_IN(req, &oneshot, sizeof(oneshot));
3546 if (error) {
3547 return error;
3548 }
3549
3550 if (oneshot) {
3551 if (!OSCompareAndSwap8(false, true, &oneshot_executed)) {
3552 return EPERM;
3553 }
3554 }
3555 }
3556
3557 preoslog = sysctl_debug_get_preoslog(&preoslog_size);
3558 if (preoslog == NULL || preoslog_size == 0) {
3559 return 0;
3560 }
3561
3562 if (req->oldptr == USER_ADDR_NULL) {
3563 req->oldidx = preoslog_size;
3564 return 0;
3565 }
3566
3567 return SYSCTL_OUT(req, preoslog, preoslog_size);
3568}
3569
3570SYSCTL_PROC(_kern, OID_AUTO, preoslog, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
3571 0, 0, sysctl_kern_debug_get_preoslog, "-", "");
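/*
 * Hedged sketch (not part of this file): reading kern.preoslog with the
 * usual two-call sizing pattern the handler above supports; a NULL old
 * pointer returns the required size via oldidx.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdlib.h>
 *
 *	static char *
 *	read_preoslog(size_t *sizep)
 *	{
 *		char *buf;
 *
 *		*sizep = 0;
 *		if (sysctlbyname("kern.preoslog", NULL, sizep, NULL, 0) != 0 ||
 *		    *sizep == 0) {
 *			return NULL;
 *		}
 *		buf = malloc(*sizep);
 *		if (buf == NULL) {
 *			return NULL;
 *		}
 *		if (sysctlbyname("kern.preoslog", buf, sizep, NULL, 0) != 0) {
 *			free(buf);
 *			return NULL;
 *		}
 *		return buf;
 *	}
 */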
3572
3573static int
3574sysctl_kern_task_set_filter_msg_flag SYSCTL_HANDLER_ARGS
3575{
3576#pragma unused(oidp, arg1, arg2)
3577 int new_value, changed;
3578 int old_value = task_get_filter_msg_flag(current_task()) ? 1 : 0;
3579 int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
3580
3581 if (changed) {
3582 task_set_filter_msg_flag(current_task(), !!new_value);
3583 }
3584
3585 return error;
3586}
3587
3588SYSCTL_PROC(_kern, OID_AUTO, task_set_filter_msg_flag, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3589 0, 0, sysctl_kern_task_set_filter_msg_flag, "I", "");
3590
3591#endif /* DEVELOPMENT || DEBUG */