git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2015 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1982, 1986, 1989, 1993
	31	* The Regents of the University of California. All rights reserved.
	32	* (c) UNIX System Laboratories, Inc.
	33	* All or some portions of this file are derived from material licensed
	34	* to the University of California by American Telephone and Telegraph
	35	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	36	* the permission of UNIX System Laboratories, Inc.
	37	*
	38	* Redistribution and use in source and binary forms, with or without
	39	* modification, are permitted provided that the following conditions
	40	* are met:
	41	* 1. Redistributions of source code must retain the above copyright
	42	* notice, this list of conditions and the following disclaimer.
	43	* 2. Redistributions in binary form must reproduce the above copyright
	44	* notice, this list of conditions and the following disclaimer in the
	45	* documentation and/or other materials provided with the distribution.
	46	* 3. All advertising materials mentioning features or use of this software
	47	* must display the following acknowledgement:
	48	* This product includes software developed by the University of
	49	* California, Berkeley and its contributors.
	50	* 4. Neither the name of the University nor the names of its contributors
	51	* may be used to endorse or promote products derived from this software
	52	* without specific prior written permission.
	53	*
	54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	64	* SUCH DAMAGE.
	65	*
	66	* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
	67	*/
	68	/*
	69	* NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
	70	* support for mandatory and extensible security protections. This notice
	71	* is included in support of clause 2.2 (b) of the Apple Public License,
	72	* Version 2.0.
	73	*/
	74
	75	#include <sys/param.h>
	76	#include <sys/systm.h>
	77	#include <sys/filedesc.h>
	78	#include <sys/ioctl.h>
	79	#include <sys/file_internal.h>
	80	#include <sys/proc_internal.h>
	81	#include <sys/socketvar.h>
	82	#include <sys/uio_internal.h>
	83	#include <sys/kernel.h>
	84	#include <sys/guarded.h>
	85	#include <sys/stat.h>
	86	#include <sys/malloc.h>
	87	#include <sys/sysproto.h>
	88
	89	#include <sys/mount_internal.h>
	90	#include <sys/protosw.h>
	91	#include <sys/ev.h>
	92	#include <sys/user.h>
	93	#include <sys/kdebug.h>
	94	#include <sys/poll.h>
	95	#include <sys/event.h>
	96	#include <sys/eventvar.h>
	97	#include <sys/proc.h>
	98	#include <sys/kauth.h>
	99
	100	#include <machine/smp.h>
	101	#include <mach/mach_types.h>
	102	#include <kern/kern_types.h>
	103	#include <kern/assert.h>
	104	#include <kern/kalloc.h>
	105	#include <kern/thread.h>
	106	#include <kern/clock.h>
	107	#include <kern/ledger.h>
	108	#include <kern/task.h>
	109	#include <kern/telemetry.h>
	110	#include <kern/waitq.h>
	111	#include <kern/sched_prim.h>
	112	#include <kern/mpsc_queue.h>
	113
	114	#include <sys/mbuf.h>
	115	#include <sys/domain.h>
	116	#include <sys/socket.h>
	117	#include <sys/socketvar.h>
	118	#include <sys/errno.h>
	119	#include <sys/syscall.h>
	120	#include <sys/pipe.h>
	121
	122	#include <security/audit/audit.h>
	123
	124	#include <net/if.h>
	125	#include <net/route.h>
	126
	127	#include <netinet/in.h>
	128	#include <netinet/in_systm.h>
	129	#include <netinet/ip.h>
	130	#include <netinet/in_pcb.h>
	131	#include <netinet/ip_var.h>
	132	#include <netinet/ip6.h>
	133	#include <netinet/tcp.h>
	134	#include <netinet/tcp_fsm.h>
	135	#include <netinet/tcp_seq.h>
	136	#include <netinet/tcp_timer.h>
	137	#include <netinet/tcp_var.h>
	138	#include <netinet/tcpip.h>
	139	#include <netinet/tcp_debug.h>
	140	/* for wait queue based select */
	141	#include <kern/waitq.h>
	142	#include <kern/kalloc.h>
	143	#include <sys/vnode_internal.h>
	144	/* for remote time api*/
	145	#include <kern/remote_time.h>
	146	#include <os/log.h>
	147	#include <sys/log_data.h>
	148
	149	#if CONFIG_MACF
	150	#include <security/mac_framework.h>
	151	#endif
	152
	153	/* for entitlement check */
	154	#include <IOKit/IOBSD.h>
	155
	156	/* XXX should be in a header file somewhere */
	157	void evsofree(struct socket *);
	158	void evpipefree(struct pipe *);
	159	void postpipeevent(struct pipe *, int);
	160	void postevent(struct socket *, struct sockbuf *, int);
	161	extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
	162
		/* Vectored I/O helpers shared by readv_nocancel() and writev_nocancel(). */
	163	int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
	164	int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval);
	165
		/* Single-buffer I/O helpers and the fd lookup/release pair used by the read paths. */
	166	__private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
	167	    user_addr_t bufp, user_size_t nbyte,
	168	    off_t offset, int flags, user_ssize_t *retval);
	169	__private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
	170	    user_addr_t bufp, user_size_t nbyte,
	171	    off_t offset, int flags, user_ssize_t *retval);
	172	__private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread);
	173	__private_extern__ void donefileread(struct proc *p, struct fileproc *fp, int fd);
	174
	175	/* Conflict wait queue for when selects collide (opaque type) */
	176	struct waitq select_conflict_queue;
	177
	178	/*
	179	 * Init routine called from bsd_init.c
		 *
		 * Initializes select_conflict_queue with the SYNC_POLICY_FIFO policy.
		 * Takes no arguments and returns nothing.
	180	 */
	181	void select_waitq_init(void);
	182	void
	183	select_waitq_init(void)
	184	{
	185	waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
	186	}
	187
		/*
		 * Shorthand field names: each one expands to a member reached through
		 * the f_fglob pointer, so that e.g. `fp->f_flag` reads
		 * `fp->f_fglob->fg_flag` and `fp->f_type` reads
		 * `fp->f_fglob->fg_ops->fo_type`.
		 */
	188	#define f_flag f_fglob->fg_flag
	189	#define f_type f_fglob->fg_ops->fo_type
	190	#define f_msgcount f_fglob->fg_msgcount
	191	#define f_cred f_fglob->fg_cred
	192	#define f_ops f_fglob->fg_ops
	193	#define f_offset f_fglob->fg_offset
	194	#define f_data f_fglob->fg_data
	195
	196	/*
	197	 * Read system call.
	198	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * read_nocancel(), which does the actual work.
		 *
	199	 * Returns:	0			Success
	200	 *	preparefileread:EBADF
	201	 *	preparefileread:ESPIPE
	202	 *	preparefileread:ENXIO
	203	 *	preparefileread:EBADF
	204	 *	dofileread:???
	205	 */
	206	int
	207	read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
	208	{
	209		__pthread_testcancel(1);
	210		return read_nocancel(p, (struct read_nocancel_args *)uap, retval);
	211	}
	212
	213	int
	214	read_nocancel(struct proc p, struct read_nocancel_args uap, user_ssize_t *retval)
	215	{
	216	struct fileproc *fp;
	217	int error;
	218	int fd = uap->fd;
	219	struct vfs_context context;
	220
	221	if ((error = preparefileread(p, &fp, fd, 0))) {
	222	return error;
	223	}
	224
	225	context = *(vfs_context_current());
	226	context.vc_ucred = fp->f_fglob->fg_cred;
	227
	228	error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
	229	(off_t)-1, 0, retval);
	230
	231	donefileread(p, fp, fd);
	232
	233	return error;
	234	}
	235
	236	/*
	237	 * Pread system call
	238	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * pread_nocancel(), which does the actual work.
		 *
	239	 * Returns:	0			Success
	240	 *	preparefileread:EBADF
	241	 *	preparefileread:ESPIPE
	242	 *	preparefileread:ENXIO
	243	 *	preparefileread:EBADF
	244	 *	dofileread:???
	245	 */
	246	int
	247	pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
	248	{
	249		__pthread_testcancel(1);
	250		return pread_nocancel(p, (struct pread_nocancel_args *)uap, retval);
	251	}
	252
	253	int
	254	pread_nocancel(struct proc p, struct pread_nocancel_args uap, user_ssize_t *retval)
	255	{
	256	struct fileproc fp = NULL; / fp set by preparefileread() */
	257	int fd = uap->fd;
	258	int error;
	259	struct vfs_context context;
	260
	261	if ((error = preparefileread(p, &fp, fd, 1))) {
	262	goto out;
	263	}
	264
	265	context = *(vfs_context_current());
	266	context.vc_ucred = fp->f_fglob->fg_cred;
	267
	268	error = dofileread(&context, fp, uap->buf, uap->nbyte,
	269	uap->offset, FOF_OFFSET, retval);
	270
	271	donefileread(p, fp, fd);
	272
	273	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) \| DBG_FUNC_NONE),
	274	uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
	275
	276	out:
	277	return error;
	278	}
	279
	280	/*
	281	 * Code common for read and pread
		 *
		 * Releases the fileproc obtained from preparefileread(): takes the
		 * process fd lock (spin variant), calls fp_drop() for (fd, fp), and
		 * unlocks again.
	282	 */
	283
	284	void
	285	donefileread(struct proc *p, struct fileproc *fp, int fd)
	286	{
	287		proc_fdlock_spin(p);
	288		fp_drop(p, fd, fp, 1);
	289		proc_fdunlock(p);
	290	}
	291
	292	/*
		 * Look up `fd` in process `p` and validate it for reading.
		 *
		 * On success the fileproc is stored in *fp_ret and is NOT dropped here;
		 * the read paths in this file release it with donefileread().  On any
		 * failure after the lookup, the fileproc is dropped before returning and
		 * *fp_ret is left untouched.
		 *
		 * Checks performed (all under the process fd lock):
		 *   - the file must be open for reading (FREAD), else EBADF;
		 *   - when check_for_pread is non-zero, the file must be a vnode that is
		 *     neither a FIFO (ESPIPE) nor flagged VISTTY (ENXIO).
		 *
	293	 * Returns:	0			Success
	294	 *		EBADF
	295	 *		ESPIPE
	296	 *		ENXIO
	297	 *	fp_lookup:EBADF
	298	 *	fo_read:???
	299	 */
	300	int
	301	preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
	302	{
	303		vnode_t vp;
	304		int error;
	305		struct fileproc *fp;
	306
	307		AUDIT_ARG(fd, fd);
	308
	309		proc_fdlock_spin(p);
	310
	311		error = fp_lookup(p, fd, &fp, 1);
	312
	313		if (error) {
	314			proc_fdunlock(p);
	315			return error;
	316		}
	317		if ((fp->f_flag & FREAD) == 0) {
	318			error = EBADF;
	319			goto out;
	320		}
	321		if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
	322			error = ESPIPE;
	323			goto out;
	324		}
	325		if (fp->f_type == DTYPE_VNODE) {
	326			vp = (struct vnode *)fp->f_fglob->fg_data;
	327
	328			if (check_for_pread && (vnode_isfifo(vp))) {
	329				error = ESPIPE;
	330				goto out;
	331			}
	332			if (check_for_pread && (vp->v_flag & VISTTY)) {
	333				error = ENXIO;
	334				goto out;
	335			}
	336		}
	337
	338		*fp_ret = fp;
	339
	340		proc_fdunlock(p);
	341		return 0;
	342
	343	out:
			/* Validation failed: give back the fileproc obtained from fp_lookup(). */
	344		fp_drop(p, fd, fp, 1);
	345		proc_fdunlock(p);
	346		return error;
	347	}
	348
	349
	350	/*
		 * Read up to `nbyte` bytes from `fp` into the user buffer `bufp`.
		 *
		 * Builds a one-iovec uio in an on-stack buffer (user-space 64 or 32
		 * flavour depending on the process in `ctx`), hands it to fo_read() with
		 * the caller's `offset` and `flags`, and stores the number of bytes
		 * actually transferred in *retval.
		 *
		 * If fo_read() fails with ERESTART, EINTR or EWOULDBLOCK after some data
		 * has already been transferred, the error is suppressed and the partial
		 * count is reported as success.
		 *
		 * Requests larger than INT_MAX, or a buffer that uio_addiov() rejects,
		 * fail with EINVAL and *retval set to 0.
		 *
	351	 * Returns:	0			Success
	352	 *		EINVAL
	353	 *	fo_read:???
	354	 */
	355	__private_extern__ int
	356	dofileread(vfs_context_t ctx, struct fileproc *fp,
	357	    user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
	358	    user_ssize_t *retval)
	359	{
	360		uio_t auio;
	361		user_ssize_t bytecnt;
	362		int error = 0;
	363		char uio_buf[UIO_SIZEOF(1)];
	364
	365		if (nbyte > INT_MAX) {
				/* Report zero bytes transferred, matching dofilewrite(). */
				*retval = 0;
	366			return EINVAL;
	367		}
	368
	369		if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
	370			auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
	371			    &uio_buf[0], sizeof(uio_buf));
	372		} else {
	373			auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
	374			    &uio_buf[0], sizeof(uio_buf));
	375		}
	376		if (uio_addiov(auio, bufp, nbyte) != 0) {
	377			*retval = 0;
	378			return EINVAL;
	379		}
	380
	381		bytecnt = nbyte;
	382
	383		if ((error = fo_read(fp, auio, flags, ctx))) {
				/* A partial transfer followed by an interruption counts as success. */
	384			if (uio_resid(auio) != bytecnt && (error == ERESTART ||
	385			    error == EINTR || error == EWOULDBLOCK)) {
	386				error = 0;
	387			}
	388		}
	389		bytecnt -= uio_resid(auio);
	390
	391		*retval = bytecnt;
	392
	393		return error;
	394	}
	395
	396	/*
	397	 * Scatter read system call.
	398	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * readv_nocancel(), which does the actual work.
		 *
	399	 * Returns:	0			Success
	400	 *		EINVAL
	401	 *		ENOMEM
	402	 *	copyin:EFAULT
	403	 *	rd_uio:???
	404	 */
	405	int
	406	readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
	407	{
	408		__pthread_testcancel(1);
	409		return readv_nocancel(p, (struct readv_nocancel_args *)uap, retval);
	410	}
	411
	412	int
	413	readv_nocancel(struct proc p, struct readv_nocancel_args uap, user_ssize_t *retval)
	414	{
	415	uio_t auio = NULL;
	416	int error;
	417	struct user_iovec *iovp;
	418
	419	/* Verify range bedfore calling uio_create() */
	420	if (uap->iovcnt <= 0 \|\| uap->iovcnt > UIO_MAXIOV) {
	421	return EINVAL;
	422	}
	423
	424	/* allocate a uio large enough to hold the number of iovecs passed */
	425	auio = uio_create(uap->iovcnt, 0,
	426	(IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
	427	UIO_READ);
	428
	429	/* get location of iovecs within the uio. then copyin the iovecs from
	430	* user space.
	431	*/
	432	iovp = uio_iovsaddr(auio);
	433	if (iovp == NULL) {
	434	error = ENOMEM;
	435	goto ExitThisRoutine;
	436	}
	437	error = copyin_user_iovec_array(uap->iovp,
	438	IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
	439	uap->iovcnt, iovp);
	440	if (error) {
	441	goto ExitThisRoutine;
	442	}
	443
	444	/* finalize uio_t for use and do the IO
	445	*/
	446	error = uio_calculateresid(auio);
	447	if (error) {
	448	goto ExitThisRoutine;
	449	}
	450	error = rd_uio(p, uap->fd, auio, retval);
	451
	452	ExitThisRoutine:
	453	if (auio != NULL) {
	454	uio_free(auio);
	455	}
	456	return error;
	457	}
	458
	459	/*
	460	 * Write system call
	461	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * write_nocancel(), which does the actual work.
		 *
	462	 * Returns:	0			Success
	463	 *		EBADF
	464	 *	fp_lookup:EBADF
	465	 *	dofilewrite:???
	466	 */
	467	int
	468	write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
	469	{
	470		__pthread_testcancel(1);
	471		return write_nocancel(p, (struct write_nocancel_args *)uap, retval);
	472	}
	473
	474	int
	475	write_nocancel(struct proc p, struct write_nocancel_args uap, user_ssize_t *retval)
	476	{
	477	struct fileproc *fp;
	478	int error;
	479	int fd = uap->fd;
	480	bool wrote_some = false;
	481
	482	AUDIT_ARG(fd, fd);
	483
	484	error = fp_lookup(p, fd, &fp, 0);
	485	if (error) {
	486	return error;
	487	}
	488	if ((fp->f_flag & FWRITE) == 0) {
	489	error = EBADF;
	490	} else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
	491	proc_fdlock(p);
	492	error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
	493	proc_fdunlock(p);
	494	} else {
	495	struct vfs_context context = *(vfs_context_current());
	496	context.vc_ucred = fp->f_fglob->fg_cred;
	497
	498	error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
	499	(off_t)-1, 0, retval);
	500
	501	wrote_some = *retval > 0;
	502	}
	503	if (wrote_some) {
	504	fp_drop_written(p, fd, fp);
	505	} else {
	506	fp_drop(p, fd, fp, 0);
	507	}
	508	return error;
	509	}
	510
	511	/*
	512	 * pwrite system call
	513	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * pwrite_nocancel(), which does the actual work.
		 *
	514	 * Returns:	0			Success
	515	 *		EBADF
	516	 *		ESPIPE
	517	 *		ENXIO
	518	 *		EINVAL
	519	 *	fp_lookup:EBADF
	520	 *	dofilewrite:???
	521	 */
	522	int
	523	pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
	524	{
	525		__pthread_testcancel(1);
	526		return pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval);
	527	}
	528
	529	int
	530	pwrite_nocancel(struct proc p, struct pwrite_nocancel_args uap, user_ssize_t *retval)
	531	{
	532	struct fileproc *fp;
	533	int error;
	534	int fd = uap->fd;
	535	vnode_t vp = (vnode_t)0;
	536	bool wrote_some = false;
	537
	538	AUDIT_ARG(fd, fd);
	539
	540	error = fp_lookup(p, fd, &fp, 0);
	541	if (error) {
	542	return error;
	543	}
	544
	545	if ((fp->f_flag & FWRITE) == 0) {
	546	error = EBADF;
	547	} else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
	548	proc_fdlock(p);
	549	error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
	550	proc_fdunlock(p);
	551	} else {
	552	struct vfs_context context = *vfs_context_current();
	553	context.vc_ucred = fp->f_fglob->fg_cred;
	554
	555	if (fp->f_type != DTYPE_VNODE) {
	556	error = ESPIPE;
	557	goto errout;
	558	}
	559	vp = (vnode_t)fp->f_fglob->fg_data;
	560	if (vnode_isfifo(vp)) {
	561	error = ESPIPE;
	562	goto errout;
	563	}
	564	if ((vp->v_flag & VISTTY)) {
	565	error = ENXIO;
	566	goto errout;
	567	}
	568	if (uap->offset == (off_t)-1) {
	569	error = EINVAL;
	570	goto errout;
	571	}
	572
	573	error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
	574	uap->offset, FOF_OFFSET, retval);
	575	wrote_some = *retval > 0;
	576	}
	577	errout:
	578	if (wrote_some) {
	579	fp_drop_written(p, fd, fp);
	580	} else {
	581	fp_drop(p, fd, fp, 0);
	582	}
	583
	584	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) \| DBG_FUNC_NONE),
	585	uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
	586
	587	return error;
	588	}
	589
	590	/*
		 * Write up to `nbyte` bytes from the user buffer `bufp` to `fp`.
		 *
		 * Builds a one-iovec uio in an on-stack buffer (user-space 64 or 32
		 * flavour depending on the process in `ctx`), hands it to fo_write() with
		 * the caller's `offset` and `flags`, and stores the number of bytes
		 * actually transferred in *retval.
		 *
		 * If fo_write() fails with ERESTART, EINTR or EWOULDBLOCK after some data
		 * has already been transferred, the error is suppressed and the partial
		 * count is reported as success.  On EPIPE for a non-socket file that
		 * does not have FG_NOSIGPIPE set, SIGPIPE is posted to the process in
		 * `ctx`; EPIPE is still returned.
		 *
		 * Requests larger than INT_MAX, or a buffer that uio_addiov() rejects,
		 * fail with EINVAL and *retval set to 0.
		 *
	591	 * Returns:	0			Success
	592	 *		EINVAL
	593	 *	<fo_write>:EPIPE
	594	 *	<fo_write>:???			[indirect through struct fileops]
	595	 */
	596	__private_extern__ int
	597	dofilewrite(vfs_context_t ctx, struct fileproc *fp,
	598	    user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
	599	    user_ssize_t *retval)
	600	{
	601		uio_t auio;
	602		int error = 0;
	603		user_ssize_t bytecnt;
	604		char uio_buf[UIO_SIZEOF(1)];
	605
	606		if (nbyte > INT_MAX) {
	607			*retval = 0;
	608			return EINVAL;
	609		}
	610
	611		if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
	612			auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
	613			    &uio_buf[0], sizeof(uio_buf));
	614		} else {
	615			auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
	616			    &uio_buf[0], sizeof(uio_buf));
	617		}
	618		if (uio_addiov(auio, bufp, nbyte) != 0) {
	619			*retval = 0;
	620			return EINVAL;
	621		}
	622
	623		bytecnt = nbyte;
	624		if ((error = fo_write(fp, auio, flags, ctx))) {
				/* A partial transfer followed by an interruption counts as success. */
	625			if (uio_resid(auio) != bytecnt && (error == ERESTART ||
	626			    error == EINTR || error == EWOULDBLOCK)) {
	627				error = 0;
	628			}
	629			/* The socket layer handles SIGPIPE */
	630			if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
	631			    (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
	632				/* XXX Raise the signal on the thread? */
	633				psignal(vfs_context_proc(ctx), SIGPIPE);
	634			}
	635		}
	636		bytecnt -= uio_resid(auio);
	637		*retval = bytecnt;
	638
	639		return error;
	640	}
	641
	642	/*
	643	 * Gather write system call
		 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * writev_nocancel(), which does the actual work.
	644	 */
	645	int
	646	writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
	647	{
	648		__pthread_testcancel(1);
	649		return writev_nocancel(p, (struct writev_nocancel_args *)uap, retval);
	650	}
	651
	652	int
	653	writev_nocancel(struct proc p, struct writev_nocancel_args uap, user_ssize_t *retval)
	654	{
	655	uio_t auio = NULL;
	656	int error;
	657	struct fileproc *fp;
	658	struct user_iovec *iovp;
	659	bool wrote_some = false;
	660
	661	AUDIT_ARG(fd, uap->fd);
	662
	663	/* Verify range bedfore calling uio_create() */
	664	if (uap->iovcnt <= 0 \|\| uap->iovcnt > UIO_MAXIOV) {
	665	return EINVAL;
	666	}
	667
	668	/* allocate a uio large enough to hold the number of iovecs passed */
	669	auio = uio_create(uap->iovcnt, 0,
	670	(IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
	671	UIO_WRITE);
	672
	673	/* get location of iovecs within the uio. then copyin the iovecs from
	674	* user space.
	675	*/
	676	iovp = uio_iovsaddr(auio);
	677	if (iovp == NULL) {
	678	error = ENOMEM;
	679	goto ExitThisRoutine;
	680	}
	681	error = copyin_user_iovec_array(uap->iovp,
	682	IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
	683	uap->iovcnt, iovp);
	684	if (error) {
	685	goto ExitThisRoutine;
	686	}
	687
	688	/* finalize uio_t for use and do the IO
	689	*/
	690	error = uio_calculateresid(auio);
	691	if (error) {
	692	goto ExitThisRoutine;
	693	}
	694
	695	error = fp_lookup(p, uap->fd, &fp, 0);
	696	if (error) {
	697	goto ExitThisRoutine;
	698	}
	699
	700	if ((fp->f_flag & FWRITE) == 0) {
	701	error = EBADF;
	702	} else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
	703	proc_fdlock(p);
	704	error = fp_guard_exception(p, uap->fd, fp, kGUARD_EXC_WRITE);
	705	proc_fdunlock(p);
	706	} else {
	707	error = wr_uio(p, fp, auio, retval);
	708	wrote_some = *retval > 0;
	709	}
	710
	711	if (wrote_some) {
	712	fp_drop_written(p, uap->fd, fp);
	713	} else {
	714	fp_drop(p, uap->fd, fp, 0);
	715	}
	716
	717	ExitThisRoutine:
	718	if (auio != NULL) {
	719	uio_free(auio);
	720	}
	721	return error;
	722	}
	723
	724
	725	int
	726	wr_uio(struct proc p, struct fileproc fp, uio_t uio, user_ssize_t *retval)
	727	{
	728	int error;
	729	user_ssize_t count;
	730	struct vfs_context context = *vfs_context_current();
	731
	732	count = uio_resid(uio);
	733
	734	context.vc_ucred = fp->f_cred;
	735	error = fo_write(fp, uio, 0, &context);
	736	if (error) {
	737	if (uio_resid(uio) != count && (error == ERESTART \|\|
	738	error == EINTR \|\| error == EWOULDBLOCK)) {
	739	error = 0;
	740	}
	741	/* The socket layer handles SIGPIPE */
	742	if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
	743	(fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
	744	psignal(p, SIGPIPE);
	745	}
	746	}
	747	*retval = count - uio_resid(uio);
	748
	749	return error;
	750	}
	751
	752
	753	int
	754	rd_uio(struct proc p, int fdes, uio_t uio, user_ssize_t retval)
	755	{
	756	struct fileproc *fp;
	757	int error;
	758	user_ssize_t count;
	759	struct vfs_context context = *vfs_context_current();
	760
	761	if ((error = preparefileread(p, &fp, fdes, 0))) {
	762	return error;
	763	}
	764
	765	count = uio_resid(uio);
	766
	767	context.vc_ucred = fp->f_cred;
	768
	769	error = fo_read(fp, uio, 0, &context);
	770
	771	if (error) {
	772	if (uio_resid(uio) != count && (error == ERESTART \|\|
	773	error == EINTR \|\| error == EWOULDBLOCK)) {
	774	error = 0;
	775	}
	776	}
	777	*retval = count - uio_resid(uio);
	778
	779	donefileread(p, fp, fdes);
	780
	781	return error;
	782	}
	783
	784	/*
	785	 * Ioctl system call
	786	 *
		 * Outline:
		 *   1. Take the parameter size from the command word (IOCPARM_LEN) and
		 *      reject sizes above IOCPARM_MAX with ENOTTY.
		 *   2. Pick a parameter buffer: the on-stack stkbuf when the size fits,
		 *      otherwise a kalloc() allocation (freed at out_nofp).
		 *   3. For IOC_IN commands copy the argument in from uap->data; for
		 *      IOC_OUT-only commands zero the buffer; for zero-sized IOC_IN and
		 *      for IOC_VOID commands store the raw uap->data value in the buffer.
		 *   4. Look up the descriptor under the process fd lock, require it to
		 *      be open for reading or writing, and (with CONFIG_MACF) run the
		 *      MAC ioctl check.
		 *   5. Handle the generic FIO* commands inline; hand everything else to
		 *      fo_ioctl() and copy IOC_OUT data back to uap->data on success.
		 *
		 * retval is unused.
		 *
	787	 * Returns:	0			Success
	788	 *		EBADF
	789	 *		ENOTTY
	790	 *		ENOMEM
	791	 *		ESRCH
	792	 *	copyin:EFAULT
	793	 *	copyout:EFAULT
	794	 *	fp_lookup:EBADF			Bad file descriptor
	795	 *	fo_ioctl:???
	796	 */
	797	int
	798	ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
	799	{
	800		struct fileproc *fp = NULL;
	801		int error = 0;
	802		u_int size = 0;
	803		caddr_t datap = NULL, memp = NULL;
	804		boolean_t is64bit = FALSE;
	805		int tmp = 0;
	806	#define STK_PARAMS 128
	807		char stkbuf[STK_PARAMS] = {};
	808		int fd = uap->fd;
	809		u_long com = uap->com;
	810		struct vfs_context context = *vfs_context_current();
	811
	812		AUDIT_ARG(fd, uap->fd);
	813		AUDIT_ARG(addr, uap->data);
	814
	815		is64bit = proc_is64bit(p);
	816	#if CONFIG_AUDIT
	817		if (is64bit) {
	818			AUDIT_ARG(value64, com);
	819		} else {
	820			AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
	821		}
	822	#endif /* CONFIG_AUDIT */
	823
	824		/*
	825		 * Interpret high order word to find amount of data to be
	826		 * copied to/from the user's address space.
	827		 */
	828		size = IOCPARM_LEN(com);
	829		if (size > IOCPARM_MAX) {
	830			return ENOTTY;
	831		}
	832		if (size > sizeof(stkbuf)) {
	833			if ((memp = (caddr_t)kalloc(size)) == 0) {
	834				return ENOMEM;
	835			}
	836			datap = memp;
	837		} else {
	838			datap = &stkbuf[0];
	839		}
	840		if (com & IOC_IN) {
	841			if (size) {
	842				error = copyin(uap->data, datap, size);
	843				if (error) {
	844					goto out_nofp;
	845				}
	846			} else {
	847				/* XXX - IOC_IN and no size?  we should probably return an error here!! */
	848				if (is64bit) {
	849					*(user_addr_t *)datap = uap->data;
	850				} else {
	851					*(uint32_t *)datap = (uint32_t)uap->data;
	852				}
	853			}
	854		} else if ((com & IOC_OUT) && size) {
	855			/*
	856			 * Zero the buffer so the user always
	857			 * gets back something deterministic.
	858			 */
	859			bzero(datap, size);
	860		} else if (com & IOC_VOID) {
	861			/* XXX - this is odd since IOC_VOID means no parameters */
	862			if (is64bit) {
	863				*(user_addr_t *)datap = uap->data;
	864			} else {
	865				*(uint32_t *)datap = (uint32_t)uap->data;
	866			}
	867		}
	868
	869		proc_fdlock(p);
	870		error = fp_lookup(p, fd, &fp, 1);
	871		if (error) {
	872			proc_fdunlock(p);
	873			goto out_nofp;
	874		}
	875
	876		AUDIT_ARG(file, p, fp);
	877
	878		if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
	879			error = EBADF;
	880			goto out;
	881		}
	882
			/* Run the operation with the credential recorded on the open file. */
	883		context.vc_ucred = fp->f_fglob->fg_cred;
	884
	885	#if CONFIG_MACF
	886		error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
	887		if (error) {
	888			goto out;
	889		}
	890	#endif
	891
	892		switch (com) {
	893		case FIONCLEX:
	894			*fdflags(p, fd) &= ~UF_EXCLOSE;
	895			break;
	896
	897		case FIOCLEX:
	898			*fdflags(p, fd) |= UF_EXCLOSE;
	899			break;
	900
	901		case FIONBIO:
				/* Mirror the request in f_flag, then let the file type act on it. */
	902			if ((tmp = *(int *)datap)) {
	903				fp->f_flag |= FNONBLOCK;
	904			} else {
	905				fp->f_flag &= ~FNONBLOCK;
	906			}
	907			error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
	908			break;
	909
	910		case FIOASYNC:
	911			if ((tmp = *(int *)datap)) {
	912				fp->f_flag |= FASYNC;
	913			} else {
	914				fp->f_flag &= ~FASYNC;
	915			}
	916			error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
	917			break;
	918
	919		case FIOSETOWN:
	920			tmp = *(int *)datap;
	921			if (fp->f_type == DTYPE_SOCKET) {
	922				((struct socket *)fp->f_data)->so_pgid = tmp;
	923				break;
	924			}
	925			if (fp->f_type == DTYPE_PIPE) {
	926				error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
	927				break;
	928			}
				/*
				 * Other file types: a non-positive value is negated and passed on
				 * as-is; a positive value is treated as a pid and replaced by
				 * that process's p_pgrpid before issuing TIOCSPGRP.
				 */
	929			if (tmp <= 0) {
	930				tmp = -tmp;
	931			} else {
	932				struct proc *p1 = proc_find(tmp);
	933				if (p1 == 0) {
	934					error = ESRCH;
	935					break;
	936				}
	937				tmp = p1->p_pgrpid;
	938				proc_rele(p1);
	939			}
	940			error = fo_ioctl(fp, TIOCSPGRP, (caddr_t)&tmp, &context);
	941			break;
	942
	943		case FIOGETOWN:
				/*
				 * NOTE(review): the value stored in datap on this path is never
				 * copied out to uap->data (copyout only happens in the default
				 * case below) -- confirm whether that is intended.
				 */
	944			if (fp->f_type == DTYPE_SOCKET) {
	945				*(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
	946				break;
	947			}
	948			error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
	949			*(int *)datap = -*(int *)datap;
	950			break;
	951
	952		default:
	953			error = fo_ioctl(fp, com, datap, &context);
	954			/*
	955			 * Copy any data to user, size was
	956			 * already set and checked above.
	957			 */
	958			if (error == 0 && (com & IOC_OUT) && size) {
	959				error = copyout(datap, uap->data, (u_int)size);
	960			}
	961			break;
	962		}
	963	out:
	964		fp_drop(p, fd, fp, 1);
	965		proc_fdunlock(p);
	966
	967	out_nofp:
			/* memp is only non-NULL when the parameter buffer came from kalloc(). */
	968		if (memp) {
	969			kfree(memp, size);
	970		}
	971		return error;
	972	}
	973
	974	int selwait, nselcoll;
		/* Pass identifiers handed to selprocess()/selscan() as sel_pass. */
	975	#define SEL_FIRSTPASS 1
	976	#define SEL_SECONDPASS 2
	977	extern int selcontinue(int error);
	978	extern int selprocess(int error, int sel_pass);
	979	static int selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
	980	    int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
	981	static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
	982	static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
	983	static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
	984	static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
	985
	986	/*
	987	 * Select system call.
	988	 *
		 * Calls __pthread_testcancel(1), then forwards the same arguments to
		 * select_nocancel(), which does the actual work.
		 *
	989	 * Returns:	0			Success
	990	 *		EINVAL			Invalid argument
	991	 *		EAGAIN			Nonconformant error if allocation fails
	992	 */
	993	int
	994	select(struct proc *p, struct select_args *uap, int32_t *retval)
	995	{
	996		__pthread_testcancel(1);
	997		return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
	998	}
	999
	1000	int
	1001	select_nocancel(struct proc p, struct select_nocancel_args uap, int32_t *retval)
	1002	{
	1003	uint64_t timeout = 0;
	1004
	1005	if (uap->tv) {
	1006	int err;
	1007	struct timeval atv;
	1008	if (IS_64BIT_PROCESS(p)) {
	1009	struct user64_timeval atv64;
	1010	err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
	1011	/* Loses resolution - assume timeout < 68 years */
	1012	atv.tv_sec = atv64.tv_sec;
	1013	atv.tv_usec = atv64.tv_usec;
	1014	} else {
	1015	struct user32_timeval atv32;
	1016	err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
	1017	atv.tv_sec = atv32.tv_sec;
	1018	atv.tv_usec = atv32.tv_usec;
	1019	}
	1020	if (err) {
	1021	return err;
	1022	}
	1023
	1024	if (itimerfix(&atv)) {
	1025	err = EINVAL;
	1026	return err;
	1027	}
	1028
	1029	clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
	1030	}
	1031
	1032	return select_internal(p, uap, timeout, retval);
	1033	}
	1034
	1035	int
	1036	pselect(struct proc p, struct pselect_args uap, int32_t *retval)
	1037	{
	1038	__pthread_testcancel(1);
	1039	return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
	1040	}
	1041
	1042	int
	1043	pselect_nocancel(struct proc p, struct pselect_nocancel_args uap, int32_t *retval)
	1044	{
	1045	int err;
	1046	struct uthread *ut;
	1047	uint64_t timeout = 0;
	1048
	1049	if (uap->ts) {
	1050	struct timespec ts;
	1051
	1052	if (IS_64BIT_PROCESS(p)) {
	1053	struct user64_timespec ts64;
	1054	err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
	1055	ts.tv_sec = ts64.tv_sec;
	1056	ts.tv_nsec = ts64.tv_nsec;
	1057	} else {
	1058	struct user32_timespec ts32;
	1059	err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
	1060	ts.tv_sec = ts32.tv_sec;
	1061	ts.tv_nsec = ts32.tv_nsec;
	1062	}
	1063	if (err) {
	1064	return err;
	1065	}
	1066
	1067	if (!timespec_is_valid(&ts)) {
	1068	return EINVAL;
	1069	}
	1070	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
	1071	}
	1072
	1073	ut = get_bsdthread_info(current_thread());
	1074
	1075	if (uap->mask != USER_ADDR_NULL) {
	1076	/* save current mask, then copyin and set new mask */
	1077	sigset_t newset;
	1078	err = copyin(uap->mask, &newset, sizeof(sigset_t));
	1079	if (err) {
	1080	return err;
	1081	}
	1082	ut->uu_oldmask = ut->uu_sigmask;
	1083	ut->uu_flag \|= UT_SAS_OLDMASK;
	1084	ut->uu_sigmask = (newset & ~sigcantmask);
	1085	}
	1086
	1087	err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
	1088
	1089	if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
	1090	/*
	1091	* Restore old mask (direct return case). NOTE: EINTR can also be returned
	1092	* if the thread is cancelled. In that case, we don't reset the signal
	1093	* mask to its original value (which usually happens in the signal
	1094	* delivery path). This behavior is permitted by POSIX.
	1095	*/
	1096	ut->uu_sigmask = ut->uu_oldmask;
	1097	ut->uu_oldmask = 0;
	1098	ut->uu_flag &= ~UT_SAS_OLDMASK;
	1099	}
	1100
	1101	return err;
	1102	}
	1103
	1104	/*
	1105	* Generic implementation of {,p}select. Care: we type-pun uap across the two
	1106	* syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
	1107	* are identical. The 5th (timeout) argument points to different types, so we
	1108	* unpack in the syscall-specific code, but the generic code still does a null
	1109	* check on this argument to determine if a timeout was specified.
	1110	*/
	1111	static int
	1112	select_internal(struct proc p, struct select_nocancel_args uap, uint64_t timeout, int32_t *retval)
	1113	{
	1114	int error = 0;
	1115	u_int ni, nw;
	1116	thread_t th_act;
	1117	struct uthread *uth;
	1118	struct _select *sel;
	1119	struct _select_data *seldata;
	1120	int needzerofill = 1;
	1121	int count = 0;
	1122	size_t sz = 0;
	1123
	1124	th_act = current_thread();
	1125	uth = get_bsdthread_info(th_act);
	1126	sel = &uth->uu_select;
	1127	seldata = &uth->uu_save.uus_select_data;
	1128	*retval = 0;
	1129
	1130	seldata->args = uap;
	1131	seldata->retval = retval;
	1132	seldata->wqp = NULL;
	1133	seldata->count = 0;
	1134
	1135	if (uap->nd < 0) {
	1136	return EINVAL;
	1137	}
	1138
	1139	/* select on thread of process that already called proc_exit() */
	1140	if (p->p_fd == NULL) {
	1141	return EBADF;
	1142	}
	1143
	1144	if (uap->nd > p->p_fd->fd_nfiles) {
	1145	uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	1146	}
	1147	nw = howmany(uap->nd, NFDBITS);
	1148	ni = nw * sizeof(fd_mask);
	1149
	1150	/*
	1151	* if the previously allocated space for the bits is smaller than
	1152	* what is requested or no space has yet been allocated for this
	1153	* thread, allocate enough space now.
	1154	*
	1155	* Note: If this process fails, select() will return EAGAIN; this
	1156	* is the same thing pool() returns in a no-memory situation, but
	1157	* it is not a POSIX compliant error code for select().
	1158	*/
	1159	if (sel->nbytes < (3 * ni)) {
	1160	int nbytes = 3 * ni;
	1161
	1162	/* Free previous allocation, if any */
	1163	if (sel->ibits != NULL) {
	1164	FREE(sel->ibits, M_TEMP);
	1165	}
	1166	if (sel->obits != NULL) {
	1167	FREE(sel->obits, M_TEMP);
	1168	/* NULL out; subsequent ibits allocation may fail */
	1169	sel->obits = NULL;
	1170	}
	1171
	1172	MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK \| M_ZERO);
	1173	if (sel->ibits == NULL) {
	1174	return EAGAIN;
	1175	}
	1176	MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK \| M_ZERO);
	1177	if (sel->obits == NULL) {
	1178	FREE(sel->ibits, M_TEMP);
	1179	sel->ibits = NULL;
	1180	return EAGAIN;
	1181	}
	1182	sel->nbytes = nbytes;
	1183	needzerofill = 0;
	1184	}
	1185
	1186	if (needzerofill) {
	1187	bzero((caddr_t)sel->ibits, sel->nbytes);
	1188	bzero((caddr_t)sel->obits, sel->nbytes);
	1189	}
	1190
	1191	/*
	1192	* get the bits from the user address space
	1193	*/
	1194	#define getbits(name, x) \
	1195	do { \
	1196	if (uap->name && (error = copyin(uap->name, \
	1197	(caddr_t)&sel->ibits[(x) * nw], ni))) \
	1198	goto continuation; \
	1199	} while (0)
	1200
	1201	getbits(in, 0);
	1202	getbits(ou, 1);
	1203	getbits(ex, 2);
	1204	#undef getbits
	1205
	1206	seldata->abstime = timeout;
	1207
	1208	if ((error = selcount(p, sel->ibits, uap->nd, &count))) {
	1209	goto continuation;
	1210	}
	1211
	1212	/*
	1213	* We need an array of waitq pointers. This is due to the new way
	1214	* in which waitqs are linked to sets. When a thread selects on a
	1215	* file descriptor, a waitq (embedded in a selinfo structure) is
	1216	* added to the thread's local waitq set. There is no longer any
	1217	* way to directly iterate over all members of a given waitq set.
	1218	* The process of linking a waitq into a set may allocate a link
	1219	* table object. Because we can't iterate over all the waitqs to
	1220	* which our thread waitq set belongs, we need a way of removing
	1221	* this link object!
	1222	*
	1223	* Thus we need a buffer which will hold one waitq pointer
	1224	* per FD being selected. During the tear-down phase we can use
	1225	* these pointers to dis-associate the underlying selinfo's waitq
	1226	* from our thread's waitq set.
	1227	*
	1228	* Because we also need to allocate a waitq set for this thread,
	1229	* we use a bare buffer pointer to hold all the memory. Note that
	1230	* this memory is cached in the thread pointer and not reaped until
	1231	* the thread exists. This is generally OK because threads that
	1232	* call select tend to keep calling select repeatedly.
	1233	*/
	1234	sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
	1235	if (sz > uth->uu_wqstate_sz) {
	1236	/* (re)allocate a buffer to hold waitq pointers */
	1237	if (uth->uu_wqset) {
	1238	if (waitq_set_is_valid(uth->uu_wqset)) {
	1239	waitq_set_deinit(uth->uu_wqset);
	1240	}
	1241	FREE(uth->uu_wqset, M_SELECT);
	1242	} else if (uth->uu_wqstate_sz && !uth->uu_wqset) {
	1243	panic("select: thread structure corrupt! "
	1244	"uu_wqstate_sz:%ld, wqstate_buf == NULL",
	1245	uth->uu_wqstate_sz);
	1246	}
	1247	uth->uu_wqstate_sz = sz;
	1248	MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
	1249	if (!uth->uu_wqset) {
	1250	panic("can't allocate %ld bytes for wqstate buffer",
	1251	uth->uu_wqstate_sz);
	1252	}
	1253	waitq_set_init(uth->uu_wqset,
	1254	SYNC_POLICY_FIFO \| SYNC_POLICY_PREPOST, NULL, NULL);
	1255	}
	1256
	1257	if (!waitq_set_is_valid(uth->uu_wqset)) {
	1258	waitq_set_init(uth->uu_wqset,
	1259	SYNC_POLICY_FIFO \| SYNC_POLICY_PREPOST, NULL, NULL);
	1260	}
	1261
	1262	/* the last chunk of our buffer is an array of waitq pointers */
	1263	seldata->wqp = (uint64_t )((char )(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
	1264	bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
	1265
	1266	seldata->count = count;
	1267
	1268	continuation:
	1269
	1270	if (error) {
	1271	/*
	1272	* We have already cleaned up any state we established,
	1273	* either locally or as a result of selcount(). We don't
	1274	* need to wait_subqueue_unlink_all(), since we haven't set
	1275	* anything at this point.
	1276	*/
	1277	return error;
	1278	}
	1279
	1280	return selprocess(0, SEL_FIRSTPASS);
	1281	}
	1282
	1283	int
	1284	selcontinue(int error)
	1285	{
	1286	return selprocess(error, SEL_SECONDPASS);
	1287	}
	1288
	1289
	1290	/*
	1291	* selprocess
	1292	*
	1293	* Parameters: error The error code from our caller
	1294	* sel_pass The pass we are on
	1295	*/
	1296	int
	1297	selprocess(int error, int sel_pass)
	1298	{
	1299	int ncoll;
	1300	u_int ni, nw;
	1301	thread_t th_act;
	1302	struct uthread *uth;
	1303	struct proc *p;
	1304	struct select_nocancel_args *uap;
	1305	int *retval;
	1306	struct _select *sel;
	1307	struct _select_data *seldata;
	1308	int unwind = 1;
	1309	int prepost = 0;
	1310	int somewakeup = 0;
	1311	int doretry = 0;
	1312	wait_result_t wait_result;
	1313
	1314	p = current_proc();
	1315	th_act = current_thread();
	1316	uth = get_bsdthread_info(th_act);
	1317	sel = &uth->uu_select;
	1318	seldata = &uth->uu_save.uus_select_data;
	1319	uap = seldata->args;
	1320	retval = seldata->retval;
	1321
	1322	if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) {
	1323	unwind = 0;
	1324	}
	1325	if (seldata->count == 0) {
	1326	unwind = 0;
	1327	}
	1328	retry:
	1329	if (error != 0) {
	1330	goto done;
	1331	}
	1332
	1333	ncoll = nselcoll;
	1334	OSBitOrAtomic(P_SELECT, &p->p_flag);
	1335
	1336	/* skip scans if the select is just for timeouts */
	1337	if (seldata->count) {
	1338	error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
	1339	if (error \|\| *retval) {
	1340	goto done;
	1341	}
	1342	if (prepost \|\| somewakeup) {
	1343	/*
	1344	* if the select of log, then we can wakeup and
	1345	* discover some one else already read the data;
	1346	* go to select again if time permits
	1347	*/
	1348	prepost = 0;
	1349	somewakeup = 0;
	1350	doretry = 1;
	1351	}
	1352	}
	1353
	1354	if (uap->tv) {
	1355	uint64_t now;
	1356
	1357	clock_get_uptime(&now);
	1358	if (now >= seldata->abstime) {
	1359	goto done;
	1360	}
	1361	}
	1362
	1363	if (doretry) {
	1364	/* cleanup obits and try again */
	1365	doretry = 0;
	1366	sel_pass = SEL_FIRSTPASS;
	1367	goto retry;
	1368	}
	1369
	1370	/*
	1371	* To effect a poll, the timeout argument should be
	1372	* non-nil, pointing to a zero-valued timeval structure.
	1373	*/
	1374	if (uap->tv && seldata->abstime == 0) {
	1375	goto done;
	1376	}
	1377
	1378	/* No spurious wakeups due to colls,no need to check for them */
	1379	if ((sel_pass == SEL_SECONDPASS) \|\| ((p->p_flag & P_SELECT) == 0)) {
	1380	sel_pass = SEL_FIRSTPASS;
	1381	goto retry;
	1382	}
	1383
	1384	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1385
	1386	/* if the select is just for timeout skip check */
	1387	if (seldata->count && (sel_pass == SEL_SECONDPASS)) {
	1388	panic("selprocess: 2nd pass assertwaiting");
	1389	}
	1390
	1391	/* waitq_set has waitqueue as first element */
	1392	wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
	1393	NO_EVENT64, THREAD_ABORTSAFE,
	1394	TIMEOUT_URGENCY_USER_NORMAL,
	1395	seldata->abstime,
	1396	TIMEOUT_NO_LEEWAY);
	1397	if (wait_result != THREAD_AWAKENED) {
	1398	/* there are no preposted events */
	1399	error = tsleep1(NULL, PSOCK \| PCATCH,
	1400	"select", 0, selcontinue);
	1401	} else {
	1402	prepost = 1;
	1403	error = 0;
	1404	}
	1405
	1406	if (error == 0) {
	1407	sel_pass = SEL_SECONDPASS;
	1408	if (!prepost) {
	1409	somewakeup = 1;
	1410	}
	1411	goto retry;
	1412	}
	1413	done:
	1414	if (unwind) {
	1415	seldrop(p, sel->ibits, uap->nd);
	1416	waitq_set_deinit(uth->uu_wqset);
	1417	/*
	1418	* zero out the waitq pointer array to avoid use-after free
	1419	* errors in the selcount error path (seldrop_locked) if/when
	1420	* the thread re-calls select().
	1421	*/
	1422	bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
	1423	}
	1424	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1425	/* select is not restarted after signals... */
	1426	if (error == ERESTART) {
	1427	error = EINTR;
	1428	}
	1429	if (error == EWOULDBLOCK) {
	1430	error = 0;
	1431	}
	1432	nw = howmany(uap->nd, NFDBITS);
	1433	ni = nw * sizeof(fd_mask);
	1434
	1435	#define putbits(name, x) \
	1436	do { \
	1437	if (uap->name && (error2 = \
	1438	copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
	1439	error = error2; \
	1440	} while (0)
	1441
	1442	if (error == 0) {
	1443	int error2;
	1444
	1445	putbits(in, 0);
	1446	putbits(ou, 1);
	1447	putbits(ex, 2);
	1448	#undef putbits
	1449	}
	1450
	1451	if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
	1452	/* restore signal mask - continuation case */
	1453	uth->uu_sigmask = uth->uu_oldmask;
	1454	uth->uu_oldmask = 0;
	1455	uth->uu_flag &= ~UT_SAS_OLDMASK;
	1456	}
	1457
	1458	return error;
	1459	}
	1460
	1461
	1462	/**
	1463	* remove the fileproc's underlying waitq from the supplied waitq set;
	1464	* clear FP_INSELECT when appropriate
	1465	*
	1466	* Parameters:
	1467	* fp File proc that is potentially currently in select
	1468	* wqset Waitq set to which the fileproc may belong
	1469	* (usually this is the thread's private waitq set)
	1470	* Conditions:
	1471	* proc_fdlock is held
	1472	*/
	1473	static void
	1474	selunlinkfp(struct fileproc fp, uint64_t wqp_id, struct waitq_set wqset)
	1475	{
	1476	int valid_set = waitq_set_is_valid(wqset);
	1477	int valid_q = !!wqp_id;
	1478
	1479	/*
	1480	* This could be called (from selcount error path) before we setup
	1481	* the thread's wqset. Check the wqset passed in, and only unlink if
	1482	* the set is valid.
	1483	*/
	1484
	1485	/* unlink the underlying waitq from the input set (thread waitq set) */
	1486	if (valid_q && valid_set) {
	1487	waitq_unlink_by_prepost_id(wqp_id, wqset);
	1488	}
	1489
	1490	/* allow passing a NULL/invalid fp for seldrop unwind */
	1491	if (!fp \|\| !(fp->f_flags & (FP_INSELECT \| FP_SELCONFLICT))) {
	1492	return;
	1493	}
	1494
	1495	/*
	1496	* We can always remove the conflict queue from our thread's set: this
	1497	* will not affect other threads that potentially need to be awoken on
	1498	* the conflict queue during a fileproc_drain - those sets will still
	1499	* be linked with the global conflict queue, and the last waiter
	1500	* on the fp clears the CONFLICT marker.
	1501	*/
	1502	if (valid_set && (fp->f_flags & FP_SELCONFLICT)) {
	1503	waitq_unlink(&select_conflict_queue, wqset);
	1504	}
	1505
	1506	/* jca: TODO:
	1507	* This isn't quite right - we don't actually know if this
	1508	* fileproc is in another select or not! Here we just assume
	1509	* that if we were the first thread to select on the FD, then
	1510	* we'll be the one to clear this flag...
	1511	*/
	1512	if (valid_set && fp->f_wset == (void *)wqset) {
	1513	fp->f_flags &= ~FP_INSELECT;
	1514	fp->f_wset = NULL;
	1515	}
	1516	}
	1517
	1518	/**
	1519	* connect a fileproc to the given wqset, potentially bridging to a waitq
	1520	* pointed to indirectly by wq_data
	1521	*
	1522	* Parameters:
	1523	* fp File proc potentially currently in select
	1524	* wq_data Pointer to a pointer to a waitq (could be NULL)
	1525	* wqset Waitq set to which the fileproc should now belong
	1526	* (usually this is the thread's private waitq set)
	1527	*
	1528	* Conditions:
	1529	* proc_fdlock is held
	1530	*/
	1531	static uint64_t
	1532	sellinkfp(struct fileproc fp, void wq_data, struct waitq_set wqset)
	1533	{
	1534	struct waitq *f_wq = NULL;
	1535
	1536	if ((fp->f_flags & FP_INSELECT) != FP_INSELECT) {
	1537	if (wq_data) {
	1538	panic("non-null data:%p on fp:%p not in select?!"
	1539	"(wqset:%p)", wq_data, fp, wqset);
	1540	}
	1541	return 0;
	1542	}
	1543
	1544	if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
	1545	waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
	1546	}
	1547
	1548	/*
	1549	* The wq_data parameter has potentially been set by selrecord called
	1550	* from a subsystems fo_select() function. If the subsystem does not
	1551	* call selrecord, then wq_data will be NULL
	1552	*
	1553	* Use memcpy to get the value into a proper pointer because
	1554	* wq_data most likely points to a stack variable that could be
	1555	* unaligned on 32-bit systems.
	1556	*/
	1557	if (wq_data) {
	1558	memcpy(&f_wq, wq_data, sizeof(f_wq));
	1559	if (!waitq_is_valid(f_wq)) {
	1560	f_wq = NULL;
	1561	}
	1562	}
	1563
	1564	/* record the first thread's wqset in the fileproc structure */
	1565	if (!fp->f_wset) {
	1566	fp->f_wset = (void *)wqset;
	1567	}
	1568
	1569	/* handles NULL f_wq */
	1570	return waitq_get_prepost_id(f_wq);
	1571	}
	1572
	1573
	1574	/*
	1575	* selscan
	1576	*
	1577	* Parameters: p Process performing the select
	1578	* sel The per-thread select context structure
	1579	* nfd The number of file descriptors to scan
	1580	* retval The per thread system call return area
	1581	* sel_pass Which pass this is; allowed values are
	1582	* SEL_FIRSTPASS and SEL_SECONDPASS
	1583	* wqset The per thread wait queue set
	1584	*
	1585	* Returns: 0 Success
	1586	* EIO Invalid p->p_fd field XXX Obsolete?
	1587	* EBADF One of the files in the bit vector is
	1588	* invalid.
	1589	*/
	1590	static int
	1591	selscan(struct proc p, struct _select sel, struct _select_data * seldata,
	1592	int nfd, int32_t retval, int sel_pass, struct waitq_set wqset)
	1593	{
	1594	struct filedesc *fdp = p->p_fd;
	1595	int msk, i, j, fd;
	1596	u_int32_t bits;
	1597	struct fileproc *fp;
	1598	int n = 0; /* count of bits */
	1599	int nc = 0; /* bit vector offset (nc'th bit) */
	1600	static int flag[3] = { FREAD, FWRITE, 0 };
	1601	u_int32_t iptr, optr;
	1602	u_int nw;
	1603	u_int32_t ibits, obits;
	1604	uint64_t reserved_link, *rl_ptr = NULL;
	1605	int count;
	1606	struct vfs_context context = *vfs_context_current();
	1607
	1608	/*
	1609	* Problems when reboot; due to MacOSX signal probs
	1610	* in Beaker1C ; verify that the p->p_fd is valid
	1611	*/
	1612	if (fdp == NULL) {
	1613	*retval = 0;
	1614	return EIO;
	1615	}
	1616	ibits = sel->ibits;
	1617	obits = sel->obits;
	1618
	1619	nw = howmany(nfd, NFDBITS);
	1620
	1621	count = seldata->count;
	1622
	1623	nc = 0;
	1624	if (!count) {
	1625	*retval = 0;
	1626	return 0;
	1627	}
	1628
	1629	proc_fdlock(p);
	1630	for (msk = 0; msk < 3; msk++) {
	1631	iptr = (u_int32_t )&ibits[msk nw];
	1632	optr = (u_int32_t )&obits[msk nw];
	1633
	1634	for (i = 0; i < nfd; i += NFDBITS) {
	1635	bits = iptr[i / NFDBITS];
	1636
	1637	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	1638	bits &= ~(1U << j);
	1639
	1640	if (fd < fdp->fd_nfiles) {
	1641	fp = fdp->fd_ofiles[fd];
	1642	} else {
	1643	fp = NULL;
	1644	}
	1645
	1646	if (fp == NULL \|\| (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
	1647	/*
	1648	* If we abort because of a bad
	1649	* fd, let the caller unwind...
	1650	*/
	1651	proc_fdunlock(p);
	1652	return EBADF;
	1653	}
	1654	if (sel_pass == SEL_SECONDPASS) {
	1655	reserved_link = 0;
	1656	rl_ptr = NULL;
	1657	selunlinkfp(fp, seldata->wqp[nc], wqset);
	1658	} else {
	1659	reserved_link = waitq_link_reserve((struct waitq *)wqset);
	1660	rl_ptr = &reserved_link;
	1661	if (fp->f_flags & FP_INSELECT) {
	1662	/* someone is already in select on this fp */
	1663	fp->f_flags \|= FP_SELCONFLICT;
	1664	} else {
	1665	fp->f_flags \|= FP_INSELECT;
	1666	}
	1667
	1668	waitq_set_lazy_init_link(wqset);
	1669	}
	1670
	1671	context.vc_ucred = fp->f_cred;
	1672
	1673	/*
	1674	* stash this value b/c fo_select may replace
	1675	* reserved_link with a pointer to a waitq object
	1676	*/
	1677	uint64_t rsvd = reserved_link;
	1678
	1679	/* The select; set the bit, if true */
	1680	if (fp->f_ops && fp->f_type
	1681	&& fo_select(fp, flag[msk], rl_ptr, &context)) {
	1682	optr[fd / NFDBITS] \|= (1U << (fd % NFDBITS));
	1683	n++;
	1684	}
	1685	if (sel_pass == SEL_FIRSTPASS) {
	1686	waitq_link_release(rsvd);
	1687	/*
	1688	* If the fp's supporting selinfo structure was linked
	1689	* to this thread's waitq set, then 'reserved_link'
	1690	* will have been updated by selrecord to be a pointer
	1691	* to the selinfo's waitq.
	1692	*/
	1693	if (reserved_link == rsvd) {
	1694	rl_ptr = NULL; /* fo_select never called selrecord() */
	1695	}
	1696	/*
	1697	* Hook up the thread's waitq set either to
	1698	* the fileproc structure, or to the global
	1699	* conflict queue: but only on the first
	1700	* select pass.
	1701	*/
	1702	seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
	1703	}
	1704	nc++;
	1705	}
	1706	}
	1707	}
	1708	proc_fdunlock(p);
	1709
	1710	*retval = n;
	1711	return 0;
	1712	}
	1713
	1714	static int poll_callback(struct kevent_qos_s *, kevent_ctx_t);
	1715
	/*
	 * poll
	 *
	 * Cancellable entry point: runs the pthread cancellation check and then
	 * hands the (type-punned) argument structure to poll_nocancel().
	 */
	int
	poll(struct proc *p, struct poll_args *uap, int32_t *retval)
	{
		__pthread_testcancel(1);
		return poll_nocancel(p, (struct poll_nocancel_args *)uap, retval);
	}
	1722
	1723
	1724	int
	1725	poll_nocancel(struct proc p, struct poll_nocancel_args uap, int32_t *retval)
	1726	{
	1727	struct pollfd *fds = NULL;
	1728	struct kqueue *kq = NULL;
	1729	int ncoll, error = 0;
	1730	u_int nfds = uap->nfds;
	1731	u_int rfds = 0;
	1732
	1733	/*
	1734	* This is kinda bogus. We have fd limits, but that is not
	1735	* really related to the size of the pollfd array. Make sure
	1736	* we let the process use at least FD_SETSIZE entries and at
	1737	* least enough for the current limits. We want to be reasonably
	1738	* safe, but not overly restrictive.
	1739	*/
	1740	if (nfds > OPEN_MAX \|\|
	1741	(nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) \|\| nfds > FD_SETSIZE))) {
	1742	return EINVAL;
	1743	}
	1744
	1745	kq = kqueue_alloc(p);
	1746	if (kq == NULL) {
	1747	return EAGAIN;
	1748	}
	1749
	1750	if (nfds) {
	1751	size_t ni = nfds * sizeof(struct pollfd);
	1752	MALLOC(fds, struct pollfd *, ni, M_TEMP, M_WAITOK);
	1753	if (NULL == fds) {
	1754	error = EAGAIN;
	1755	goto out;
	1756	}
	1757
	1758	error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
	1759	if (error) {
	1760	goto out;
	1761	}
	1762	}
	1763
	1764	/* JMM - all this P_SELECT stuff is bogus */
	1765	ncoll = nselcoll;
	1766	OSBitOrAtomic(P_SELECT, &p->p_flag);
	1767	for (u_int i = 0; i < nfds; i++) {
	1768	short events = fds[i].events;
	1769	__assert_only int rc;
	1770
	1771	/* per spec, ignore fd values below zero */
	1772	if (fds[i].fd < 0) {
	1773	fds[i].revents = 0;
	1774	continue;
	1775	}
	1776
	1777	/* convert the poll event into a kqueue kevent */
	1778	struct kevent_qos_s kev = {
	1779	.ident = fds[i].fd,
	1780	.flags = EV_ADD \| EV_ONESHOT \| EV_POLL,
	1781	.udata = CAST_USER_ADDR_T(&fds[i])
	1782	};
	1783
	1784	/* Handle input events */
	1785	if (events & (POLLIN \| POLLRDNORM \| POLLPRI \| POLLRDBAND \| POLLHUP)) {
	1786	kev.filter = EVFILT_READ;
	1787	if (events & (POLLPRI \| POLLRDBAND)) {
	1788	kev.flags \|= EV_OOBAND;
	1789	}
	1790	rc = kevent_register(kq, &kev, NULL);
	1791	assert((rc & FILTER_REGISTER_WAIT) == 0);
	1792	}
	1793
	1794	/* Handle output events */
	1795	if ((kev.flags & EV_ERROR) == 0 &&
	1796	(events & (POLLOUT \| POLLWRNORM \| POLLWRBAND))) {
	1797	kev.filter = EVFILT_WRITE;
	1798	rc = kevent_register(kq, &kev, NULL);
	1799	assert((rc & FILTER_REGISTER_WAIT) == 0);
	1800	}
	1801
	1802	/* Handle BSD extension vnode events */
	1803	if ((kev.flags & EV_ERROR) == 0 &&
	1804	(events & (POLLEXTEND \| POLLATTRIB \| POLLNLINK \| POLLWRITE))) {
	1805	kev.filter = EVFILT_VNODE;
	1806	kev.fflags = 0;
	1807	if (events & POLLEXTEND) {
	1808	kev.fflags \|= NOTE_EXTEND;
	1809	}
	1810	if (events & POLLATTRIB) {
	1811	kev.fflags \|= NOTE_ATTRIB;
	1812	}
	1813	if (events & POLLNLINK) {
	1814	kev.fflags \|= NOTE_LINK;
	1815	}
	1816	if (events & POLLWRITE) {
	1817	kev.fflags \|= NOTE_WRITE;
	1818	}
	1819	rc = kevent_register(kq, &kev, NULL);
	1820	assert((rc & FILTER_REGISTER_WAIT) == 0);
	1821	}
	1822
	1823	if (kev.flags & EV_ERROR) {
	1824	fds[i].revents = POLLNVAL;
	1825	rfds++;
	1826	} else {
	1827	fds[i].revents = 0;
	1828	}
	1829	}
	1830
	1831	/*
	1832	* Did we have any trouble registering?
	1833	* If user space passed 0 FDs, then respect any timeout value passed.
	1834	* This is an extremely inefficient sleep. If user space passed one or
	1835	* more FDs, and we had trouble registering _all_ of them, then bail
	1836	* out. If a subset of the provided FDs failed to register, then we
	1837	* will still call the kqueue_scan function.
	1838	*/
	1839	if (nfds && (rfds == nfds)) {
	1840	goto done;
	1841	}
	1842
	1843	/* scan for, and possibly wait for, the kevents to trigger */
	1844	kevent_ctx_t kectx = kevent_get_context(current_thread());
	1845	*kectx = (struct kevent_ctx_s){
	1846	.kec_process_noutputs = rfds,
	1847	.kec_process_flags = KEVENT_FLAG_POLL,
	1848	.kec_deadline = 0, /* wait forever */
	1849	};
	1850
	1851	/*
	1852	* If any events have trouble registering, an event has fired and we
	1853	* shouldn't wait for events in kqueue_scan.
	1854	*/
	1855	if (rfds) {
	1856	kectx->kec_process_flags \|= KEVENT_FLAG_IMMEDIATE;
	1857	} else if (uap->timeout != -1) {
	1858	clock_interval_to_deadline(uap->timeout, NSEC_PER_MSEC,
	1859	&kectx->kec_deadline);
	1860	}
	1861
	1862	error = kqueue_scan(kq, kectx->kec_process_flags, kectx, poll_callback);
	1863	rfds = kectx->kec_process_noutputs;
	1864
	1865	done:
	1866	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1867	/* poll is not restarted after signals... */
	1868	if (error == ERESTART) {
	1869	error = EINTR;
	1870	}
	1871	if (error == 0) {
	1872	error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
	1873	*retval = rfds;
	1874	}
	1875
	1876	out:
	1877	if (NULL != fds) {
	1878	FREE(fds, M_TEMP);
	1879	}
	1880
	1881	kqueue_dealloc(kq);
	1882	return error;
	1883	}
	1884
	1885	static int
	1886	poll_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
	1887	{
	1888	struct pollfd fds = CAST_DOWN(struct pollfd , kevp->udata);
	1889	short prev_revents = fds->revents;
	1890	short mask = 0;
	1891
	1892	/* convert the results back into revents */
	1893	if (kevp->flags & EV_EOF) {
	1894	fds->revents \|= POLLHUP;
	1895	}
	1896	if (kevp->flags & EV_ERROR) {
	1897	fds->revents \|= POLLERR;
	1898	}
	1899
	1900	switch (kevp->filter) {
	1901	case EVFILT_READ:
	1902	if (fds->revents & POLLHUP) {
	1903	mask = (POLLIN \| POLLRDNORM \| POLLPRI \| POLLRDBAND);
	1904	} else {
	1905	mask = (POLLIN \| POLLRDNORM);
	1906	if (kevp->flags & EV_OOBAND) {
	1907	mask \|= (POLLPRI \| POLLRDBAND);
	1908	}
	1909	}
	1910	fds->revents \|= (fds->events & mask);
	1911	break;
	1912
	1913	case EVFILT_WRITE:
	1914	if (!(fds->revents & POLLHUP)) {
	1915	fds->revents \|= (fds->events & (POLLOUT \| POLLWRNORM \| POLLWRBAND));
	1916	}
	1917	break;
	1918
	1919	case EVFILT_VNODE:
	1920	if (kevp->fflags & NOTE_EXTEND) {
	1921	fds->revents \|= (fds->events & POLLEXTEND);
	1922	}
	1923	if (kevp->fflags & NOTE_ATTRIB) {
	1924	fds->revents \|= (fds->events & POLLATTRIB);
	1925	}
	1926	if (kevp->fflags & NOTE_LINK) {
	1927	fds->revents \|= (fds->events & POLLNLINK);
	1928	}
	1929	if (kevp->fflags & NOTE_WRITE) {
	1930	fds->revents \|= (fds->events & POLLWRITE);
	1931	}
	1932	break;
	1933	}
	1934
	1935	if (fds->revents != 0 && prev_revents == 0) {
	1936	kectx->kec_process_noutputs++;
	1937	}
	1938
	1939	return 0;
	1940	}
	1941
	1942	int
	1943	seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
	1944	{
	1945	return 1;
	1946	}
	1947
	1948	/*
	1949	* selcount
	1950	*
	1951	* Count the number of bits set in the input bit vector, and establish an
	1952	* outstanding fp->f_iocount for each of the descriptors which will be in
	1953	* use in the select operation.
	1954	*
	1955	* Parameters: p The process doing the select
	1956	* ibits The input bit vector
	1957	* nfd The number of fd's in the vector
	1958	* countp Pointer to where to store the bit count
	1959	*
	1960	* Returns: 0 Success
	1961	* EIO Bad per process open file table
	1962	* EBADF One of the bits in the input bit vector
	1963	* references an invalid fd
	1964	*
	1965	* Implicit: *countp (modified) Count of fd's
	1966	*
	1967	* Notes: This function is the first pass under the proc_fdlock() that
	1968	* permits us to recognize invalid descriptors in the bit vector;
	1969	* the may, however, not remain valid through the drop and
	1970	* later reacquisition of the proc_fdlock().
	1971	*/
	1972	static int
	1973	selcount(struct proc p, u_int32_t ibits, int nfd, int *countp)
	1974	{
	1975	struct filedesc *fdp = p->p_fd;
	1976	int msk, i, j, fd;
	1977	u_int32_t bits;
	1978	struct fileproc *fp;
	1979	int n = 0;
	1980	u_int32_t *iptr;
	1981	u_int nw;
	1982	int error = 0;
	1983	int dropcount;
	1984	int need_wakeup = 0;
	1985
	1986	/*
	1987	* Problems when reboot; due to MacOSX signal probs
	1988	* in Beaker1C ; verify that the p->p_fd is valid
	1989	*/
	1990	if (fdp == NULL) {
	1991	*countp = 0;
	1992	return EIO;
	1993	}
	1994	nw = howmany(nfd, NFDBITS);
	1995
	1996	proc_fdlock(p);
	1997	for (msk = 0; msk < 3; msk++) {
	1998	iptr = (u_int32_t )&ibits[msk nw];
	1999	for (i = 0; i < nfd; i += NFDBITS) {
	2000	bits = iptr[i / NFDBITS];
	2001	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	2002	bits &= ~(1U << j);
	2003
	2004	if (fd < fdp->fd_nfiles) {
	2005	fp = fdp->fd_ofiles[fd];
	2006	} else {
	2007	fp = NULL;
	2008	}
	2009
	2010	if (fp == NULL \|\|
	2011	(fdp->fd_ofileflags[fd] & UF_RESERVED)) {
	2012	*countp = 0;
	2013	error = EBADF;
	2014	goto bad;
	2015	}
	2016	os_ref_retain_locked(&fp->f_iocount);
	2017	n++;
	2018	}
	2019	}
	2020	}
	2021	proc_fdunlock(p);
	2022
	2023	*countp = n;
	2024	return 0;
	2025
	2026	bad:
	2027	dropcount = 0;
	2028
	2029	if (n == 0) {
	2030	goto out;
	2031	}
	2032	/* Ignore error return; it's already EBADF */
	2033	(void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
	2034
	2035	out:
	2036	proc_fdunlock(p);
	2037	if (need_wakeup) {
	2038	wakeup(&p->p_fpdrainwait);
	2039	}
	2040	return error;
	2041	}
	2042
	2043
	2044	/*
	2045	* seldrop_locked
	2046	*
	2047	* Drop outstanding wait queue references set up during selscan(); drop the
	2048	* outstanding per fileproc f_iocount() picked up during the selcount().
	2049	*
	2050	* Parameters: p Process performing the select
	2051	* ibits Input bit bector of fd's
	2052	* nfd Number of fd's
	2053	* lim Limit to number of vector entries to
	2054	* consider, or -1 for "all"
	2055	* inselect True if
	2056	* need_wakeup Pointer to flag to set to do a wakeup
	2057	* if f_iocont on any descriptor goes to 0
	2058	*
	2059	* Returns: 0 Success
	2060	* EBADF One or more fds in the bit vector
	2061	* were invalid, but the rest
	2062	* were successfully dropped
	2063	*
	2064	* Notes: An fd make become bad while the proc_fdlock() is not held,
	2065	* if a multithreaded application closes the fd out from under
	2066	* the in progress select. In this case, we still have to
	2067	* clean up after the set up on the remaining fds.
	2068	*/
	2069	static int
	2070	seldrop_locked(struct proc p, u_int32_t ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
	2071	{
	2072	struct filedesc *fdp = p->p_fd;
	2073	int msk, i, j, nc, fd;
	2074	u_int32_t bits;
	2075	struct fileproc *fp;
	2076	u_int32_t *iptr;
	2077	u_int nw;
	2078	int error = 0;
	2079	int dropcount = 0;
	2080	uthread_t uth = get_bsdthread_info(current_thread());
	2081	struct _select_data *seldata;
	2082
	2083	*need_wakeup = 0;
	2084
	2085	/*
	2086	* Problems when reboot; due to MacOSX signal probs
	2087	* in Beaker1C ; verify that the p->p_fd is valid
	2088	*/
	2089	if (fdp == NULL) {
	2090	return EIO;
	2091	}
	2092
	2093	nw = howmany(nfd, NFDBITS);
	2094	seldata = &uth->uu_save.uus_select_data;
	2095
	2096	nc = 0;
	2097	for (msk = 0; msk < 3; msk++) {
	2098	iptr = (u_int32_t )&ibits[msk nw];
	2099	for (i = 0; i < nfd; i += NFDBITS) {
	2100	bits = iptr[i / NFDBITS];
	2101	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	2102	bits &= ~(1U << j);
	2103	fp = fdp->fd_ofiles[fd];
	2104	/*
	2105	* If we've already dropped as many as were
	2106	* counted/scanned, then we are done.
	2107	*/
	2108	if ((fromselcount != 0) && (++dropcount > lim)) {
	2109	goto done;
	2110	}
	2111
	2112	/*
	2113	* unlink even potentially NULL fileprocs.
	2114	* If the FD was closed from under us, we
	2115	* still need to cleanup the waitq links!
	2116	*/
	2117	selunlinkfp(fp,
	2118	seldata->wqp ? seldata->wqp[nc] : 0,
	2119	uth->uu_wqset);
	2120
	2121	nc++;
	2122
	2123	if (fp == NULL) {
	2124	/* skip (now) bad fds */
	2125	error = EBADF;
	2126	continue;
	2127	}
	2128
	2129	const os_ref_count_t refc = os_ref_release_locked(&fp->f_iocount);
	2130	if (0 == refc) {
	2131	panic("f_iocount overdecrement!");
	2132	}
	2133
	2134	if (1 == refc) {
	2135	/*
	2136	* The last iocount is responsible for clearing
	2137	* selconfict flag - even if we didn't set it -
	2138	* and is also responsible for waking up anyone
	2139	* waiting on iocounts to drain.
	2140	*/
	2141	if (fp->f_flags & FP_SELCONFLICT) {
	2142	fp->f_flags &= ~FP_SELCONFLICT;
	2143	}
	2144	if (p->p_fpdrainwait) {
	2145	p->p_fpdrainwait = 0;
	2146	*need_wakeup = 1;
	2147	}
	2148	}
	2149	}
	2150	}
	2151	}
	2152	done:
	2153	return error;
	2154	}
	2155
	2156
	2157	static int
	2158	seldrop(struct proc p, u_int32_t ibits, int nfd)
	2159	{
	2160	int error;
	2161	int need_wakeup = 0;
	2162
	2163	proc_fdlock(p);
	2164	error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
	2165	proc_fdunlock(p);
	2166	if (need_wakeup) {
	2167	wakeup(&p->p_fpdrainwait);
	2168	}
	2169	return error;
	2170	}
	2171
	2172	/*
	2173	* Record a select request.
	2174	*/
	2175	void
	2176	selrecord(__unused struct proc selector, struct selinfo sip, void *s_data)
	2177	{
	2178	thread_t cur_act = current_thread();
	2179	struct uthread * ut = get_bsdthread_info(cur_act);
	2180	/* on input, s_data points to the 64-bit ID of a reserved link object */
	2181	uint64_t reserved_link = (uint64_t )s_data;
	2182
	2183	/* need to look at collisions */
	2184
	2185	/do not record if this is second pass of select /
	2186	if (!s_data) {
	2187	return;
	2188	}
	2189
	2190	if ((sip->si_flags & SI_INITED) == 0) {
	2191	waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
	2192	sip->si_flags \|= SI_INITED;
	2193	sip->si_flags &= ~SI_CLEAR;
	2194	}
	2195
	2196	if (sip->si_flags & SI_RECORDED) {
	2197	sip->si_flags \|= SI_COLL;
	2198	} else {
	2199	sip->si_flags &= ~SI_COLL;
	2200	}
	2201
	2202	sip->si_flags \|= SI_RECORDED;
	2203	/* note: this checks for pre-existing linkage */
	2204	waitq_link(&sip->si_waitq, ut->uu_wqset,
	2205	WAITQ_SHOULD_LOCK, reserved_link);
	2206
	2207	/*
	2208	* Always consume the reserved link.
	2209	* We can always call waitq_link_release() safely because if
	2210	* waitq_link is successful, it consumes the link and resets the
	2211	* value to 0, in which case our call to release becomes a no-op.
	2212	* If waitq_link fails, then the following release call will actually
	2213	* release the reserved link object.
	2214	*/
	2215	waitq_link_release(*reserved_link);
	2216	*reserved_link = 0;
	2217
	2218	/*
	2219	* Use the s_data pointer as an output parameter as well
	2220	* This avoids changing the prototype for this function which is
	2221	* used by many kexts. We need to surface the waitq object
	2222	* associated with the selinfo we just added to the thread's select
	2223	* set. New waitq sets do not have back-pointers to set members, so
	2224	* the only way to clear out set linkage objects is to go from the
	2225	* waitq to the set. We use a memcpy because s_data could be
	2226	* pointing to an unaligned value on the stack
	2227	* (especially on 32-bit systems)
	2228	*/
	2229	void wqptr = (void )&sip->si_waitq;
	2230	memcpy((void )s_data, (void )&wqptr, sizeof(void *));
	2231
	2232	return;
	2233	}
	2234
	2235	void
	2236	selwakeup(struct selinfo *sip)
	2237	{
	2238	if ((sip->si_flags & SI_INITED) == 0) {
	2239	return;
	2240	}
	2241
	2242	if (sip->si_flags & SI_COLL) {
	2243	nselcoll++;
	2244	sip->si_flags &= ~SI_COLL;
	2245	#if 0
	2246	/* will not support */
	2247	//wakeup((caddr_t)&selwait);
	2248	#endif
	2249	}
	2250
	2251	if (sip->si_flags & SI_RECORDED) {
	2252	waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
	2253	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	2254	sip->si_flags &= ~SI_RECORDED;
	2255	}
	2256	}
	2257
	2258	void
	2259	selthreadclear(struct selinfo *sip)
	2260	{
	2261	struct waitq *wq;
	2262
	2263	if ((sip->si_flags & SI_INITED) == 0) {
	2264	return;
	2265	}
	2266	if (sip->si_flags & SI_RECORDED) {
	2267	selwakeup(sip);
	2268	sip->si_flags &= ~(SI_RECORDED \| SI_COLL);
	2269	}
	2270	sip->si_flags \|= SI_CLEAR;
	2271	sip->si_flags &= ~SI_INITED;
	2272
	2273	wq = &sip->si_waitq;
	2274
	2275	/*
	2276	* Higher level logic may have a handle on this waitq's prepost ID,
	2277	* but that's OK because the waitq_deinit will remove/invalidate the
	2278	* prepost object (as well as mark the waitq invalid). This de-couples
	2279	* us from any callers that may have a handle to this waitq via the
	2280	* prepost ID.
	2281	*/
	2282	waitq_deinit(wq);
	2283	}
	2284
	2285
	2286
	2287
	2288	#define DBG_POST 0x10
	2289	#define DBG_WATCH 0x11
	2290	#define DBG_WAIT 0x12
	2291	#define DBG_MOD 0x13
	2292	#define DBG_EWAKEUP 0x14
	2293	#define DBG_ENQUEUE 0x15
	2294	#define DBG_DEQUEUE 0x16
	2295
	2296	#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
	2297	#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
	2298	#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
	2299	#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
	2300	#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
	2301	#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
	2302	#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
	2303
	2304
	2305	#define EVPROCDEQUE(p, evq) do { \
	2306	proc_lock(p); \
	2307	if (evq->ee_flags & EV_QUEUED) { \
	2308	TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
	2309	evq->ee_flags &= ~EV_QUEUED; \
	2310	} \
	2311	proc_unlock(p); \
	2312	} while (0);
	2313
	2314
	2315	/*
	2316	* called upon socket close. deque and free all events for
	2317	* the socket... socket must be locked by caller.
	2318	*/
	2319	void
	2320	evsofree(struct socket *sp)
	2321	{
	2322	struct eventqelt evq, next;
	2323	proc_t p;
	2324
	2325	if (sp == NULL) {
	2326	return;
	2327	}
	2328
	2329	for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
	2330	next = evq->ee_slist.tqe_next;
	2331	p = evq->ee_proc;
	2332
	2333	if (evq->ee_flags & EV_QUEUED) {
	2334	EVPROCDEQUE(p, evq);
	2335	}
	2336	TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
	2337	FREE(evq, M_TEMP);
	2338	}
	2339	}
	2340
	2341
	2342	/*
	2343	* called upon pipe close. deque and free all events for
	2344	* the pipe... pipe must be locked by caller
	2345	*/
	2346	void
	2347	evpipefree(struct pipe *cpipe)
	2348	{
	2349	struct eventqelt evq, next;
	2350	proc_t p;
	2351
	2352	for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
	2353	next = evq->ee_slist.tqe_next;
	2354	p = evq->ee_proc;
	2355
	2356	EVPROCDEQUE(p, evq);
	2357
	2358	TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
	2359	FREE(evq, M_TEMP);
	2360	}
	2361	}
	2362
	2363
	2364	/*
	2365	* enqueue this event if it's not already queued. wakeup
	2366	* the proc if we do queue this event to it...
	2367	* entered with proc lock held... we drop it before
	2368	* doing the wakeup and return in that state
	2369	*/
	2370	static void
	2371	evprocenque(struct eventqelt *evq)
	2372	{
	2373	proc_t p;
	2374
	2375	assert(evq);
	2376	p = evq->ee_proc;
	2377
	2378	KERNEL_DEBUG(DBG_MISC_ENQUEUE \| DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask, 0, 0);
	2379
	2380	proc_lock(p);
	2381
	2382	if (evq->ee_flags & EV_QUEUED) {
	2383	proc_unlock(p);
	2384
	2385	KERNEL_DEBUG(DBG_MISC_ENQUEUE \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	2386	return;
	2387	}
	2388	evq->ee_flags \|= EV_QUEUED;
	2389
	2390	TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
	2391
	2392	proc_unlock(p);
	2393
	2394	wakeup(&p->p_evlist);
	2395
	2396	KERNEL_DEBUG(DBG_MISC_ENQUEUE \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	2397	}
	2398
	2399
	2400	/*
	2401	* pipe lock must be taken by the caller
	2402	*/
	2403	void
	2404	postpipeevent(struct pipe *pipep, int event)
	2405	{
	2406	int mask;
	2407	struct eventqelt *evq;
	2408
	2409	if (pipep == NULL) {
	2410	return;
	2411	}
	2412	KERNEL_DEBUG(DBG_MISC_POST \| DBG_FUNC_START, event, 0, 0, 1, 0);
	2413
	2414	for (evq = pipep->pipe_evlist.tqh_first;
	2415	evq != NULL; evq = evq->ee_slist.tqe_next) {
	2416	if (evq->ee_eventmask == 0) {
	2417	continue;
	2418	}
	2419	mask = 0;
	2420
	2421	switch (event & (EV_RWBYTES \| EV_RCLOSED \| EV_WCLOSED)) {
	2422	case EV_RWBYTES:
	2423	if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
	2424	mask \|= EV_RE;
	2425	evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
	2426	}
	2427	if ((evq->ee_eventmask & EV_WR) &&
	2428	(MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
	2429	if (pipep->pipe_state & PIPE_EOF) {
	2430	mask \|= EV_WR \| EV_RESET;
	2431	break;
	2432	}
	2433	mask \|= EV_WR;
	2434	evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
	2435	}
	2436	break;
	2437
	2438	case EV_WCLOSED:
	2439	case EV_RCLOSED:
	2440	if ((evq->ee_eventmask & EV_RE)) {
	2441	mask \|= EV_RE \| EV_RCLOSED;
	2442	}
	2443	if ((evq->ee_eventmask & EV_WR)) {
	2444	mask \|= EV_WR \| EV_WCLOSED;
	2445	}
	2446	break;
	2447
	2448	default:
	2449	return;
	2450	}
	2451	if (mask) {
	2452	/*
	2453	* disarm... postevents are nops until this event is 'read' via
	2454	* waitevent and then re-armed via modwatch
	2455	*/
	2456	evq->ee_eventmask = 0;
	2457
	2458	/*
	2459	* since events are disarmed until after the waitevent
	2460	* the ee_req.er_xxxx fields can't change once we've
	2461	* inserted this event into the proc queue...
	2462	* therefore, the waitevent will see a 'consistent'
	2463	* snapshot of the event, even though it won't hold
	2464	* the pipe lock, and we're updating the event outside
	2465	* of the proc lock, which it will hold
	2466	*/
	2467	evq->ee_req.er_eventbits \|= mask;
	2468
	2469	KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1, 0);
	2470
	2471	evprocenque(evq);
	2472	}
	2473	}
	2474	KERNEL_DEBUG(DBG_MISC_POST \| DBG_FUNC_END, 0, 0, 0, 1, 0);
	2475	}
	2476
	2477	#if SOCKETS
	2478	/*
	2479	* given either a sockbuf or a socket run down the
	2480	* event list and queue ready events found...
	2481	* the socket must be locked by the caller
	2482	*/
	2483	void
	2484	postevent(struct socket sp, struct sockbuf sb, int event)
	2485	{
	2486	int mask;
	2487	struct eventqelt *evq;
	2488	struct tcpcb *tp;
	2489
	2490	if (sb) {
	2491	sp = sb->sb_so;
	2492	}
	2493	if (sp == NULL) {
	2494	return;
	2495	}
	2496
	2497	KERNEL_DEBUG(DBG_MISC_POST \| DBG_FUNC_START, (int)sp, event, 0, 0, 0);
	2498
	2499	for (evq = sp->so_evlist.tqh_first;
	2500	evq != NULL; evq = evq->ee_slist.tqe_next) {
	2501	if (evq->ee_eventmask == 0) {
	2502	continue;
	2503	}
	2504	mask = 0;
	2505
	2506	/* ready for reading:
	2507	* - byte cnt >= receive low water mark
	2508	* - read-half of conn closed
	2509	* - conn pending for listening sock
	2510	* - socket error pending
	2511	*
	2512	* ready for writing
	2513	* - byte cnt avail >= send low water mark
	2514	* - write half of conn closed
	2515	* - socket error pending
	2516	* - non-blocking conn completed successfully
	2517	*
	2518	* exception pending
	2519	* - out of band data
	2520	* - sock at out of band mark
	2521	*/
	2522
	2523	switch (event & EV_DMASK) {
	2524	case EV_OOB:
	2525	if ((evq->ee_eventmask & EV_EX)) {
	2526	if (sp->so_oobmark \|\| ((sp->so_state & SS_RCVATMARK))) {
	2527	mask \|= EV_EX \| EV_OOB;
	2528	}
	2529	}
	2530	break;
	2531
	2532	case EV_RWBYTES \| EV_OOB:
	2533	if ((evq->ee_eventmask & EV_EX)) {
	2534	if (sp->so_oobmark \|\| ((sp->so_state & SS_RCVATMARK))) {
	2535	mask \|= EV_EX \| EV_OOB;
	2536	}
	2537	}
	2538	/*
	2539	* fall into the next case
	2540	*/
	2541	case EV_RWBYTES:
	2542	if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
	2543	/* for AFP/OT purposes; may go away in future */
	2544	if ((SOCK_DOM(sp) == PF_INET \|\|
	2545	SOCK_DOM(sp) == PF_INET6) &&
	2546	SOCK_PROTO(sp) == IPPROTO_TCP &&
	2547	(sp->so_error == ECONNREFUSED \|\|
	2548	sp->so_error == ECONNRESET)) {
	2549	if (sp->so_pcb == NULL \|\|
	2550	sotoinpcb(sp)->inp_state ==
	2551	INPCB_STATE_DEAD \|\|
	2552	(tp = sototcpcb(sp)) == NULL \|\|
	2553	tp->t_state == TCPS_CLOSED) {
	2554	mask \|= EV_RE \| EV_RESET;
	2555	break;
	2556	}
	2557	}
	2558	mask \|= EV_RE;
	2559	evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
	2560
	2561	if (sp->so_state & SS_CANTRCVMORE) {
	2562	mask \|= EV_FIN;
	2563	break;
	2564	}
	2565	}
	2566	if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
	2567	/* for AFP/OT purposes; may go away in future */
	2568	if ((SOCK_DOM(sp) == PF_INET \|\|
	2569	SOCK_DOM(sp) == PF_INET6) &&
	2570	SOCK_PROTO(sp) == IPPROTO_TCP &&
	2571	(sp->so_error == ECONNREFUSED \|\|
	2572	sp->so_error == ECONNRESET)) {
	2573	if (sp->so_pcb == NULL \|\|
	2574	sotoinpcb(sp)->inp_state ==
	2575	INPCB_STATE_DEAD \|\|
	2576	(tp = sototcpcb(sp)) == NULL \|\|
	2577	tp->t_state == TCPS_CLOSED) {
	2578	mask \|= EV_WR \| EV_RESET;
	2579	break;
	2580	}
	2581	}
	2582	mask \|= EV_WR;
	2583	evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
	2584	}
	2585	break;
	2586
	2587	case EV_RCONN:
	2588	if ((evq->ee_eventmask & EV_RE)) {
	2589	mask \|= EV_RE \| EV_RCONN;
	2590	evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
	2591	}
	2592	break;
	2593
	2594	case EV_WCONN:
	2595	if ((evq->ee_eventmask & EV_WR)) {
	2596	mask \|= EV_WR \| EV_WCONN;
	2597	}
	2598	break;
	2599
	2600	case EV_RCLOSED:
	2601	if ((evq->ee_eventmask & EV_RE)) {
	2602	mask \|= EV_RE \| EV_RCLOSED;
	2603	}
	2604	break;
	2605
	2606	case EV_WCLOSED:
	2607	if ((evq->ee_eventmask & EV_WR)) {
	2608	mask \|= EV_WR \| EV_WCLOSED;
	2609	}
	2610	break;
	2611
	2612	case EV_FIN:
	2613	if (evq->ee_eventmask & EV_RE) {
	2614	mask \|= EV_RE \| EV_FIN;
	2615	}
	2616	break;
	2617
	2618	case EV_RESET:
	2619	case EV_TIMEOUT:
	2620	if (evq->ee_eventmask & EV_RE) {
	2621	mask \|= EV_RE \| event;
	2622	}
	2623	if (evq->ee_eventmask & EV_WR) {
	2624	mask \|= EV_WR \| event;
	2625	}
	2626	break;
	2627
	2628	default:
	2629	KERNEL_DEBUG(DBG_MISC_POST \| DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
	2630	return;
	2631	} /* switch */
	2632
	2633	KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
	2634
	2635	if (mask) {
	2636	/*
	2637	* disarm... postevents are nops until this event is 'read' via
	2638	* waitevent and then re-armed via modwatch
	2639	*/
	2640	evq->ee_eventmask = 0;
	2641
	2642	/*
	2643	* since events are disarmed until after the waitevent
	2644	* the ee_req.er_xxxx fields can't change once we've
	2645	* inserted this event into the proc queue...
	2646	* since waitevent can't see this event until we
	2647	* enqueue it, waitevent will see a 'consistent'
	2648	* snapshot of the event, even though it won't hold
	2649	* the socket lock, and we're updating the event outside
	2650	* of the proc lock, which it will hold
	2651	*/
	2652	evq->ee_req.er_eventbits \|= mask;
	2653
	2654	evprocenque(evq);
	2655	}
	2656	}
	2657	KERNEL_DEBUG(DBG_MISC_POST \| DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
	2658	}
	2659	#endif /* SOCKETS */
	2660
	2661
	2662	/*
	2663	* watchevent system call. user passes us an event to watch
	2664	* for. we malloc an event object, initialize it, and queue
	2665	* it to the open socket. when the event occurs, postevent()
	2666	* will enque it back to our proc where we can retrieve it
	2667	* via waitevent().
	2668	*
	2669	* should this prevent duplicate events on same socket?
	2670	*
	2671	* Returns:
	2672	* ENOMEM No memory for operation
	2673	* copyin:EFAULT
	2674	*/
	2675	int
	2676	watchevent(proc_t p, struct watchevent_args uap, __unused int retval)
	2677	{
	2678	struct eventqelt evq = (struct eventqelt )0;
	2679	struct eventqelt *np = NULL;
	2680	struct eventreq64 *erp;
	2681	struct fileproc *fp = NULL;
	2682	int error;
	2683
	2684	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	2685
	2686	// get a qelt and fill with users req
	2687	MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
	2688
	2689	if (evq == NULL) {
	2690	return ENOMEM;
	2691	}
	2692	erp = &evq->ee_req;
	2693
	2694	// get users request pkt
	2695
	2696	if (IS_64BIT_PROCESS(p)) {
	2697	error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
	2698	} else {
	2699	struct eventreq32 er32;
	2700
	2701	error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
	2702	if (error == 0) {
	2703	/*
	2704	* the user only passes in the
	2705	* er_type, er_handle and er_data...
	2706	* the other fields are initialized
	2707	* below, so don't bother to copy
	2708	*/
	2709	erp->er_type = er32.er_type;
	2710	erp->er_handle = er32.er_handle;
	2711	erp->er_data = (user_addr_t)er32.er_data;
	2712	}
	2713	}
	2714	if (error) {
	2715	FREE(evq, M_TEMP);
	2716	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_END, error, 0, 0, 0, 0);
	2717
	2718	return error;
	2719	}
	2720	KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle, uap->u_eventmask, (uint32_t)evq, 0, 0);
	2721
	2722	// validate, freeing qelt if errors
	2723	error = 0;
	2724	proc_fdlock(p);
	2725
	2726	if (erp->er_type != EV_FD) {
	2727	error = EINVAL;
	2728	} else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
	2729	error = EBADF;
	2730	#if SOCKETS
	2731	} else if (fp->f_type == DTYPE_SOCKET) {
	2732	socket_lock((struct socket *)fp->f_data, 1);
	2733	np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	2734	#endif /* SOCKETS */
	2735	} else if (fp->f_type == DTYPE_PIPE) {
	2736	PIPE_LOCK((struct pipe *)fp->f_data);
	2737	np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	2738	} else {
	2739	fp_drop(p, erp->er_handle, fp, 1);
	2740	error = EINVAL;
	2741	}
	2742	proc_fdunlock(p);
	2743
	2744	if (error) {
	2745	FREE(evq, M_TEMP);
	2746
	2747	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_END, error, 0, 0, 0, 0);
	2748	return error;
	2749	}
	2750
	2751	/*
	2752	* only allow one watch per file per proc
	2753	*/
	2754	for (; np != NULL; np = np->ee_slist.tqe_next) {
	2755	if (np->ee_proc == p) {
	2756	#if SOCKETS
	2757	if (fp->f_type == DTYPE_SOCKET) {
	2758	socket_unlock((struct socket *)fp->f_data, 1);
	2759	} else
	2760	#endif /* SOCKETS */
	2761	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2762	fp_drop(p, erp->er_handle, fp, 0);
	2763	FREE(evq, M_TEMP);
	2764
	2765	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_END, EINVAL, 0, 0, 0, 0);
	2766	return EINVAL;
	2767	}
	2768	}
	2769	erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
	2770	evq->ee_proc = p;
	2771	evq->ee_eventmask = uap->u_eventmask & EV_MASK;
	2772	evq->ee_flags = 0;
	2773
	2774	#if SOCKETS
	2775	if (fp->f_type == DTYPE_SOCKET) {
	2776	TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	2777	postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
	2778
	2779	socket_unlock((struct socket *)fp->f_data, 1);
	2780	} else
	2781	#endif /* SOCKETS */
	2782	{
	2783	TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	2784	postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
	2785
	2786	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2787	}
	2788	fp_drop_event(p, erp->er_handle, fp);
	2789
	2790	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	2791	return 0;
	2792	}
	2793
	2794
	2795
	2796	/*
	2797	* waitevent system call.
	2798	* grabs the next waiting event for this proc and returns
	2799	* it. if no events, user can request to sleep with timeout
	2800	* or without or poll mode
	2801	* ((tv != NULL && interval == 0) \|\| tv == -1)
	2802	*/
	2803	int
	2804	waitevent(proc_t p, struct waitevent_args uap, int retval)
	2805	{
	2806	int error = 0;
	2807	struct eventqelt *evq;
	2808	struct eventreq64 *erp;
	2809	uint64_t abstime, interval;
	2810	boolean_t fast_poll = FALSE;
	2811	union {
	2812	struct eventreq64 er64;
	2813	struct eventreq32 er32;
	2814	} uer = {};
	2815
	2816	interval = 0;
	2817
	2818	if (uap->tv) {
	2819	struct timeval atv;
	2820	/*
	2821	* check for fast poll method
	2822	*/
	2823	if (IS_64BIT_PROCESS(p)) {
	2824	if (uap->tv == (user_addr_t)-1) {
	2825	fast_poll = TRUE;
	2826	}
	2827	} else if (uap->tv == (user_addr_t)((uint32_t)-1)) {
	2828	fast_poll = TRUE;
	2829	}
	2830
	2831	if (fast_poll == TRUE) {
	2832	if (p->p_evlist.tqh_first == NULL) {
	2833	KERNEL_DEBUG(DBG_MISC_WAIT \| DBG_FUNC_NONE, -1, 0, 0, 0, 0);
	2834	/*
	2835	* poll failed
	2836	*/
	2837	*retval = 1;
	2838	return 0;
	2839	}
	2840	proc_lock(p);
	2841	goto retry;
	2842	}
	2843	if (IS_64BIT_PROCESS(p)) {
	2844	struct user64_timeval atv64;
	2845	error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
	2846	/* Loses resolution - assume timeout < 68 years */
	2847	atv.tv_sec = atv64.tv_sec;
	2848	atv.tv_usec = atv64.tv_usec;
	2849	} else {
	2850	struct user32_timeval atv32;
	2851	error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
	2852	atv.tv_sec = atv32.tv_sec;
	2853	atv.tv_usec = atv32.tv_usec;
	2854	}
	2855
	2856	if (error) {
	2857	return error;
	2858	}
	2859	if (itimerfix(&atv)) {
	2860	error = EINVAL;
	2861	return error;
	2862	}
	2863	interval = tvtoabstime(&atv);
	2864	}
	2865	KERNEL_DEBUG(DBG_MISC_WAIT \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	2866
	2867	proc_lock(p);
	2868	retry:
	2869	if ((evq = p->p_evlist.tqh_first) != NULL) {
	2870	/*
	2871	* found one... make a local copy while it's still on the queue
	2872	* to prevent it from changing while in the midst of copying
	2873	* don't want to hold the proc lock across a copyout because
	2874	* it might block on a page fault at the target in user space
	2875	*/
	2876	erp = &evq->ee_req;
	2877
	2878	if (IS_64BIT_PROCESS(p)) {
	2879	bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof(struct eventreq64));
	2880	} else {
	2881	uer.er32.er_type = erp->er_type;
	2882	uer.er32.er_handle = erp->er_handle;
	2883	uer.er32.er_data = (uint32_t)erp->er_data;
	2884	uer.er32.er_ecnt = erp->er_ecnt;
	2885	uer.er32.er_rcnt = erp->er_rcnt;
	2886	uer.er32.er_wcnt = erp->er_wcnt;
	2887	uer.er32.er_eventbits = erp->er_eventbits;
	2888	}
	2889	TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
	2890
	2891	evq->ee_flags &= ~EV_QUEUED;
	2892
	2893	proc_unlock(p);
	2894
	2895	if (IS_64BIT_PROCESS(p)) {
	2896	error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
	2897	} else {
	2898	error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
	2899	}
	2900
	2901	KERNEL_DEBUG(DBG_MISC_WAIT \| DBG_FUNC_END, error,
	2902	evq->ee_req.er_handle, evq->ee_req.er_eventbits, (uint32_t)evq, 0);
	2903	return error;
	2904	} else {
	2905	if (uap->tv && interval == 0) {
	2906	proc_unlock(p);
	2907	*retval = 1; // poll failed
	2908
	2909	KERNEL_DEBUG(DBG_MISC_WAIT \| DBG_FUNC_END, error, 0, 0, 0, 0);
	2910	return error;
	2911	}
	2912	if (interval != 0) {
	2913	clock_absolutetime_interval_to_deadline(interval, &abstime);
	2914	} else {
	2915	abstime = 0;
	2916	}
	2917
	2918	KERNEL_DEBUG(DBG_MISC_WAIT, 1, (uint32_t)&p->p_evlist, 0, 0, 0);
	2919
	2920	error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK \| PCATCH), "waitevent", abstime);
	2921
	2922	KERNEL_DEBUG(DBG_MISC_WAIT, 2, (uint32_t)&p->p_evlist, 0, 0, 0);
	2923
	2924	if (error == 0) {
	2925	goto retry;
	2926	}
	2927	if (error == ERESTART) {
	2928	error = EINTR;
	2929	}
	2930	if (error == EWOULDBLOCK) {
	2931	*retval = 1;
	2932	error = 0;
	2933	}
	2934	}
	2935	proc_unlock(p);
	2936
	2937	KERNEL_DEBUG(DBG_MISC_WAIT \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	2938	return error;
	2939	}
	2940
	2941
	2942	/*
	2943	* modwatch system call. user passes in event to modify.
	2944	* if we find it we reset the event bits and que/deque event
	2945	* it needed.
	2946	*/
	2947	int
	2948	modwatch(proc_t p, struct modwatch_args uap, __unused int retval)
	2949	{
	2950	struct eventreq64 er;
	2951	struct eventreq64 *erp = &er;
	2952	struct eventqelt evq = NULL; / protected by error return */
	2953	int error;
	2954	struct fileproc *fp;
	2955	int flag;
	2956
	2957	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_START, 0, 0, 0, 0, 0);
	2958
	2959	/*
	2960	* get user's request pkt
	2961	* just need the er_type and er_handle which sit above the
	2962	* problematic er_data (32/64 issue)... so only copy in
	2963	* those 2 fields
	2964	*/
	2965	if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
	2966	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_END, error, 0, 0, 0, 0);
	2967	return error;
	2968	}
	2969	proc_fdlock(p);
	2970
	2971	if (erp->er_type != EV_FD) {
	2972	error = EINVAL;
	2973	} else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
	2974	error = EBADF;
	2975	#if SOCKETS
	2976	} else if (fp->f_type == DTYPE_SOCKET) {
	2977	socket_lock((struct socket *)fp->f_data, 1);
	2978	evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	2979	#endif /* SOCKETS */
	2980	} else if (fp->f_type == DTYPE_PIPE) {
	2981	PIPE_LOCK((struct pipe *)fp->f_data);
	2982	evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	2983	} else {
	2984	fp_drop(p, erp->er_handle, fp, 1);
	2985	error = EINVAL;
	2986	}
	2987
	2988	if (error) {
	2989	proc_fdunlock(p);
	2990	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_END, error, 0, 0, 0, 0);
	2991	return error;
	2992	}
	2993
	2994	if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
	2995	fp->f_flags &= ~FP_WAITEVENT;
	2996	}
	2997	proc_fdunlock(p);
	2998
	2999	// locate event if possible
	3000	for (; evq != NULL; evq = evq->ee_slist.tqe_next) {
	3001	if (evq->ee_proc == p) {
	3002	break;
	3003	}
	3004	}
	3005	if (evq == NULL) {
	3006	#if SOCKETS
	3007	if (fp->f_type == DTYPE_SOCKET) {
	3008	socket_unlock((struct socket *)fp->f_data, 1);
	3009	} else
	3010	#endif /* SOCKETS */
	3011	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3012	fp_drop(p, erp->er_handle, fp, 0);
	3013	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_END, EINVAL, 0, 0, 0, 0);
	3014	return EINVAL;
	3015	}
	3016	KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle, uap->u_eventmask, (uint32_t)evq, 0, 0);
	3017
	3018	if (uap->u_eventmask == EV_RM) {
	3019	EVPROCDEQUE(p, evq);
	3020
	3021	#if SOCKETS
	3022	if (fp->f_type == DTYPE_SOCKET) {
	3023	TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	3024	socket_unlock((struct socket *)fp->f_data, 1);
	3025	} else
	3026	#endif /* SOCKETS */
	3027	{
	3028	TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	3029	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3030	}
	3031	fp_drop(p, erp->er_handle, fp, 0);
	3032	FREE(evq, M_TEMP);
	3033	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_END, 0, 0, 0, 0, 0);
	3034	return 0;
	3035	}
	3036	switch (uap->u_eventmask & EV_MASK) {
	3037	case 0:
	3038	flag = 0;
	3039	break;
	3040
	3041	case EV_RE:
	3042	case EV_WR:
	3043	case EV_RE \| EV_WR:
	3044	flag = EV_RWBYTES;
	3045	break;
	3046
	3047	case EV_EX:
	3048	flag = EV_OOB;
	3049	break;
	3050
	3051	case EV_EX \| EV_RE:
	3052	case EV_EX \| EV_WR:
	3053	case EV_EX \| EV_RE \| EV_WR:
	3054	flag = EV_OOB \| EV_RWBYTES;
	3055	break;
	3056
	3057	default:
	3058	#if SOCKETS
	3059	if (fp->f_type == DTYPE_SOCKET) {
	3060	socket_unlock((struct socket *)fp->f_data, 1);
	3061	} else
	3062	#endif /* SOCKETS */
	3063	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3064	fp_drop(p, erp->er_handle, fp, 0);
	3065	KERNEL_DEBUG(DBG_MISC_WATCH \| DBG_FUNC_END, EINVAL, 0, 0, 0, 0);
	3066	return EINVAL;
	3067	}
	3068	/*
	3069	* since we're holding the socket/pipe lock, the event
	3070	* cannot go from the unqueued state to the queued state
	3071	* however, it can go from the queued state to the unqueued state
	3072	* since that direction is protected by the proc_lock...
	3073	* so do a quick check for EV_QUEUED w/o holding the proc lock
	3074	* since by far the common case will be NOT EV_QUEUED, this saves
	3075	* us taking the proc_lock the majority of the time
	3076	*/
	3077	if (evq->ee_flags & EV_QUEUED) {
	3078	/*
	3079	* EVPROCDEQUE will recheck the state after it grabs the proc_lock
	3080	*/
	3081	EVPROCDEQUE(p, evq);
	3082	}
	3083	/*
	3084	* while the event is off the proc queue and
	3085	* we're holding the socket/pipe lock
	3086	* it's safe to update these fields...
	3087	*/
	3088	evq->ee_req.er_eventbits = 0;
	3089	evq->ee_eventmask = uap->u_eventmask & EV_MASK;
	3090
	3091	#if SOCKETS
	3092	if (fp->f_type == DTYPE_SOCKET) {
	3093	postevent((struct socket *)fp->f_data, 0, flag);
	3094	socket_unlock((struct socket *)fp->f_data, 1);
	3095	} else
	3096	#endif /* SOCKETS */
	3097	{
	3098	postpipeevent((struct pipe *)fp->f_data, flag);
	3099	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3100	}
	3101	fp_drop(p, erp->er_handle, fp, 0);
	3102	KERNEL_DEBUG(DBG_MISC_MOD \| DBG_FUNC_END, evq->ee_req.er_handle, evq->ee_eventmask, (uint32_t)fp->f_data, flag, 0);
	3103	return 0;
	3104	}
	3105
	3106	/* this routine is called from the close of fd with proc_fdlock held */
	3107	int
	3108	waitevent_close(struct proc p, struct fileproc fp)
	3109	{
	3110	struct eventqelt *evq;
	3111
	3112
	3113	fp->f_flags &= ~FP_WAITEVENT;
	3114
	3115	#if SOCKETS
	3116	if (fp->f_type == DTYPE_SOCKET) {
	3117	socket_lock((struct socket *)fp->f_data, 1);
	3118	evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	3119	} else
	3120	#endif /* SOCKETS */
	3121	if (fp->f_type == DTYPE_PIPE) {
	3122	PIPE_LOCK((struct pipe *)fp->f_data);
	3123	evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	3124	} else {
	3125	return EINVAL;
	3126	}
	3127	proc_fdunlock(p);
	3128
	3129
	3130	// locate event if possible
	3131	for (; evq != NULL; evq = evq->ee_slist.tqe_next) {
	3132	if (evq->ee_proc == p) {
	3133	break;
	3134	}
	3135	}
	3136	if (evq == NULL) {
	3137	#if SOCKETS
	3138	if (fp->f_type == DTYPE_SOCKET) {
	3139	socket_unlock((struct socket *)fp->f_data, 1);
	3140	} else
	3141	#endif /* SOCKETS */
	3142	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3143
	3144	proc_fdlock(p);
	3145
	3146	return EINVAL;
	3147	}
	3148	EVPROCDEQUE(p, evq);
	3149
	3150	#if SOCKETS
	3151	if (fp->f_type == DTYPE_SOCKET) {
	3152	TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	3153	socket_unlock((struct socket *)fp->f_data, 1);
	3154	} else
	3155	#endif /* SOCKETS */
	3156	{
	3157	TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	3158	PIPE_UNLOCK((struct pipe *)fp->f_data);
	3159	}
	3160	FREE(evq, M_TEMP);
	3161
	3162	proc_fdlock(p);
	3163
	3164	return 0;
	3165	}
	3166
	3167
	3168	/*
	3169	* gethostuuid
	3170	*
	3171	* Description: Get the host UUID from IOKit and return it to user space.
	3172	*
	3173	* Parameters: uuid_buf Pointer to buffer to receive UUID
	3174	* timeout Timespec for timout
	3175	*
	3176	* Returns: 0 Success
	3177	* EWOULDBLOCK Timeout is too short
	3178	* copyout:EFAULT Bad user buffer
	3179	* mac_system_check_info:EPERM Client not allowed to perform this operation
	3180	*
	3181	* Notes: A timeout seems redundant, since if it's tolerable to not
	3182	* have a system UUID in hand, then why ask for one?
	3183	*/
	3184	int
	3185	gethostuuid(struct proc p, struct gethostuuid_args uap, __unused int32_t *retval)
	3186	{
	3187	kern_return_t kret;
	3188	int error;
	3189	mach_timespec_t mach_ts; /* for IOKit call */
	3190	__darwin_uuid_t uuid_kern = {}; /* for IOKit call */
	3191
	3192	/* Check entitlement */
	3193	if (!IOTaskHasEntitlement(current_task(), "com.apple.private.getprivatesysid")) {
	3194	#if CONFIG_EMBEDDED
	3195	#if CONFIG_MACF
	3196	if ((error = mac_system_check_info(kauth_cred_get(), "hw.uuid")) != 0) {
	3197	/* EPERM invokes userspace upcall if present */
	3198	return error;
	3199	}
	3200	#endif
	3201	#endif
	3202	}
	3203
	3204	/* Convert the 32/64 bit timespec into a mach_timespec_t */
	3205	if (proc_is64bit(p)) {
	3206	struct user64_timespec ts;
	3207	error = copyin(uap->timeoutp, &ts, sizeof(ts));
	3208	if (error) {
	3209	return error;
	3210	}
	3211	mach_ts.tv_sec = ts.tv_sec;
	3212	mach_ts.tv_nsec = ts.tv_nsec;
	3213	} else {
	3214	struct user32_timespec ts;
	3215	error = copyin(uap->timeoutp, &ts, sizeof(ts));
	3216	if (error) {
	3217	return error;
	3218	}
	3219	mach_ts.tv_sec = ts.tv_sec;
	3220	mach_ts.tv_nsec = ts.tv_nsec;
	3221	}
	3222
	3223	/* Call IOKit with the stack buffer to get the UUID */
	3224	kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
	3225
	3226	/*
	3227	* If we get it, copy out the data to the user buffer; note that a
	3228	* uuid_t is an array of characters, so this is size invariant for
	3229	* 32 vs. 64 bit.
	3230	*/
	3231	if (kret == KERN_SUCCESS) {
	3232	error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
	3233	} else {
	3234	error = EWOULDBLOCK;
	3235	}
	3236
	3237	return error;
	3238	}
	3239
	3240	/*
	3241	* ledger
	3242	*
	3243	* Description: Omnibus system call for ledger operations
	3244	*/
	3245	int
	3246	ledger(struct proc p, struct ledger_args args, __unused int32_t *retval)
	3247	{
	3248	#if !CONFIG_MACF
	3249	#pragma unused(p)
	3250	#endif
	3251	int rval, pid, len, error;
	3252	#ifdef LEDGER_DEBUG
	3253	struct ledger_limit_args lla;
	3254	#endif
	3255	task_t task;
	3256	proc_t proc;
	3257
	3258	/* Finish copying in the necessary args before taking the proc lock */
	3259	error = 0;
	3260	len = 0;
	3261	if (args->cmd == LEDGER_ENTRY_INFO) {
	3262	error = copyin(args->arg3, (char *)&len, sizeof(len));
	3263	} else if (args->cmd == LEDGER_TEMPLATE_INFO) {
	3264	error = copyin(args->arg2, (char *)&len, sizeof(len));
	3265	} else if (args->cmd == LEDGER_LIMIT)
	3266	#ifdef LEDGER_DEBUG
	3267	{ error = copyin(args->arg2, (char *)&lla, sizeof(lla));}
	3268	#else
	3269	{ return EINVAL; }
	3270	#endif
	3271	else if ((args->cmd < 0) \|\| (args->cmd > LEDGER_MAX_CMD)) {
	3272	return EINVAL;
	3273	}
	3274
	3275	if (error) {
	3276	return error;
	3277	}
	3278	if (len < 0) {
	3279	return EINVAL;
	3280	}
	3281
	3282	rval = 0;
	3283	if (args->cmd != LEDGER_TEMPLATE_INFO) {
	3284	pid = args->arg1;
	3285	proc = proc_find(pid);
	3286	if (proc == NULL) {
	3287	return ESRCH;
	3288	}
	3289
	3290	#if CONFIG_MACF
	3291	error = mac_proc_check_ledger(p, proc, args->cmd);
	3292	if (error) {
	3293	proc_rele(proc);
	3294	return error;
	3295	}
	3296	#endif
	3297
	3298	task = proc->task;
	3299	}
	3300
	3301	switch (args->cmd) {
	3302	#ifdef LEDGER_DEBUG
	3303	case LEDGER_LIMIT: {
	3304	if (!kauth_cred_issuser(kauth_cred_get())) {
	3305	rval = EPERM;
	3306	}
	3307	rval = ledger_limit(task, &lla);
	3308	proc_rele(proc);
	3309	break;
	3310	}
	3311	#endif
	3312	case LEDGER_INFO: {
	3313	struct ledger_info info = {};
	3314
	3315	rval = ledger_info(task, &info);
	3316	proc_rele(proc);
	3317	if (rval == 0) {
	3318	rval = copyout(&info, args->arg2,
	3319	sizeof(info));
	3320	}
	3321	break;
	3322	}
	3323
	3324	case LEDGER_ENTRY_INFO: {
	3325	void *buf;
	3326	int sz;
	3327
	3328	rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
	3329	proc_rele(proc);
	3330	if ((rval == 0) && (len >= 0)) {
	3331	sz = len * sizeof(struct ledger_entry_info);
	3332	rval = copyout(buf, args->arg2, sz);
	3333	kfree(buf, sz);
	3334	}
	3335	if (rval == 0) {
	3336	rval = copyout(&len, args->arg3, sizeof(len));
	3337	}
	3338	break;
	3339	}
	3340
	3341	case LEDGER_TEMPLATE_INFO: {
	3342	void *buf;
	3343	int sz;
	3344
	3345	rval = ledger_template_info(&buf, &len);
	3346	if ((rval == 0) && (len >= 0)) {
	3347	sz = len * sizeof(struct ledger_template_info);
	3348	rval = copyout(buf, args->arg1, sz);
	3349	kfree(buf, sz);
	3350	}
	3351	if (rval == 0) {
	3352	rval = copyout(&len, args->arg2, sizeof(len));
	3353	}
	3354	break;
	3355	}
	3356
	3357	default:
	3358	panic("ledger syscall logic error -- command type %d", args->cmd);
	3359	proc_rele(proc);
	3360	rval = EINVAL;
	3361	}
	3362
	3363	return rval;
	3364	}
	3365
	3366	int
	3367	telemetry(__unused struct proc p, struct telemetry_args args, __unused int32_t *retval)
	3368	{
	3369	int error = 0;
	3370
	3371	switch (args->cmd) {
	3372	#if CONFIG_TELEMETRY
	3373	case TELEMETRY_CMD_TIMER_EVENT:
	3374	error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
	3375	break;
	3376	case TELEMETRY_CMD_PMI_SETUP:
	3377	error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
	3378	break;
	3379	#endif /* CONFIG_TELEMETRY */
	3380	case TELEMETRY_CMD_VOUCHER_NAME:
	3381	if (thread_set_voucher_name((mach_port_name_t)args->deadline)) {
	3382	error = EINVAL;
	3383	}
	3384	break;
	3385
	3386	default:
	3387	error = EINVAL;
	3388	break;
	3389	}
	3390
	3391	return error;
	3392	}
	3393
	3394	/*
	3395	* Logging
	3396	*
	3397	* Description: syscall to access kernel logging from userspace
	3398	*
	3399	* Args:
	3400	* tag - used for syncing with userspace on the version.
	3401	* flags - flags used by the syscall.
	3402	* buffer - userspace address of string to copy.
	3403	* size - size of buffer.
	3404	*/
	3405	int
	3406	log_data(__unused struct proc p, struct log_data_args args, int *retval)
	3407	{
	3408	unsigned int tag = args->tag;
	3409	unsigned int flags = args->flags;
	3410	user_addr_t buffer = args->buffer;
	3411	unsigned int size = args->size;
	3412	int ret = 0;
	3413	char *log_msg = NULL;
	3414	int error;
	3415	*retval = 0;
	3416
	3417	/*
	3418	* Tag synchronize the syscall version with userspace.
	3419	* Tag == 0 => flags == OS_LOG_TYPE
	3420	*/
	3421	if (tag != 0) {
	3422	return EINVAL;
	3423	}
	3424
	3425	/*
	3426	* OS_LOG_TYPE are defined in libkern/os/log.h
	3427	* In userspace they are defined in libtrace/os/log.h
	3428	*/
	3429	if (flags != OS_LOG_TYPE_DEFAULT &&
	3430	flags != OS_LOG_TYPE_INFO &&
	3431	flags != OS_LOG_TYPE_DEBUG &&
	3432	flags != OS_LOG_TYPE_ERROR &&
	3433	flags != OS_LOG_TYPE_FAULT) {
	3434	return EINVAL;
	3435	}
	3436
	3437	if (size == 0) {
	3438	return EINVAL;
	3439	}
	3440
	3441	/* truncate to OS_LOG_DATA_MAX_SIZE */
	3442	if (size > OS_LOG_DATA_MAX_SIZE) {
	3443	printf("%s: WARNING msg is going to be truncated from %u to %u\n", __func__, size, OS_LOG_DATA_MAX_SIZE);
	3444	size = OS_LOG_DATA_MAX_SIZE;
	3445	}
	3446
	3447	log_msg = kalloc(size);
	3448	if (!log_msg) {
	3449	return ENOMEM;
	3450	}
	3451
	3452	error = copyin(buffer, log_msg, size);
	3453	if (error) {
	3454	ret = EFAULT;
	3455	goto out;
	3456	}
	3457	log_msg[size - 1] = '\0';
	3458
	3459	/*
	3460	* This will log to dmesg and logd.
	3461	* The call will fail if the current
	3462	* process is not a driverKit process.
	3463	*/
	3464	os_log_driverKit(&ret, OS_LOG_DEFAULT, flags, "%s", log_msg);
	3465
	3466	out:
	3467	if (log_msg != NULL) {
	3468	kfree(log_msg, size);
	3469	}
	3470
	3471	return ret;
	3472	}
	3473
	3474	#if DEVELOPMENT \|\| DEBUG
	3475	#if CONFIG_WAITQ_DEBUG
	3476	static uint64_t g_wqset_num = 0;
	3477	struct g_wqset {
	3478	queue_chain_t link;
	3479	struct waitq_set *wqset;
	3480	};
	3481
	3482	static queue_head_t g_wqset_list;
	3483	static struct waitq_set *g_waitq_set = NULL;
	3484
	3485	static inline struct waitq_set *
	3486	sysctl_get_wqset(int idx)
	3487	{
	3488	struct g_wqset *gwqs;
	3489
	3490	if (!g_wqset_num) {
	3491	queue_init(&g_wqset_list);
	3492	}
	3493
	3494	/* don't bother with locks: this is test-only code! */
	3495	qe_foreach_element(gwqs, &g_wqset_list, link) {
	3496	if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx) {
	3497	return gwqs->wqset;
	3498	}
	3499	}
	3500
	3501	/* allocate a new one */
	3502	++g_wqset_num;
	3503	gwqs = (struct g_wqset )kalloc(sizeof(gwqs));
	3504	assert(gwqs != NULL);
	3505
	3506	gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO \| SYNC_POLICY_PREPOST, NULL);
	3507	enqueue_tail(&g_wqset_list, &gwqs->link);
	3508	printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
	3509
	3510	return gwqs->wqset;
	3511	}
	3512
	3513	#define MAX_GLOBAL_TEST_QUEUES 64
	3514	static int g_wq_init = 0;
	3515	static struct waitq g_wq[MAX_GLOBAL_TEST_QUEUES];
	3516
	3517	static inline struct waitq *
	3518	global_test_waitq(int idx)
	3519	{
	3520	if (idx < 0) {
	3521	return NULL;
	3522	}
	3523
	3524	if (!g_wq_init) {
	3525	g_wq_init = 1;
	3526	for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++) {
	3527	waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
	3528	}
	3529	}
	3530
	3531	return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
	3532	}
	3533
	3534	static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
	3535	{
	3536	#pragma unused(oidp, arg1, arg2)
	3537	int error;
	3538	int index;
	3539	struct waitq *waitq;
	3540	kern_return_t kr;
	3541	int64_t event64 = 0;
	3542
	3543	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3544	if (error) {
	3545	return error;
	3546	}
	3547
	3548	if (!req->newptr) {
	3549	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3550	}
	3551
	3552	if (event64 < 0) {
	3553	index = (int)((-event64) & 0xffffffff);
	3554	waitq = wqset_waitq(sysctl_get_wqset(index));
	3555	index = -index;
	3556	} else {
	3557	index = (int)event64;
	3558	waitq = global_test_waitq(index);
	3559	}
	3560
	3561	event64 = 0;
	3562
	3563	printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
	3564	index, event64);
	3565	kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
	3566	WAITQ_ALL_PRIORITIES);
	3567	printf("[WQ]: \tkr=%d\n", kr);
	3568
	3569	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3570	}
	3571	SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3572	0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
	3573
	3574
	3575	static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
	3576	{
	3577	#pragma unused(oidp, arg1, arg2)
	3578	int error;
	3579	int index;
	3580	struct waitq *waitq;
	3581	kern_return_t kr;
	3582	int64_t event64 = 0;
	3583
	3584	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3585	if (error) {
	3586	return error;
	3587	}
	3588
	3589	if (!req->newptr) {
	3590	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3591	}
	3592
	3593	if (event64 < 0) {
	3594	index = (int)((-event64) & 0xffffffff);
	3595	waitq = wqset_waitq(sysctl_get_wqset(index));
	3596	index = -index;
	3597	} else {
	3598	index = (int)event64;
	3599	waitq = global_test_waitq(index);
	3600	}
	3601
	3602	event64 = 0;
	3603
	3604	printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
	3605	index, event64);
	3606	kr = waitq_wakeup64_all(waitq, (event64_t)event64,
	3607	THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	3608	printf("[WQ]: \tkr=%d\n", kr);
	3609
	3610	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3611	}
	3612	SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3613	0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
	3614
	3615
	3616	static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
	3617	{
	3618	#pragma unused(oidp, arg1, arg2)
	3619	int error;
	3620	int index;
	3621	struct waitq *waitq;
	3622	kern_return_t kr;
	3623	int64_t event64 = 0;
	3624
	3625	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3626	if (error) {
	3627	return error;
	3628	}
	3629
	3630	if (!req->newptr) {
	3631	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3632	}
	3633
	3634	if (event64 < 0) {
	3635	index = (int)((-event64) & 0xffffffff);
	3636	waitq = wqset_waitq(sysctl_get_wqset(index));
	3637	index = -index;
	3638	} else {
	3639	index = (int)event64;
	3640	waitq = global_test_waitq(index);
	3641	}
	3642
	3643	event64 = 0;
	3644
	3645	printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
	3646	index, event64);
	3647	kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
	3648	if (kr == THREAD_WAITING) {
	3649	thread_block(THREAD_CONTINUE_NULL);
	3650	}
	3651	printf("[WQ]: \tWoke Up: kr=%d\n", kr);
	3652
	3653	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3654	}
	3655	SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3656	0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
	3657
	3658
	3659	static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
	3660	{
	3661	#pragma unused(oidp, arg1, arg2)
	3662	int error;
	3663	struct waitq_set *wqset;
	3664	uint64_t event64 = 0;
	3665
	3666	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3667	if (error) {
	3668	return error;
	3669	}
	3670
	3671	if (!req->newptr) {
	3672	goto out;
	3673	}
	3674
	3675	wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
	3676	g_waitq_set = wqset;
	3677
	3678	event64 = wqset_id(wqset);
	3679	printf("[WQ]: selected wqset 0x%llx\n", event64);
	3680
	3681	out:
	3682	if (g_waitq_set) {
	3683	event64 = wqset_id(g_waitq_set);
	3684	} else {
	3685	event64 = (uint64_t)(-1);
	3686	}
	3687
	3688	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3689	}
	3690	SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3691	0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
	3692
	3693
	3694	static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
	3695	{
	3696	#pragma unused(oidp, arg1, arg2)
	3697	int error;
	3698	int index;
	3699	struct waitq *waitq;
	3700	struct waitq_set *wqset;
	3701	kern_return_t kr;
	3702	uint64_t reserved_link = 0;
	3703	int64_t event64 = 0;
	3704
	3705	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3706	if (error) {
	3707	return error;
	3708	}
	3709
	3710	if (!req->newptr) {
	3711	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3712	}
	3713
	3714	if (!g_waitq_set) {
	3715	g_waitq_set = sysctl_get_wqset(1);
	3716	}
	3717	wqset = g_waitq_set;
	3718
	3719	if (event64 < 0) {
	3720	struct waitq_set *tmp;
	3721	index = (int)((-event64) & 0xffffffff);
	3722	tmp = sysctl_get_wqset(index);
	3723	if (tmp == wqset) {
	3724	goto out;
	3725	}
	3726	waitq = wqset_waitq(tmp);
	3727	index = -index;
	3728	} else {
	3729	index = (int)event64;
	3730	waitq = global_test_waitq(index);
	3731	}
	3732
	3733	printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
	3734	index, wqset_id(wqset));
	3735	reserved_link = waitq_link_reserve(waitq);
	3736	kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
	3737	waitq_link_release(reserved_link);
	3738
	3739	printf("[WQ]: \tkr=%d\n", kr);
	3740
	3741	out:
	3742	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3743	}
	3744	SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3745	0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
	3746
	3747
	3748	static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
	3749	{
	3750	#pragma unused(oidp, arg1, arg2)
	3751	int error;
	3752	int index;
	3753	struct waitq *waitq;
	3754	struct waitq_set *wqset;
	3755	kern_return_t kr;
	3756	uint64_t event64 = 0;
	3757
	3758	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3759	if (error) {
	3760	return error;
	3761	}
	3762
	3763	if (!req->newptr) {
	3764	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3765	}
	3766
	3767	if (!g_waitq_set) {
	3768	g_waitq_set = sysctl_get_wqset(1);
	3769	}
	3770	wqset = g_waitq_set;
	3771
	3772	index = (int)event64;
	3773	waitq = global_test_waitq(index);
	3774
	3775	printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
	3776	index, wqset_id(wqset));
	3777
	3778	kr = waitq_unlink(waitq, wqset);
	3779	printf("[WQ]: \tkr=%d\n", kr);
	3780
	3781	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3782	}
	3783	SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3784	0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
	3785
	3786
	3787	static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
	3788	{
	3789	#pragma unused(oidp, arg1, arg2)
	3790	struct waitq *waitq;
	3791	uint64_t event64 = 0;
	3792	int error, index;
	3793
	3794	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3795	if (error) {
	3796	return error;
	3797	}
	3798
	3799	if (!req->newptr) {
	3800	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3801	}
	3802
	3803	index = (int)event64;
	3804	waitq = global_test_waitq(index);
	3805
	3806	printf("[WQ]: clearing prepost on waitq [%d]\n", index);
	3807	waitq_clear_prepost(waitq);
	3808
	3809	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3810	}
	3811	SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3812	0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
	3813
	3814
	3815	static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
	3816	{
	3817	#pragma unused(oidp, arg1, arg2)
	3818	int error;
	3819	struct waitq_set *wqset;
	3820	kern_return_t kr;
	3821	uint64_t event64 = 0;
	3822
	3823	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3824	if (error) {
	3825	return error;
	3826	}
	3827
	3828	if (!req->newptr) {
	3829	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3830	}
	3831
	3832	if (!g_waitq_set) {
	3833	g_waitq_set = sysctl_get_wqset(1);
	3834	}
	3835	wqset = g_waitq_set;
	3836
	3837	printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
	3838	wqset_id(wqset));
	3839
	3840	kr = waitq_set_unlink_all(wqset);
	3841	printf("[WQ]: \tkr=%d\n", kr);
	3842
	3843	return SYSCTL_OUT(req, &kr, sizeof(kr));
	3844	}
	3845	SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3846	0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
	3847
	3848
	3849	static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
	3850	{
	3851	#pragma unused(oidp, arg1, arg2)
	3852	struct waitq_set *wqset = NULL;
	3853	uint64_t event64 = 0;
	3854	int error, index;
	3855
	3856	error = SYSCTL_IN(req, &event64, sizeof(event64));
	3857	if (error) {
	3858	return error;
	3859	}
	3860
	3861	if (!req->newptr) {
	3862	goto out;
	3863	}
	3864
	3865	index = (int)((event64) & 0xffffffff);
	3866	wqset = sysctl_get_wqset(index);
	3867	assert(wqset != NULL);
	3868
	3869	printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
	3870	waitq_set_clear_preposts(wqset);
	3871
	3872	out:
	3873	if (wqset) {
	3874	event64 = wqset_id(wqset);
	3875	} else {
	3876	event64 = (uint64_t)(-1);
	3877	}
	3878
	3879	return SYSCTL_OUT(req, &event64, sizeof(event64));
	3880	}
	3881	SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3882	0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
	3883
	3884	#endif /* CONFIG_WAITQ_DEBUG */
	3885
	3886	static int
	3887	sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
	3888	{
	3889	#pragma unused(oidp, arg1, arg2)
	3890	int nelem;
	3891
	3892	/* Read only */
	3893	if (req->newptr != USER_ADDR_NULL) {
	3894	return EPERM;
	3895	}
	3896
	3897	nelem = sysctl_helper_waitq_set_nelem();
	3898
	3899	return SYSCTL_OUT(req, &nelem, sizeof(nelem));
	3900	}
	3901
	3902	SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD \| CTLFLAG_LOCKED,
	3903	0, 0, sysctl_waitq_set_nelem, "I", "ltable elementis currently used");
	3904
	3905
	3906	static int
	3907	sysctl_mpsc_test_pingpong SYSCTL_HANDLER_ARGS
	3908	{
	3909	#pragma unused(oidp, arg1, arg2)
	3910	uint64_t value = 0;
	3911	int error;
	3912
	3913	error = SYSCTL_IN(req, &value, sizeof(value));
	3914	if (error) {
	3915	return error;
	3916	}
	3917
	3918	if (error == 0 && req->newptr) {
	3919	error = mpsc_test_pingpong(value, &value);
	3920	if (error == 0) {
	3921	error = SYSCTL_OUT(req, &value, sizeof(value));
	3922	}
	3923	}
	3924
	3925	return error;
	3926	}
	3927	SYSCTL_PROC(_kern, OID_AUTO, mpsc_test_pingpong, CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3928	0, 0, sysctl_mpsc_test_pingpong, "Q", "MPSC tests: pingpong");
	3929
	3930	#endif /* DEVELOPMENT \|\| DEBUG */
	3931
	3932	/* Remote Time api */
	3933	SYSCTL_NODE(_machdep, OID_AUTO, remotetime, CTLFLAG_RD \| CTLFLAG_LOCKED, 0, "Remote time api");
	3934
	3935	#if DEVELOPMENT \|\| DEBUG
	3936	#if CONFIG_MACH_BRIDGE_SEND_TIME
	3937	extern _Atomic uint32_t bt_init_flag;
	3938	extern uint32_t mach_bridge_timer_enable(uint32_t, int);
	3939
	3940	SYSCTL_INT(_machdep_remotetime, OID_AUTO, bridge_timer_init_flag,
	3941	CTLFLAG_RD \| CTLFLAG_LOCKED, &bt_init_flag, 0, "");
	3942
	3943	static int sysctl_mach_bridge_timer_enable SYSCTL_HANDLER_ARGS
	3944	{
	3945	#pragma unused(oidp, arg1, arg2)
	3946	uint32_t value = 0;
	3947	int error = 0;
	3948	/* User is querying buffer size */
	3949	if (req->oldptr == USER_ADDR_NULL && req->newptr == USER_ADDR_NULL) {
	3950	req->oldidx = sizeof(value);
	3951	return 0;
	3952	}
	3953	if (os_atomic_load(&bt_init_flag, acquire)) {
	3954	if (req->newptr) {
	3955	int new_value = 0;
	3956	error = SYSCTL_IN(req, &new_value, sizeof(new_value));
	3957	if (error) {
	3958	return error;
	3959	}
	3960	if (new_value == 0 \|\| new_value == 1) {
	3961	value = mach_bridge_timer_enable(new_value, 1);
	3962	} else {
	3963	return EPERM;
	3964	}
	3965	} else {
	3966	value = mach_bridge_timer_enable(0, 0);
	3967	}
	3968	}
	3969	error = SYSCTL_OUT(req, &value, sizeof(value));
	3970	return error;
	3971	}
	3972
	3973	SYSCTL_PROC(_machdep_remotetime, OID_AUTO, bridge_timer_enable,
	3974	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3975	0, 0, sysctl_mach_bridge_timer_enable, "I", "");
	3976
	3977	#endif /* CONFIG_MACH_BRIDGE_SEND_TIME */
	3978
	3979	static int sysctl_mach_bridge_remote_time SYSCTL_HANDLER_ARGS
	3980	{
	3981	#pragma unused(oidp, arg1, arg2)
	3982	uint64_t ltime = 0, rtime = 0;
	3983	if (req->oldptr == USER_ADDR_NULL) {
	3984	req->oldidx = sizeof(rtime);
	3985	return 0;
	3986	}
	3987	if (req->newptr) {
	3988	int error = SYSCTL_IN(req, &ltime, sizeof(ltime));
	3989	if (error) {
	3990	return error;
	3991	}
	3992	}
	3993	rtime = mach_bridge_remote_time(ltime);
	3994	return SYSCTL_OUT(req, &rtime, sizeof(rtime));
	3995	}
	3996	SYSCTL_PROC(_machdep_remotetime, OID_AUTO, mach_bridge_remote_time,
	3997	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED,
	3998	0, 0, sysctl_mach_bridge_remote_time, "Q", "");
	3999
	4000	#endif /* DEVELOPMENT \|\| DEBUG */
	4001
	4002	#if CONFIG_MACH_BRIDGE_RECV_TIME
	4003	extern struct bt_params bt_params_get_latest(void);
	4004
	4005	static int sysctl_mach_bridge_conversion_params SYSCTL_HANDLER_ARGS
	4006	{
	4007	#pragma unused(oidp, arg1, arg2)
	4008	struct bt_params params = {};
	4009	if (req->oldptr == USER_ADDR_NULL) {
	4010	req->oldidx = sizeof(struct bt_params);
	4011	return 0;
	4012	}
	4013	if (req->newptr) {
	4014	return EPERM;
	4015	}
	4016	params = bt_params_get_latest();
	4017	return SYSCTL_OUT(req, &params, MIN(sizeof(params), req->oldlen));
	4018	}
	4019
	4020	SYSCTL_PROC(_machdep_remotetime, OID_AUTO, conversion_params,
	4021	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, 0,
	4022	0, sysctl_mach_bridge_conversion_params, "S,bt_params", "");
	4023
	4024	#endif /* CONFIG_MACH_BRIDGE_RECV_TIME */
	4025
	4026	#if DEVELOPMENT \|\| DEBUG
	4027	#endif /* DEVELOPMENT \|\| DEBUG */
	4028
	4029	extern uint32_t task_exc_guard_default;
	4030
	4031	SYSCTL_INT(_kern, OID_AUTO, task_exc_guard_default,
	4032	CTLFLAG_RD \| CTLFLAG_LOCKED, &task_exc_guard_default, 0, "");
	4033
	4034
	4035	static int
	4036	sysctl_kern_tcsm_available SYSCTL_HANDLER_ARGS
	4037	{
	4038	#pragma unused(oidp, arg1, arg2)
	4039	uint32_t value = machine_csv(CPUVN_CI) ? 1 : 0;
	4040
	4041	if (req->newptr) {
	4042	return EINVAL;
	4043	}
	4044
	4045	return SYSCTL_OUT(req, &value, sizeof(value));
	4046	}
	4047	SYSCTL_PROC(_kern, OID_AUTO, tcsm_available,
	4048	CTLTYPE_INT \| CTLFLAG_RD \| CTLFLAG_LOCKED \| CTLFLAG_MASKED \| CTLFLAG_ANYBODY,
	4049	0, 0, sysctl_kern_tcsm_available, "I", "");
	4050
	4051
	4052	static int
	4053	sysctl_kern_tcsm_enable SYSCTL_HANDLER_ARGS
	4054	{
	4055	#pragma unused(oidp, arg1, arg2)
	4056	uint32_t soflags = 0;
	4057	uint32_t old_value = thread_get_no_smt() ? 1 : 0;
	4058
	4059	int error = SYSCTL_IN(req, &soflags, sizeof(soflags));
	4060	if (error) {
	4061	return error;
	4062	}
	4063
	4064	if (soflags && machine_csv(CPUVN_CI)) {
	4065	thread_set_no_smt(true);
	4066	machine_tecs(current_thread());
	4067	}
	4068
	4069	return SYSCTL_OUT(req, &old_value, sizeof(old_value));
	4070	}
	4071	SYSCTL_PROC(_kern, OID_AUTO, tcsm_enable,
	4072	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_MASKED \| CTLFLAG_ANYBODY,
	4073	0, 0, sysctl_kern_tcsm_enable, "I", "");
	4074
	4075
	4076	#if DEVELOPMENT \|\| DEBUG
	4077	extern void sysctl_task_set_no_smt(char no_smt);
	4078	extern char sysctl_task_get_no_smt(void);
	4079
	4080	static int
	4081	sysctl_kern_sched_task_set_no_smt SYSCTL_HANDLER_ARGS
	4082	{
	4083	#pragma unused(oidp, arg1, arg2)
	4084	char buff[4];
	4085
	4086	int error = SYSCTL_IN(req, buff, 1);
	4087	if (error) {
	4088	return error;
	4089	}
	4090	char no_smt = buff[0];
	4091
	4092	if (!req->newptr) {
	4093	goto out;
	4094	}
	4095
	4096	sysctl_task_set_no_smt(no_smt);
	4097	out:
	4098	no_smt = sysctl_task_get_no_smt();
	4099	buff[0] = no_smt;
	4100
	4101	return SYSCTL_OUT(req, buff, 1);
	4102	}
	4103
	4104	SYSCTL_PROC(_kern, OID_AUTO, sched_task_set_no_smt, CTLTYPE_STRING \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_ANYBODY,
	4105	0, 0, sysctl_kern_sched_task_set_no_smt, "A", "");
	4106
	4107	static int
	4108	sysctl_kern_sched_thread_set_no_smt(__unused struct sysctl_oid oidp, __unused void arg1, __unused int arg2, struct sysctl_req *req)
	4109	{
	4110	int new_value, changed;
	4111	int old_value = thread_get_no_smt() ? 1 : 0;
	4112	int error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed);
	4113
	4114	if (changed) {
	4115	thread_set_no_smt(!!new_value);
	4116	}
	4117
	4118	return error;
	4119	}
	4120
	4121	SYSCTL_PROC(_kern, OID_AUTO, sched_thread_set_no_smt,
	4122	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_ANYBODY,
	4123	0, 0, sysctl_kern_sched_thread_set_no_smt, "I", "");
	4124	#endif /* DEVELOPMENT \|\| DEBUG */