git.saurik.com Git - apple/xnu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2000-2013 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
	29	/*
	30	* Copyright (c) 1982, 1986, 1989, 1993
	31	* The Regents of the University of California. All rights reserved.
	32	* (c) UNIX System Laboratories, Inc.
	33	* All or some portions of this file are derived from material licensed
	34	* to the University of California by American Telephone and Telegraph
	35	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	36	* the permission of UNIX System Laboratories, Inc.
	37	*
	38	* Redistribution and use in source and binary forms, with or without
	39	* modification, are permitted provided that the following conditions
	40	* are met:
	41	* 1. Redistributions of source code must retain the above copyright
	42	* notice, this list of conditions and the following disclaimer.
	43	* 2. Redistributions in binary form must reproduce the above copyright
	44	* notice, this list of conditions and the following disclaimer in the
	45	* documentation and/or other materials provided with the distribution.
	46	* 3. All advertising materials mentioning features or use of this software
	47	* must display the following acknowledgement:
	48	* This product includes software developed by the University of
	49	* California, Berkeley and its contributors.
	50	* 4. Neither the name of the University nor the names of its contributors
	51	* may be used to endorse or promote products derived from this software
	52	* without specific prior written permission.
	53	*
	54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	64	* SUCH DAMAGE.
	65	*
	66	* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
	67	*/
	68	/*
	69	* NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
	70	* support for mandatory and extensible security protections. This notice
	71	* is included in support of clause 2.2 (b) of the Apple Public License,
	72	* Version 2.0.
	73	*/
	74
	75	#include <sys/param.h>
	76	#include <sys/systm.h>
	77	#include <sys/filedesc.h>
	78	#include <sys/ioctl.h>
	79	#include <sys/file_internal.h>
	80	#include <sys/proc_internal.h>
	81	#include <sys/socketvar.h>
	82	#include <sys/uio_internal.h>
	83	#include <sys/kernel.h>
	84	#include <sys/stat.h>
	85	#include <sys/malloc.h>
	86	#include <sys/sysproto.h>
	87
	88	#include <sys/mount_internal.h>
	89	#include <sys/protosw.h>
	90	#include <sys/ev.h>
	91	#include <sys/user.h>
	92	#include <sys/kdebug.h>
	93	#include <sys/poll.h>
	94	#include <sys/event.h>
	95	#include <sys/eventvar.h>
	96	#include <sys/proc.h>
	97	#include <sys/kauth.h>
	98
	99	#include <mach/mach_types.h>
	100	#include <kern/kern_types.h>
	101	#include <kern/assert.h>
	102	#include <kern/kalloc.h>
	103	#include <kern/thread.h>
	104	#include <kern/clock.h>
	105	#include <kern/ledger.h>
	106	#include <kern/task.h>
	107	#if CONFIG_TELEMETRY
	108	#include <kern/telemetry.h>
	109	#endif
	110
	111	#include <sys/mbuf.h>
	112	#include <sys/domain.h>
	113	#include <sys/socket.h>
	114	#include <sys/socketvar.h>
	115	#include <sys/errno.h>
	116	#include <sys/syscall.h>
	117	#include <sys/pipe.h>
	118
	119	#include <security/audit/audit.h>
	120
	121	#include <net/if.h>
	122	#include <net/route.h>
	123
	124	#include <netinet/in.h>
	125	#include <netinet/in_systm.h>
	126	#include <netinet/ip.h>
	127	#include <netinet/in_pcb.h>
	128	#include <netinet/ip_var.h>
	129	#include <netinet/ip6.h>
	130	#include <netinet/tcp.h>
	131	#include <netinet/tcp_fsm.h>
	132	#include <netinet/tcp_seq.h>
	133	#include <netinet/tcp_timer.h>
	134	#include <netinet/tcp_var.h>
	135	#include <netinet/tcpip.h>
	136	#include <netinet/tcp_debug.h>
	137	/* for wait queue based select */
	138	#include <kern/wait_queue.h>
	139	#include <kern/kalloc.h>
	140	#include <sys/vnode_internal.h>
	141
	142	#include <pexpert/pexpert.h>
	143
	144	/* XXX should be in a header file somewhere */
	145	void evsofree(struct socket *);
	146	void evpipefree(struct pipe *);
	147	void postpipeevent(struct pipe *, int);
	148	void postevent(struct socket , struct sockbuf , int);
	149	extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
	150
	151	int rd_uio(struct proc p, int fdes, uio_t uio, user_ssize_t retval);
	152	int wr_uio(struct proc p, int fdes, uio_t uio, user_ssize_t retval);
	153	extern void *get_bsduthreadarg(thread_t);
	154	extern int *get_bsduthreadrval(thread_t);
	155
	156	__private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
	157	user_addr_t bufp, user_size_t nbyte,
	158	off_t offset, int flags, user_ssize_t *retval);
	159	__private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
	160	user_addr_t bufp, user_size_t nbyte,
	161	off_t offset, int flags, user_ssize_t *retval);
	162	__private_extern__ int preparefileread(struct proc p, struct fileproc *fp_ret, int fd, int check_for_vnode);
	163	__private_extern__ void donefileread(struct proc p, struct fileproc fp_ret, int fd);
	164
	165
	166	/* Conflict wait queue for when selects collide (opaque type) */
	167	struct wait_queue select_conflict_queue;
	168
	169	#if 13841988
	170	int temp_debug_13841988 = 0;
	171	#endif
	172
	173	/*
	174	* Init routine called from bsd_init.c
	175	*/
	176	void select_wait_queue_init(void);
	177	void
	178	select_wait_queue_init(void)
	179	{
	180	wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO);
	181	#if 13841988
	182	if (PE_parse_boot_argn("temp_debug_13841988", &temp_debug_13841988, sizeof(temp_debug_13841988))) {
	183	kprintf("Temporary debugging for 13841988 enabled\n");
	184	}
	185	#endif
	186	}
	187
	188	#define f_flag f_fglob->fg_flag
	189	#define f_type f_fglob->fg_ops->fo_type
	190	#define f_msgcount f_fglob->fg_msgcount
	191	#define f_cred f_fglob->fg_cred
	192	#define f_ops f_fglob->fg_ops
	193	#define f_offset f_fglob->fg_offset
	194	#define f_data f_fglob->fg_data
	195
	196	/*
	197	* Read system call.
	198	*
	199	* Returns: 0 Success
	200	* preparefileread:EBADF
	201	* preparefileread:ESPIPE
	202	* preparefileread:ENXIO
	203	* preparefileread:EBADF
	204	* dofileread:???
	205	*/
	206	int
	207	read(struct proc p, struct read_args uap, user_ssize_t *retval)
	208	{
	209	__pthread_testcancel(1);
	210	return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
	211	}
	212
	213	int
	214	read_nocancel(struct proc p, struct read_nocancel_args uap, user_ssize_t *retval)
	215	{
	216	struct fileproc *fp;
	217	int error;
	218	int fd = uap->fd;
	219	struct vfs_context context;
	220
	221	if ( (error = preparefileread(p, &fp, fd, 0)) )
	222	return (error);
	223
	224	context = *(vfs_context_current());
	225	context.vc_ucred = fp->f_fglob->fg_cred;
	226
	227	error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
	228	(off_t)-1, 0, retval);
	229
	230	donefileread(p, fp, fd);
	231
	232	return (error);
	233	}
	234
	235	/*
	236	* Pread system call
	237	*
	238	* Returns: 0 Success
	239	* preparefileread:EBADF
	240	* preparefileread:ESPIPE
	241	* preparefileread:ENXIO
	242	* preparefileread:EBADF
	243	* dofileread:???
	244	*/
	245	int
	246	pread(struct proc p, struct pread_args uap, user_ssize_t *retval)
	247	{
	248	__pthread_testcancel(1);
	249	return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
	250	}
	251
	252	int
	253	pread_nocancel(struct proc p, struct pread_nocancel_args uap, user_ssize_t *retval)
	254	{
	255	struct fileproc fp = NULL; / fp set by preparefileread() */
	256	int fd = uap->fd;
	257	int error;
	258	struct vfs_context context;
	259
	260	if ( (error = preparefileread(p, &fp, fd, 1)) )
	261	goto out;
	262
	263	context = *(vfs_context_current());
	264	context.vc_ucred = fp->f_fglob->fg_cred;
	265
	266	error = dofileread(&context, fp, uap->buf, uap->nbyte,
	267	uap->offset, FOF_OFFSET, retval);
	268
	269	donefileread(p, fp, fd);
	270
	271	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) \| DBG_FUNC_NONE),
	272	uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
	273
	274	out:
	275	return (error);
	276	}
	277
	278	/*
	279	* Code common for read and pread
	280	*/
	281
	282	void
	283	donefileread(struct proc p, struct fileproc fp, int fd)
	284	{
	285	proc_fdlock_spin(p);
	286	fp_drop(p, fd, fp, 1);
	287	proc_fdunlock(p);
	288	}
	289
	290	/*
	291	* Returns: 0 Success
	292	* EBADF
	293	* ESPIPE
	294	* ENXIO
	295	* fp_lookup:EBADF
	296	* fo_read:???
	297	*/
	298	int
	299	preparefileread(struct proc p, struct fileproc *fp_ret, int fd, int check_for_pread)
	300	{
	301	vnode_t vp;
	302	int error;
	303	struct fileproc *fp;
	304
	305	AUDIT_ARG(fd, fd);
	306
	307	proc_fdlock_spin(p);
	308
	309	error = fp_lookup(p, fd, &fp, 1);
	310
	311	if (error) {
	312	proc_fdunlock(p);
	313	return (error);
	314	}
	315	if ((fp->f_flag & FREAD) == 0) {
	316	error = EBADF;
	317	goto out;
	318	}
	319	if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
	320	error = ESPIPE;
	321	goto out;
	322	}
	323	if (fp->f_type == DTYPE_VNODE) {
	324	vp = (struct vnode *)fp->f_fglob->fg_data;
	325
	326	if (check_for_pread && (vnode_isfifo(vp))) {
	327	error = ESPIPE;
	328	goto out;
	329	}
	330	if (check_for_pread && (vp->v_flag & VISTTY)) {
	331	error = ENXIO;
	332	goto out;
	333	}
	334	}
	335
	336	*fp_ret = fp;
	337
	338	proc_fdunlock(p);
	339	return (0);
	340
	341	out:
	342	fp_drop(p, fd, fp, 1);
	343	proc_fdunlock(p);
	344	return (error);
	345	}
	346
	347
	348	/*
	349	* Returns: 0 Success
	350	* EINVAL
	351	* fo_read:???
	352	*/
	353	__private_extern__ int
	354	dofileread(vfs_context_t ctx, struct fileproc *fp,
	355	user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
	356	user_ssize_t *retval)
	357	{
	358	uio_t auio;
	359	user_ssize_t bytecnt;
	360	long error = 0;
	361	char uio_buf[ UIO_SIZEOF(1) ];
	362
	363	if (nbyte > INT_MAX)
	364	return (EINVAL);
	365
	366	if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
	367	auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
	368	&uio_buf[0], sizeof(uio_buf));
	369	} else {
	370	auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
	371	&uio_buf[0], sizeof(uio_buf));
	372	}
	373	uio_addiov(auio, bufp, nbyte);
	374
	375	bytecnt = nbyte;
	376
	377	if ((error = fo_read(fp, auio, flags, ctx))) {
	378	if (uio_resid(auio) != bytecnt && (error == ERESTART \|\|
	379	error == EINTR \|\| error == EWOULDBLOCK))
	380	error = 0;
	381	}
	382	bytecnt -= uio_resid(auio);
	383
	384	*retval = bytecnt;
	385
	386	return (error);
	387	}
	388
	389	/*
	390	* Scatter read system call.
	391	*
	392	* Returns: 0 Success
	393	* EINVAL
	394	* ENOMEM
	395	* copyin:EFAULT
	396	* rd_uio:???
	397	*/
	398	int
	399	readv(struct proc p, struct readv_args uap, user_ssize_t *retval)
	400	{
	401	__pthread_testcancel(1);
	402	return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
	403	}
	404
	405	int
	406	readv_nocancel(struct proc p, struct readv_nocancel_args uap, user_ssize_t *retval)
	407	{
	408	uio_t auio = NULL;
	409	int error;
	410	struct user_iovec *iovp;
	411
	412	/* Verify range bedfore calling uio_create() */
	413	if (uap->iovcnt <= 0 \|\| uap->iovcnt > UIO_MAXIOV)
	414	return (EINVAL);
	415
	416	/* allocate a uio large enough to hold the number of iovecs passed */
	417	auio = uio_create(uap->iovcnt, 0,
	418	(IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
	419	UIO_READ);
	420
	421	/* get location of iovecs within the uio. then copyin the iovecs from
	422	* user space.
	423	*/
	424	iovp = uio_iovsaddr(auio);
	425	if (iovp == NULL) {
	426	error = ENOMEM;
	427	goto ExitThisRoutine;
	428	}
	429	error = copyin_user_iovec_array(uap->iovp,
	430	IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
	431	uap->iovcnt, iovp);
	432	if (error) {
	433	goto ExitThisRoutine;
	434	}
	435
	436	/* finalize uio_t for use and do the IO
	437	*/
	438	error = uio_calculateresid(auio);
	439	if (error) {
	440	goto ExitThisRoutine;
	441	}
	442	error = rd_uio(p, uap->fd, auio, retval);
	443
	444	ExitThisRoutine:
	445	if (auio != NULL) {
	446	uio_free(auio);
	447	}
	448	return (error);
	449	}
	450
	451	/*
	452	* Write system call
	453	*
	454	* Returns: 0 Success
	455	* EBADF
	456	* fp_lookup:EBADF
	457	* dofilewrite:???
	458	*/
	459	int
	460	write(struct proc p, struct write_args uap, user_ssize_t *retval)
	461	{
	462	__pthread_testcancel(1);
	463	return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
	464
	465	}
	466
	467	int
	468	write_nocancel(struct proc p, struct write_nocancel_args uap, user_ssize_t *retval)
	469	{
	470	struct fileproc *fp;
	471	int error;
	472	int fd = uap->fd;
	473
	474	AUDIT_ARG(fd, fd);
	475
	476	error = fp_lookup(p,fd,&fp,0);
	477	if (error)
	478	return(error);
	479	if ((fp->f_flag & FWRITE) == 0) {
	480	error = EBADF;
	481	} else {
	482	struct vfs_context context = *(vfs_context_current());
	483	context.vc_ucred = fp->f_fglob->fg_cred;
	484
	485	error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
	486	(off_t)-1, 0, retval);
	487	}
	488	if (error == 0)
	489	fp_drop_written(p, fd, fp);
	490	else
	491	fp_drop(p, fd, fp, 0);
	492	return(error);
	493	}
	494
	495	/*
	496	* pwrite system call
	497	*
	498	* Returns: 0 Success
	499	* EBADF
	500	* ESPIPE
	501	* ENXIO
	502	* EINVAL
	503	* fp_lookup:EBADF
	504	* dofilewrite:???
	505	*/
	506	int
	507	pwrite(struct proc p, struct pwrite_args uap, user_ssize_t *retval)
	508	{
	509	__pthread_testcancel(1);
	510	return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
	511	}
	512
	513	int
	514	pwrite_nocancel(struct proc p, struct pwrite_nocancel_args uap, user_ssize_t *retval)
	515	{
	516	struct fileproc *fp;
	517	int error;
	518	int fd = uap->fd;
	519	vnode_t vp = (vnode_t)0;
	520
	521	AUDIT_ARG(fd, fd);
	522
	523	error = fp_lookup(p,fd,&fp,0);
	524	if (error)
	525	return(error);
	526
	527	if ((fp->f_flag & FWRITE) == 0) {
	528	error = EBADF;
	529	} else {
	530	struct vfs_context context = *vfs_context_current();
	531	context.vc_ucred = fp->f_fglob->fg_cred;
	532
	533	if (fp->f_type != DTYPE_VNODE) {
	534	error = ESPIPE;
	535	goto errout;
	536	}
	537	vp = (vnode_t)fp->f_fglob->fg_data;
	538	if (vnode_isfifo(vp)) {
	539	error = ESPIPE;
	540	goto errout;
	541	}
	542	if ((vp->v_flag & VISTTY)) {
	543	error = ENXIO;
	544	goto errout;
	545	}
	546	if (uap->offset == (off_t)-1) {
	547	error = EINVAL;
	548	goto errout;
	549	}
	550
	551	error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
	552	uap->offset, FOF_OFFSET, retval);
	553	}
	554	errout:
	555	if (error == 0)
	556	fp_drop_written(p, fd, fp);
	557	else
	558	fp_drop(p, fd, fp, 0);
	559
	560	KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) \| DBG_FUNC_NONE),
	561	uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
	562
	563	return(error);
	564	}
	565
	566	/*
	567	* Returns: 0 Success
	568	* EINVAL
	569	* <fo_write>:EPIPE
	570	* <fo_write>:??? [indirect through struct fileops]
	571	*/
	572	__private_extern__ int
	573	dofilewrite(vfs_context_t ctx, struct fileproc *fp,
	574	user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
	575	user_ssize_t *retval)
	576	{
	577	uio_t auio;
	578	long error = 0;
	579	user_ssize_t bytecnt;
	580	char uio_buf[ UIO_SIZEOF(1) ];
	581
	582	if (nbyte > INT_MAX)
	583	return (EINVAL);
	584
	585	if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
	586	auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
	587	&uio_buf[0], sizeof(uio_buf));
	588	} else {
	589	auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
	590	&uio_buf[0], sizeof(uio_buf));
	591	}
	592	uio_addiov(auio, bufp, nbyte);
	593
	594	bytecnt = nbyte;
	595	if ((error = fo_write(fp, auio, flags, ctx))) {
	596	if (uio_resid(auio) != bytecnt && (error == ERESTART \|\|
	597	error == EINTR \|\| error == EWOULDBLOCK))
	598	error = 0;
	599	/* The socket layer handles SIGPIPE */
	600	if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
	601	(fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
	602	/* XXX Raise the signal on the thread? */
	603	psignal(vfs_context_proc(ctx), SIGPIPE);
	604	}
	605	}
	606	bytecnt -= uio_resid(auio);
	607	*retval = bytecnt;
	608
	609	return (error);
	610	}
	611
	612	/*
	613	* Gather write system call
	614	*/
	615	int
	616	writev(struct proc p, struct writev_args uap, user_ssize_t *retval)
	617	{
	618	__pthread_testcancel(1);
	619	return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
	620	}
	621
	622	int
	623	writev_nocancel(struct proc p, struct writev_nocancel_args uap, user_ssize_t *retval)
	624	{
	625	uio_t auio = NULL;
	626	int error;
	627	struct user_iovec *iovp;
	628
	629	AUDIT_ARG(fd, uap->fd);
	630
	631	/* Verify range bedfore calling uio_create() */
	632	if (uap->iovcnt <= 0 \|\| uap->iovcnt > UIO_MAXIOV)
	633	return (EINVAL);
	634
	635	/* allocate a uio large enough to hold the number of iovecs passed */
	636	auio = uio_create(uap->iovcnt, 0,
	637	(IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
	638	UIO_WRITE);
	639
	640	/* get location of iovecs within the uio. then copyin the iovecs from
	641	* user space.
	642	*/
	643	iovp = uio_iovsaddr(auio);
	644	if (iovp == NULL) {
	645	error = ENOMEM;
	646	goto ExitThisRoutine;
	647	}
	648	error = copyin_user_iovec_array(uap->iovp,
	649	IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
	650	uap->iovcnt, iovp);
	651	if (error) {
	652	goto ExitThisRoutine;
	653	}
	654
	655	/* finalize uio_t for use and do the IO
	656	*/
	657	error = uio_calculateresid(auio);
	658	if (error) {
	659	goto ExitThisRoutine;
	660	}
	661	error = wr_uio(p, uap->fd, auio, retval);
	662
	663	ExitThisRoutine:
	664	if (auio != NULL) {
	665	uio_free(auio);
	666	}
	667	return (error);
	668	}
	669
	670
	671	int
	672	wr_uio(struct proc p, int fdes, uio_t uio, user_ssize_t retval)
	673	{
	674	struct fileproc *fp;
	675	int error;
	676	user_ssize_t count;
	677	struct vfs_context context = *vfs_context_current();
	678
	679	error = fp_lookup(p,fdes,&fp,0);
	680	if (error)
	681	return(error);
	682
	683	if ((fp->f_flag & FWRITE) == 0) {
	684	error = EBADF;
	685	goto out;
	686	}
	687	count = uio_resid(uio);
	688
	689	context.vc_ucred = fp->f_cred;
	690	error = fo_write(fp, uio, 0, &context);
	691	if (error) {
	692	if (uio_resid(uio) != count && (error == ERESTART \|\|
	693	error == EINTR \|\| error == EWOULDBLOCK))
	694	error = 0;
	695	/* The socket layer handles SIGPIPE */
	696	if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
	697	(fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
	698	psignal(p, SIGPIPE);
	699	}
	700	*retval = count - uio_resid(uio);
	701
	702	out:
	703	if (error == 0)
	704	fp_drop_written(p, fdes, fp);
	705	else
	706	fp_drop(p, fdes, fp, 0);
	707	return(error);
	708	}
	709
	710
	711	int
	712	rd_uio(struct proc p, int fdes, uio_t uio, user_ssize_t retval)
	713	{
	714	struct fileproc *fp;
	715	int error;
	716	user_ssize_t count;
	717	struct vfs_context context = *vfs_context_current();
	718
	719	if ( (error = preparefileread(p, &fp, fdes, 0)) )
	720	return (error);
	721
	722	count = uio_resid(uio);
	723
	724	context.vc_ucred = fp->f_cred;
	725
	726	error = fo_read(fp, uio, 0, &context);
	727
	728	if (error) {
	729	if (uio_resid(uio) != count && (error == ERESTART \|\|
	730	error == EINTR \|\| error == EWOULDBLOCK))
	731	error = 0;
	732	}
	733	*retval = count - uio_resid(uio);
	734
	735	donefileread(p, fp, fdes);
	736
	737	return (error);
	738	}
	739
	740	/*
	741	* Ioctl system call
	742	*
	743	* Returns: 0 Success
	744	* EBADF
	745	* ENOTTY
	746	* ENOMEM
	747	* ESRCH
	748	* copyin:EFAULT
	749	* copyoutEFAULT
	750	* fp_lookup:EBADF Bad file descriptor
	751	* fo_ioctl:???
	752	*/
	753	int
	754	ioctl(struct proc p, struct ioctl_args uap, __unused int32_t *retval)
	755	{
	756	struct fileproc *fp = NULL;
	757	int error = 0;
	758	u_int size = 0;
	759	caddr_t datap = NULL, memp = NULL;
	760	boolean_t is64bit = FALSE;
	761	int tmp = 0;
	762	#define STK_PARAMS 128
	763	char stkbuf[STK_PARAMS];
	764	int fd = uap->fd;
	765	u_long com = uap->com;
	766	struct vfs_context context = *vfs_context_current();
	767
	768	AUDIT_ARG(fd, uap->fd);
	769	AUDIT_ARG(addr, uap->data);
	770
	771	is64bit = proc_is64bit(p);
	772	#if CONFIG_AUDIT
	773	if (is64bit)
	774	AUDIT_ARG(value64, com);
	775	else
	776	AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
	777	#endif /* CONFIG_AUDIT */
	778
	779	/*
	780	* Interpret high order word to find amount of data to be
	781	* copied to/from the user's address space.
	782	*/
	783	size = IOCPARM_LEN(com);
	784	if (size > IOCPARM_MAX)
	785	return ENOTTY;
	786	if (size > sizeof (stkbuf)) {
	787	if ((memp = (caddr_t)kalloc(size)) == 0)
	788	return ENOMEM;
	789	datap = memp;
	790	} else
	791	datap = &stkbuf[0];
	792	if (com & IOC_IN) {
	793	if (size) {
	794	error = copyin(uap->data, datap, size);
	795	if (error)
	796	goto out_nofp;
	797	} else {
	798	/* XXX - IOC_IN and no size? we should proably return an error here!! */
	799	if (is64bit) {
	800	(user_addr_t )datap = uap->data;
	801	}
	802	else {
	803	(uint32_t )datap = (uint32_t)uap->data;
	804	}
	805	}
	806	} else if ((com & IOC_OUT) && size)
	807	/*
	808	* Zero the buffer so the user always
	809	* gets back something deterministic.
	810	*/
	811	bzero(datap, size);
	812	else if (com & IOC_VOID) {
	813	/* XXX - this is odd since IOC_VOID means no parameters */
	814	if (is64bit) {
	815	(user_addr_t )datap = uap->data;
	816	}
	817	else {
	818	(uint32_t )datap = (uint32_t)uap->data;
	819	}
	820	}
	821
	822	proc_fdlock(p);
	823	error = fp_lookup(p,fd,&fp,1);
	824	if (error) {
	825	proc_fdunlock(p);
	826	goto out_nofp;
	827	}
	828
	829	AUDIT_ARG(file, p, fp);
	830
	831	if ((fp->f_flag & (FREAD \| FWRITE)) == 0) {
	832	error = EBADF;
	833	goto out;
	834	}
	835
	836	context.vc_ucred = fp->f_fglob->fg_cred;
	837
	838	#if CONFIG_MACF
	839	error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
	840	if (error)
	841	goto out;
	842	#endif
	843
	844	switch (com) {
	845	case FIONCLEX:
	846	*fdflags(p, fd) &= ~UF_EXCLOSE;
	847	break;
	848
	849	case FIOCLEX:
	850	*fdflags(p, fd) \|= UF_EXCLOSE;
	851	break;
	852
	853	case FIONBIO:
	854	if ( (tmp = (int )datap) )
	855	fp->f_flag \|= FNONBLOCK;
	856	else
	857	fp->f_flag &= ~FNONBLOCK;
	858	error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
	859	break;
	860
	861	case FIOASYNC:
	862	if ( (tmp = (int )datap) )
	863	fp->f_flag \|= FASYNC;
	864	else
	865	fp->f_flag &= ~FASYNC;
	866	error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
	867	break;
	868
	869	case FIOSETOWN:
	870	tmp = (int )datap;
	871	if (fp->f_type == DTYPE_SOCKET) {
	872	((struct socket *)fp->f_data)->so_pgid = tmp;
	873	break;
	874	}
	875	if (fp->f_type == DTYPE_PIPE) {
	876	error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
	877	break;
	878	}
	879	if (tmp <= 0) {
	880	tmp = -tmp;
	881	} else {
	882	struct proc *p1 = proc_find(tmp);
	883	if (p1 == 0) {
	884	error = ESRCH;
	885	break;
	886	}
	887	tmp = p1->p_pgrpid;
	888	proc_rele(p1);
	889	}
	890	error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
	891	break;
	892
	893	case FIOGETOWN:
	894	if (fp->f_type == DTYPE_SOCKET) {
	895	(int )datap = ((struct socket *)fp->f_data)->so_pgid;
	896	break;
	897	}
	898	error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
	899	(int )datap = -(int )datap;
	900	break;
	901
	902	default:
	903	error = fo_ioctl(fp, com, datap, &context);
	904	/*
	905	* Copy any data to user, size was
	906	* already set and checked above.
	907	*/
	908	if (error == 0 && (com & IOC_OUT) && size)
	909	error = copyout(datap, uap->data, (u_int)size);
	910	break;
	911	}
	912	out:
	913	fp_drop(p, fd, fp, 1);
	914	proc_fdunlock(p);
	915
	916	out_nofp:
	917	if (memp)
	918	kfree(memp, size);
	919	return(error);
	920	}
	921
	922	int selwait, nselcoll;
	923	#define SEL_FIRSTPASS 1
	924	#define SEL_SECONDPASS 2
	925	extern int selcontinue(int error);
	926	extern int selprocess(int error, int sel_pass);
	927	static int selscan(struct proc p, struct _select sel,
	928	int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub);
	929	static int selcount(struct proc p, u_int32_t ibits, int nfd, int *count);
	930	static int seldrop_locked(struct proc p, u_int32_t ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
	931	static int seldrop(struct proc p, u_int32_t ibits, int nfd);
	932
	933	/*
	934	* Select system call.
	935	*
	936	* Returns: 0 Success
	937	* EINVAL Invalid argument
	938	* EAGAIN Nonconformant error if allocation fails
	939	* selprocess:???
	940	*/
	941	int
	942	select(struct proc p, struct select_args uap, int32_t *retval)
	943	{
	944	__pthread_testcancel(1);
	945	return(select_nocancel(p, (struct select_nocancel_args *)uap, retval));
	946	}
	947
	948	int
	949	select_nocancel(struct proc p, struct select_nocancel_args uap, int32_t *retval)
	950	{
	951	int error = 0;
	952	u_int ni, nw, size;
	953	thread_t th_act;
	954	struct uthread *uth;
	955	struct _select *sel;
	956	int needzerofill = 1;
	957	int count = 0;
	958
	959	th_act = current_thread();
	960	uth = get_bsdthread_info(th_act);
	961	sel = &uth->uu_select;
	962	sel->data = &uth->uu_kevent.ss_select_data;
	963	retval = (int *)get_bsduthreadrval(th_act);
	964	*retval = 0;
	965
	966	if (uap->nd < 0) {
	967	return (EINVAL);
	968	}
	969
	970	/* select on thread of process that already called proc_exit() */
	971	if (p->p_fd == NULL) {
	972	return (EBADF);
	973	}
	974
	975	if (uap->nd > p->p_fd->fd_nfiles)
	976	uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
	977
	978	nw = howmany(uap->nd, NFDBITS);
	979	ni = nw * sizeof(fd_mask);
	980
	981	/*
	982	* if the previously allocated space for the bits is smaller than
	983	* what is requested or no space has yet been allocated for this
	984	* thread, allocate enough space now.
	985	*
	986	* Note: If this process fails, select() will return EAGAIN; this
	987	* is the same thing pool() returns in a no-memory situation, but
	988	* it is not a POSIX compliant error code for select().
	989	*/
	990	if (sel->nbytes < (3 * ni)) {
	991	int nbytes = 3 * ni;
	992
	993	/* Free previous allocation, if any */
	994	if (sel->ibits != NULL)
	995	FREE(sel->ibits, M_TEMP);
	996	if (sel->obits != NULL) {
	997	FREE(sel->obits, M_TEMP);
	998	/* NULL out; subsequent ibits allocation may fail */
	999	sel->obits = NULL;
	1000	}
	1001
	1002	MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK \| M_ZERO);
	1003	if (sel->ibits == NULL)
	1004	return (EAGAIN);
	1005	MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK \| M_ZERO);
	1006	if (sel->obits == NULL) {
	1007	FREE(sel->ibits, M_TEMP);
	1008	sel->ibits = NULL;
	1009	return (EAGAIN);
	1010	}
	1011	sel->nbytes = nbytes;
	1012	needzerofill = 0;
	1013	}
	1014
	1015	if (needzerofill) {
	1016	bzero((caddr_t)sel->ibits, sel->nbytes);
	1017	bzero((caddr_t)sel->obits, sel->nbytes);
	1018	}
	1019
	1020	/*
	1021	* get the bits from the user address space
	1022	*/
	1023	#define getbits(name, x) \
	1024	do { \
	1025	if (uap->name && (error = copyin(uap->name, \
	1026	(caddr_t)&sel->ibits[(x) * nw], ni))) \
	1027	goto continuation; \
	1028	} while (0)
	1029
	1030	getbits(in, 0);
	1031	getbits(ou, 1);
	1032	getbits(ex, 2);
	1033	#undef getbits
	1034
	1035	if (uap->tv) {
	1036	struct timeval atv;
	1037	if (IS_64BIT_PROCESS(p)) {
	1038	struct user64_timeval atv64;
	1039	error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
	1040	/* Loses resolution - assume timeout < 68 years */
	1041	atv.tv_sec = atv64.tv_sec;
	1042	atv.tv_usec = atv64.tv_usec;
	1043	} else {
	1044	struct user32_timeval atv32;
	1045	error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
	1046	atv.tv_sec = atv32.tv_sec;
	1047	atv.tv_usec = atv32.tv_usec;
	1048	}
	1049	if (error)
	1050	goto continuation;
	1051	if (itimerfix(&atv)) {
	1052	error = EINVAL;
	1053	goto continuation;
	1054	}
	1055
	1056	clock_absolutetime_interval_to_deadline(
	1057	tvtoabstime(&atv), &sel->data->abstime);
	1058	}
	1059	else
	1060	sel->data->abstime = 0;
	1061
	1062	if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
	1063	goto continuation;
	1064	}
	1065
	1066	sel->data->count = count;
	1067	size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
	1068	if (uth->uu_allocsize) {
	1069	if (uth->uu_wqset == 0)
	1070	panic("select: wql memory smashed");
	1071	/* needed for the select now */
	1072	if (size > uth->uu_allocsize) {
	1073	kfree(uth->uu_wqset, uth->uu_allocsize);
	1074	uth->uu_allocsize = size;
	1075	uth->uu_wqset = (wait_queue_set_t)kalloc(size);
	1076	if (uth->uu_wqset == (wait_queue_set_t)NULL)
	1077	panic("failed to allocate memory for waitqueue\n");
	1078	}
	1079	} else {
	1080	uth->uu_allocsize = size;
	1081	uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize);
	1082	if (uth->uu_wqset == (wait_queue_set_t)NULL)
	1083	panic("failed to allocate memory for waitqueue\n");
	1084	}
	1085	bzero(uth->uu_wqset, size);
	1086	sel->data->wql = (char *)uth->uu_wqset + SIZEOF_WAITQUEUE_SET;
	1087	wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO \| SYNC_POLICY_PREPOST));
	1088
	1089	continuation:
	1090
	1091	if (error) {
	1092	/*
	1093	* We have already cleaned up any state we established,
	1094	* either locally or as a result of selcount(). We don't
	1095	* need to wait_subqueue_unlink_all(), since we haven't set
	1096	* anything at this point.
	1097	*/
	1098	return (error);
	1099	}
	1100
	1101	return selprocess(0, SEL_FIRSTPASS);
	1102	}
	1103
	1104	int
	1105	selcontinue(int error)
	1106	{
	1107	return selprocess(error, SEL_SECONDPASS);
	1108	}
	1109
	1110
	1111	/*
	1112	* selprocess
	1113	*
	1114	* Parameters: error The error code from our caller
	1115	* sel_pass The pass we are on
	1116	*/
	1117	int
	1118	selprocess(int error, int sel_pass)
	1119	{
	1120	int ncoll;
	1121	u_int ni, nw;
	1122	thread_t th_act;
	1123	struct uthread *uth;
	1124	struct proc *p;
	1125	struct select_args *uap;
	1126	int *retval;
	1127	struct _select *sel;
	1128	int unwind = 1;
	1129	int prepost = 0;
	1130	int somewakeup = 0;
	1131	int doretry = 0;
	1132	wait_result_t wait_result;
	1133
	1134	p = current_proc();
	1135	th_act = current_thread();
	1136	uap = (struct select_args *)get_bsduthreadarg(th_act);
	1137	retval = (int *)get_bsduthreadrval(th_act);
	1138	uth = get_bsdthread_info(th_act);
	1139	sel = &uth->uu_select;
	1140
	1141	if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
	1142	unwind = 0;
	1143	if (sel->data->count == 0)
	1144	unwind = 0;
	1145	retry:
	1146	if (error != 0) {
	1147	sel_pass = SEL_FIRSTPASS; /* Reset for seldrop */
	1148	goto done;
	1149	}
	1150
	1151	ncoll = nselcoll;
	1152	OSBitOrAtomic(P_SELECT, &p->p_flag);
	1153	/* skip scans if the select is just for timeouts */
	1154	if (sel->data->count) {
	1155	/*
	1156	* Clear out any dangling refs from prior calls; technically
	1157	* there should not be any.
	1158	*/
	1159	if (sel_pass == SEL_FIRSTPASS)
	1160	wait_queue_sub_clearrefs(uth->uu_wqset);
	1161
	1162	error = selscan(p, sel, uap->nd, retval, sel_pass, (wait_queue_sub_t)uth->uu_wqset);
	1163	if (error \|\| *retval) {
	1164	goto done;
	1165	}
	1166	if (prepost) {
	1167	/* if the select of log, then we canwakeup and discover some one
	1168	* else already read the data; go toselct again if time permits
	1169	*/
	1170	prepost = 0;
	1171	doretry = 1;
	1172	}
	1173	if (somewakeup) {
	1174	somewakeup = 0;
	1175	doretry = 1;
	1176	}
	1177	}
	1178
	1179	if (uap->tv) {
	1180	uint64_t now;
	1181
	1182	clock_get_uptime(&now);
	1183	if (now >= sel->data->abstime)
	1184	goto done;
	1185	}
	1186
	1187	if (doretry) {
	1188	/* cleanup obits and try again */
	1189	doretry = 0;
	1190	sel_pass = SEL_FIRSTPASS;
	1191	goto retry;
	1192	}
	1193
	1194	/*
	1195	* To effect a poll, the timeout argument should be
	1196	* non-nil, pointing to a zero-valued timeval structure.
	1197	*/
	1198	if (uap->tv && sel->data->abstime == 0) {
	1199	goto done;
	1200	}
	1201
	1202	/* No spurious wakeups due to colls,no need to check for them */
	1203	if ((sel_pass == SEL_SECONDPASS) \|\| ((p->p_flag & P_SELECT) == 0)) {
	1204	sel_pass = SEL_FIRSTPASS;
	1205	goto retry;
	1206	}
	1207
	1208	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1209
	1210	/* if the select is just for timeout skip check */
	1211	if (sel->data->count &&(sel_pass == SEL_SECONDPASS))
	1212	panic("selprocess: 2nd pass assertwaiting");
	1213
	1214	/* Wait Queue Subordinate has waitqueue as first element */
	1215	wait_result = wait_queue_assert_wait_with_leeway((wait_queue_t)uth->uu_wqset,
	1216	NULL, THREAD_ABORTSAFE,
	1217	TIMEOUT_URGENCY_USER_NORMAL, sel->data->abstime, 0);
	1218	if (wait_result != THREAD_AWAKENED) {
	1219	/* there are no preposted events */
	1220	error = tsleep1(NULL, PSOCK \| PCATCH,
	1221	"select", 0, selcontinue);
	1222	} else {
	1223	prepost = 1;
	1224	error = 0;
	1225	}
	1226
	1227	if (error == 0) {
	1228	sel_pass = SEL_SECONDPASS;
	1229	if (!prepost)
	1230	somewakeup = 1;
	1231	goto retry;
	1232	}
	1233	done:
	1234	if (unwind) {
	1235	wait_subqueue_unlink_all(uth->uu_wqset);
	1236	seldrop(p, sel->ibits, uap->nd);
	1237	}
	1238	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1239	/* select is not restarted after signals... */
	1240	if (error == ERESTART)
	1241	error = EINTR;
	1242	if (error == EWOULDBLOCK)
	1243	error = 0;
	1244	nw = howmany(uap->nd, NFDBITS);
	1245	ni = nw * sizeof(fd_mask);
	1246
	1247	#define putbits(name, x) \
	1248	do { \
	1249	if (uap->name && (error2 = \
	1250	copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
	1251	error = error2; \
	1252	} while (0)
	1253
	1254	if (error == 0) {
	1255	int error2;
	1256
	1257	putbits(in, 0);
	1258	putbits(ou, 1);
	1259	putbits(ex, 2);
	1260	#undef putbits
	1261	}
	1262	return(error);
	1263	}
	1264
	1265
	1266	/*
	1267	* selscan
	1268	*
	1269	* Parameters: p Process performing the select
	1270	* sel The per-thread select context structure
	1271	* nfd The number of file descriptors to scan
	1272	* retval The per thread system call return area
	1273	* sel_pass Which pass this is; allowed values are
	1274	* SEL_FIRSTPASS and SEL_SECONDPASS
	1275	* wqsub The per thread wait queue set
	1276	*
	1277	* Returns: 0 Success
	1278	* EIO Invalid p->p_fd field XXX Obsolete?
	1279	* EBADF One of the files in the bit vector is
	1280	* invalid.
	1281	*/
	1282	static int
	1283	selscan(struct proc p, struct _select sel, int nfd, int32_t *retval,
	1284	int sel_pass, wait_queue_sub_t wqsub)
	1285	{
	1286	struct filedesc *fdp = p->p_fd;
	1287	int msk, i, j, fd;
	1288	u_int32_t bits;
	1289	struct fileproc *fp;
	1290	int n = 0; /* count of bits */
	1291	int nc = 0; /* bit vector offset (nc'th bit) */
	1292	static int flag[3] = { FREAD, FWRITE, 0 };
	1293	u_int32_t iptr, optr;
	1294	u_int nw;
	1295	u_int32_t ibits, obits;
	1296	char * wql;
	1297	char * wql_ptr;
	1298	int count;
	1299	struct vfs_context context = *vfs_context_current();
	1300
	1301	/*
	1302	* Problems when reboot; due to MacOSX signal probs
	1303	* in Beaker1C ; verify that the p->p_fd is valid
	1304	*/
	1305	if (fdp == NULL) {
	1306	*retval=0;
	1307	return(EIO);
	1308	}
	1309	ibits = sel->ibits;
	1310	obits = sel->obits;
	1311	wql = sel->data->wql;
	1312
	1313	nw = howmany(nfd, NFDBITS);
	1314
	1315	count = sel->data->count;
	1316
	1317	nc = 0;
	1318	if (count) {
	1319	proc_fdlock(p);
	1320	for (msk = 0; msk < 3; msk++) {
	1321	iptr = (u_int32_t )&ibits[msk nw];
	1322	optr = (u_int32_t )&obits[msk nw];
	1323
	1324	for (i = 0; i < nfd; i += NFDBITS) {
	1325	bits = iptr[i/NFDBITS];
	1326
	1327	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	1328	bits &= ~(1 << j);
	1329	fp = fdp->fd_ofiles[fd];
	1330
	1331	if (fp == NULL \|\| (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
	1332	/*
	1333	* If we abort because of a bad
	1334	* fd, let the caller unwind...
	1335	*/
	1336	proc_fdunlock(p);
	1337	return(EBADF);
	1338	}
	1339	if (sel_pass == SEL_SECONDPASS) {
	1340	wql_ptr = (char *)0;
	1341	if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) {
	1342	fp->f_flags &= ~FP_INSELECT;
	1343	fp->f_waddr = (void *)0;
	1344	}
	1345	} else {
	1346	wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
	1347	if (fp->f_flags & FP_INSELECT) {
	1348	/* someone is already in select on this fp */
	1349	fp->f_flags \|= FP_SELCONFLICT;
	1350	wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub);
	1351	} else {
	1352	fp->f_flags \|= FP_INSELECT;
	1353	fp->f_waddr = (void *)wqsub;
	1354	}
	1355	}
	1356
	1357	context.vc_ucred = fp->f_cred;
	1358
	1359	/* The select; set the bit, if true */
	1360	if (fp->f_ops && fp->f_type
	1361	&& fo_select(fp, flag[msk], wql_ptr, &context)) {
	1362	optr[fd/NFDBITS] \|= (1 << (fd % NFDBITS));
	1363	n++;
	1364	}
	1365	nc++;
	1366	}
	1367	}
	1368	}
	1369	proc_fdunlock(p);
	1370	}
	1371	*retval = n;
	1372	return (0);
	1373	}
	1374
	1375	int poll_callback(struct kqueue , struct kevent64_s , void *);
	1376
	1377	struct poll_continue_args {
	1378	user_addr_t pca_fds;
	1379	u_int pca_nfds;
	1380	u_int pca_rfds;
	1381	};
	1382
	1383	int
	1384	poll(struct proc p, struct poll_args uap, int32_t *retval)
	1385	{
	1386	__pthread_testcancel(1);
	1387	return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
	1388	}
	1389
	1390
	1391	int
	1392	poll_nocancel(struct proc p, struct poll_nocancel_args uap, int32_t *retval)
	1393	{
	1394	struct poll_continue_args *cont;
	1395	struct pollfd *fds;
	1396	struct kqueue *kq;
	1397	struct timeval atv;
	1398	int ncoll, error = 0;
	1399	u_int nfds = uap->nfds;
	1400	u_int rfds = 0;
	1401	u_int i;
	1402	size_t ni;
	1403
	1404	/*
	1405	* This is kinda bogus. We have fd limits, but that is not
	1406	* really related to the size of the pollfd array. Make sure
	1407	* we let the process use at least FD_SETSIZE entries and at
	1408	* least enough for the current limits. We want to be reasonably
	1409	* safe, but not overly restrictive.
	1410	*/
	1411	if (nfds > OPEN_MAX \|\|
	1412	(nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) \|\| nfds > FD_SETSIZE)))
	1413	return (EINVAL);
	1414
	1415	kq = kqueue_alloc(p);
	1416	if (kq == NULL)
	1417	return (EAGAIN);
	1418
	1419	ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
	1420	MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
	1421	if (NULL == cont) {
	1422	error = EAGAIN;
	1423	goto out;
	1424	}
	1425
	1426	fds = (struct pollfd *)&cont[1];
	1427	error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
	1428	if (error)
	1429	goto out;
	1430
	1431	if (uap->timeout != -1) {
	1432	struct timeval rtv;
	1433
	1434	atv.tv_sec = uap->timeout / 1000;
	1435	atv.tv_usec = (uap->timeout % 1000) * 1000;
	1436	if (itimerfix(&atv)) {
	1437	error = EINVAL;
	1438	goto out;
	1439	}
	1440	getmicrouptime(&rtv);
	1441	timevaladd(&atv, &rtv);
	1442	} else {
	1443	atv.tv_sec = 0;
	1444	atv.tv_usec = 0;
	1445	}
	1446
	1447	/* JMM - all this P_SELECT stuff is bogus */
	1448	ncoll = nselcoll;
	1449	OSBitOrAtomic(P_SELECT, &p->p_flag);
	1450	for (i = 0; i < nfds; i++) {
	1451	short events = fds[i].events;
	1452	struct kevent64_s kev;
	1453	int kerror = 0;
	1454
	1455	/* per spec, ignore fd values below zero */
	1456	if (fds[i].fd < 0) {
	1457	fds[i].revents = 0;
	1458	continue;
	1459	}
	1460
	1461	/* convert the poll event into a kqueue kevent */
	1462	kev.ident = fds[i].fd;
	1463	kev.flags = EV_ADD \| EV_ONESHOT \| EV_POLL;
	1464	kev.udata = CAST_USER_ADDR_T(&fds[i]);
	1465	kev.fflags = 0;
	1466	kev.data = 0;
	1467	kev.ext[0] = 0;
	1468	kev.ext[1] = 0;
	1469
	1470	/* Handle input events */
	1471	if (events & ( POLLIN \| POLLRDNORM \| POLLPRI \| POLLRDBAND \| POLLHUP )) {
	1472	kev.filter = EVFILT_READ;
	1473	if (!(events & ( POLLIN \| POLLRDNORM )))
	1474	kev.flags \|= EV_OOBAND;
	1475	kerror = kevent_register(kq, &kev, p);
	1476	}
	1477
	1478	/* Handle output events */
	1479	if (kerror == 0 &&
	1480	events & ( POLLOUT \| POLLWRNORM \| POLLWRBAND )) {
	1481	kev.filter = EVFILT_WRITE;
	1482	kerror = kevent_register(kq, &kev, p);
	1483	}
	1484
	1485	/* Handle BSD extension vnode events */
	1486	if (kerror == 0 &&
	1487	events & ( POLLEXTEND \| POLLATTRIB \| POLLNLINK \| POLLWRITE )) {
	1488	kev.filter = EVFILT_VNODE;
	1489	kev.fflags = 0;
	1490	if (events & POLLEXTEND)
	1491	kev.fflags \|= NOTE_EXTEND;
	1492	if (events & POLLATTRIB)
	1493	kev.fflags \|= NOTE_ATTRIB;
	1494	if (events & POLLNLINK)
	1495	kev.fflags \|= NOTE_LINK;
	1496	if (events & POLLWRITE)
	1497	kev.fflags \|= NOTE_WRITE;
	1498	kerror = kevent_register(kq, &kev, p);
	1499	}
	1500
	1501	if (kerror != 0) {
	1502	fds[i].revents = POLLNVAL;
	1503	rfds++;
	1504	} else
	1505	fds[i].revents = 0;
	1506	}
	1507
	1508	/* Did we have any trouble registering? */
	1509	if (rfds > 0)
	1510	goto done;
	1511
	1512	/* scan for, and possibly wait for, the kevents to trigger */
	1513	cont->pca_fds = uap->fds;
	1514	cont->pca_nfds = nfds;
	1515	cont->pca_rfds = rfds;
	1516	error = kqueue_scan(kq, poll_callback, NULL, cont, &atv, p);
	1517	rfds = cont->pca_rfds;
	1518
	1519	done:
	1520	OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
	1521	/* poll is not restarted after signals... */
	1522	if (error == ERESTART)
	1523	error = EINTR;
	1524	if (error == EWOULDBLOCK)
	1525	error = 0;
	1526	if (error == 0) {
	1527	error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
	1528	*retval = rfds;
	1529	}
	1530
	1531	out:
	1532	if (NULL != cont)
	1533	FREE(cont, M_TEMP);
	1534
	1535	kqueue_dealloc(kq);
	1536	return (error);
	1537	}
	1538
	1539	int
	1540	poll_callback(__unused struct kqueue kq, struct kevent64_s kevp, void *data)
	1541	{
	1542	struct poll_continue_args cont = (struct poll_continue_args )data;
	1543	struct pollfd fds = CAST_DOWN(struct pollfd , kevp->udata);
	1544	short prev_revents = fds->revents;
	1545	short mask;
	1546
	1547	/* convert the results back into revents */
	1548	if (kevp->flags & EV_EOF)
	1549	fds->revents \|= POLLHUP;
	1550	if (kevp->flags & EV_ERROR)
	1551	fds->revents \|= POLLERR;
	1552
	1553	switch (kevp->filter) {
	1554	case EVFILT_READ:
	1555	if (fds->revents & POLLHUP)
	1556	mask = (POLLIN \| POLLRDNORM \| POLLPRI \| POLLRDBAND );
	1557	else {
	1558	mask = 0;
	1559	if (kevp->data != 0)
	1560	mask \|= (POLLIN \| POLLRDNORM );
	1561	if (kevp->flags & EV_OOBAND)
	1562	mask \|= ( POLLPRI \| POLLRDBAND );
	1563	}
	1564	fds->revents \|= (fds->events & mask);
	1565	break;
	1566
	1567	case EVFILT_WRITE:
	1568	if (!(fds->revents & POLLHUP))
	1569	fds->revents \|= (fds->events & ( POLLOUT \| POLLWRNORM \| POLLWRBAND ));
	1570	break;
	1571
	1572	case EVFILT_VNODE:
	1573	if (kevp->fflags & NOTE_EXTEND)
	1574	fds->revents \|= (fds->events & POLLEXTEND);
	1575	if (kevp->fflags & NOTE_ATTRIB)
	1576	fds->revents \|= (fds->events & POLLATTRIB);
	1577	if (kevp->fflags & NOTE_LINK)
	1578	fds->revents \|= (fds->events & POLLNLINK);
	1579	if (kevp->fflags & NOTE_WRITE)
	1580	fds->revents \|= (fds->events & POLLWRITE);
	1581	break;
	1582	}
	1583
	1584	if (fds->revents != 0 && prev_revents == 0)
	1585	cont->pca_rfds++;
	1586
	1587	return 0;
	1588	}
	1589
	1590	int
	1591	seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
	1592	{
	1593
	1594	return (1);
	1595	}
	1596
	1597	/*
	1598	* selcount
	1599	*
	1600	* Count the number of bits set in the input bit vector, and establish an
	1601	* outstanding fp->f_iocount for each of the descriptors which will be in
	1602	* use in the select operation.
	1603	*
	1604	* Parameters: p The process doing the select
	1605	* ibits The input bit vector
	1606	* nfd The number of fd's in the vector
	1607	* countp Pointer to where to store the bit count
	1608	*
	1609	* Returns: 0 Success
	1610	* EIO Bad per process open file table
	1611	* EBADF One of the bits in the input bit vector
	1612	* references an invalid fd
	1613	*
	1614	* Implicit: *countp (modified) Count of fd's
	1615	*
	1616	* Notes: This function is the first pass under the proc_fdlock() that
	1617	* permits us to recognize invalid descriptors in the bit vector;
	1618	* the may, however, not remain valid through the drop and
	1619	* later reacquisition of the proc_fdlock().
	1620	*/
	1621	static int
	1622	selcount(struct proc p, u_int32_t ibits, int nfd, int *countp)
	1623	{
	1624	struct filedesc *fdp = p->p_fd;
	1625	int msk, i, j, fd;
	1626	u_int32_t bits;
	1627	struct fileproc *fp;
	1628	int n = 0;
	1629	u_int32_t *iptr;
	1630	u_int nw;
	1631	int error=0;
	1632	int dropcount;
	1633	int need_wakeup = 0;
	1634
	1635	/*
	1636	* Problems when reboot; due to MacOSX signal probs
	1637	* in Beaker1C ; verify that the p->p_fd is valid
	1638	*/
	1639	if (fdp == NULL) {
	1640	*countp = 0;
	1641	return(EIO);
	1642	}
	1643	nw = howmany(nfd, NFDBITS);
	1644
	1645	proc_fdlock(p);
	1646	for (msk = 0; msk < 3; msk++) {
	1647	iptr = (u_int32_t )&ibits[msk nw];
	1648	for (i = 0; i < nfd; i += NFDBITS) {
	1649	bits = iptr[i/NFDBITS];
	1650	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	1651	bits &= ~(1 << j);
	1652	fp = fdp->fd_ofiles[fd];
	1653	if (fp == NULL \|\|
	1654	(fdp->fd_ofileflags[fd] & UF_RESERVED)) {
	1655	*countp = 0;
	1656	error = EBADF;
	1657	goto bad;
	1658	}
	1659	fp->f_iocount++;
	1660	n++;
	1661	}
	1662	}
	1663	}
	1664	proc_fdunlock(p);
	1665
	1666	*countp = n;
	1667	return (0);
	1668
	1669	bad:
	1670	dropcount = 0;
	1671
	1672	if (n== 0)
	1673	goto out;
	1674	/* Ignore error return; it's already EBADF */
	1675	(void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
	1676
	1677	out:
	1678	proc_fdunlock(p);
	1679	if (need_wakeup) {
	1680	wakeup(&p->p_fpdrainwait);
	1681	}
	1682	return(error);
	1683	}
	1684
	1685
	1686	/*
	1687	* seldrop_locked
	1688	*
	1689	* Drop outstanding wait queue references set up during selscan(); drop the
	1690	* outstanding per fileproc f_iocount() picked up during the selcount().
	1691	*
	1692	* Parameters: p Process performing the select
	1693	* ibits Input pit bector of fd's
	1694	* nfd Number of fd's
	1695	* lim Limit to number of vector entries to
	1696	* consider, or -1 for "all"
	1697	* inselect True if
	1698	* need_wakeup Pointer to flag to set to do a wakeup
	1699	* if f_iocont on any descriptor goes to 0
	1700	*
	1701	* Returns: 0 Success
	1702	* EBADF One or more fds in the bit vector
	1703	* were invalid, but the rest
	1704	* were successfully dropped
	1705	*
	1706	* Notes: An fd make become bad while the proc_fdlock() is not held,
	1707	* if a multithreaded application closes the fd out from under
	1708	* the in progress select. In this case, we still have to
	1709	* clean up after the set up on the remaining fds.
	1710	*/
	1711	static int
	1712	seldrop_locked(struct proc p, u_int32_t ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
	1713	{
	1714	struct filedesc *fdp = p->p_fd;
	1715	int msk, i, j, fd;
	1716	u_int32_t bits;
	1717	struct fileproc *fp;
	1718	u_int32_t *iptr;
	1719	u_int nw;
	1720	int error = 0;
	1721	int dropcount = 0;
	1722	uthread_t uth = get_bsdthread_info(current_thread());
	1723
	1724	*need_wakeup = 0;
	1725
	1726	/*
	1727	* Problems when reboot; due to MacOSX signal probs
	1728	* in Beaker1C ; verify that the p->p_fd is valid
	1729	*/
	1730	if (fdp == NULL) {
	1731	return(EIO);
	1732	}
	1733
	1734	nw = howmany(nfd, NFDBITS);
	1735
	1736	for (msk = 0; msk < 3; msk++) {
	1737	iptr = (u_int32_t )&ibits[msk nw];
	1738	for (i = 0; i < nfd; i += NFDBITS) {
	1739	bits = iptr[i/NFDBITS];
	1740	while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
	1741	bits &= ~(1 << j);
	1742	fp = fdp->fd_ofiles[fd];
	1743	/*
	1744	* If we've already dropped as many as were
	1745	* counted/scanned, then we are done.
	1746	*/
	1747	if ((fromselcount != 0) && (++dropcount > lim))
	1748	goto done;
	1749
	1750	if (fp == NULL) {
	1751	/* skip (now) bad fds */
	1752	error = EBADF;
	1753	continue;
	1754	}
	1755	/*
	1756	* Only clear the flag if we set it. We'll
	1757	* only find that we set it if we had made
	1758	* at least one [partial] pass through selscan().
	1759	*/
	1760	if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) {
	1761	fp->f_flags &= ~FP_INSELECT;
	1762	fp->f_waddr = (void *)0;
	1763	}
	1764
	1765	fp->f_iocount--;
	1766	if (fp->f_iocount < 0)
	1767	panic("f_iocount overdecrement!");
	1768
	1769	if (fp->f_iocount == 0) {
	1770	/*
	1771	* The last iocount is responsible for clearing
	1772	* selconfict flag - even if we didn't set it -
	1773	* and is also responsible for waking up anyone
	1774	* waiting on iocounts to drain.
	1775	*/
	1776	if (fp->f_flags & FP_SELCONFLICT)
	1777	fp->f_flags &= ~FP_SELCONFLICT;
	1778	if (p->p_fpdrainwait) {
	1779	p->p_fpdrainwait = 0;
	1780	*need_wakeup = 1;
	1781	}
	1782	}
	1783	}
	1784	}
	1785	}
	1786	done:
	1787	return (error);
	1788	}
	1789
	1790
	1791	static int
	1792	seldrop(struct proc p, u_int32_t ibits, int nfd)
	1793	{
	1794	int error;
	1795	int need_wakeup = 0;
	1796
	1797	proc_fdlock(p);
	1798	error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
	1799	proc_fdunlock(p);
	1800	if (need_wakeup) {
	1801	wakeup(&p->p_fpdrainwait);
	1802	}
	1803	return (error);
	1804	}
	1805
	1806	/*
	1807	* Record a select request.
	1808	*/
	1809	void
	1810	selrecord(__unused struct proc selector, struct selinfo sip, void * p_wql)
	1811	{
	1812	thread_t cur_act = current_thread();
	1813	struct uthread * ut = get_bsdthread_info(cur_act);
	1814
	1815	/* need to look at collisions */
	1816
	1817	/do not record if this is second pass of select /
	1818	if(p_wql == (void *)0) {
	1819	return;
	1820	}
	1821
	1822	if ((sip->si_flags & SI_INITED) == 0) {
	1823	wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
	1824	sip->si_flags \|= SI_INITED;
	1825	sip->si_flags &= ~SI_CLEAR;
	1826	}
	1827
	1828	if (sip->si_flags & SI_RECORDED) {
	1829	sip->si_flags \|= SI_COLL;
	1830	} else
	1831	sip->si_flags &= ~SI_COLL;
	1832
	1833	sip->si_flags \|= SI_RECORDED;
	1834	if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqset))
	1835	wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqset,
	1836	(wait_queue_link_t)p_wql);
	1837
	1838	return;
	1839	}
	1840
	1841	void
	1842	selwakeup(struct selinfo *sip)
	1843	{
	1844
	1845	if ((sip->si_flags & SI_INITED) == 0) {
	1846	return;
	1847	}
	1848
	1849	if (sip->si_flags & SI_COLL) {
	1850	nselcoll++;
	1851	sip->si_flags &= ~SI_COLL;
	1852	#if 0
	1853	/* will not support */
	1854	//wakeup((caddr_t)&selwait);
	1855	#endif
	1856	}
	1857
	1858	if (sip->si_flags & SI_RECORDED) {
	1859	wait_queue_wakeup_all(&sip->si_wait_queue, NULL, THREAD_AWAKENED);
	1860	sip->si_flags &= ~SI_RECORDED;
	1861	}
	1862
	1863	}
	1864
	1865	void
	1866	selthreadclear(struct selinfo *sip)
	1867	{
	1868
	1869	if ((sip->si_flags & SI_INITED) == 0) {
	1870	return;
	1871	}
	1872	if (sip->si_flags & SI_RECORDED) {
	1873	selwakeup(sip);
	1874	sip->si_flags &= ~(SI_RECORDED \| SI_COLL);
	1875	}
	1876	sip->si_flags \|= SI_CLEAR;
	1877	wait_queue_unlink_all(&sip->si_wait_queue);
	1878	}
	1879
	1880
	1881
	1882
	/* Trace sub-codes for the event (watchevent/waitevent) machinery */
	#define DBG_POST	0x10
	#define DBG_WATCH	0x11
	#define DBG_WAIT	0x12
	#define DBG_MOD		0x13
	#define DBG_EWAKEUP	0x14
	#define DBG_ENQUEUE	0x15
	#define DBG_DEQUEUE	0x16

	#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
	#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
	#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
	#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
	#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
	#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
	#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)


	/*
	 * Remove evq from proc p's event list if it is queued there, under the
	 * proc lock.  Expands to a single statement; the caller supplies the
	 * terminating semicolon, so it is safe inside un-braced if/else.
	 */
	#define EVPROCDEQUE(p, evq)	do {				\
		proc_lock(p);						\
		if ((evq)->ee_flags & EV_QUEUED) {			\
			TAILQ_REMOVE(&(p)->p_evlist, (evq), ee_plist);	\
			(evq)->ee_flags &= ~EV_QUEUED;			\
		}							\
		proc_unlock(p);						\
	} while (0)
	1908
	1909
	1910	/*
	1911	* called upon socket close. deque and free all events for
	1912	* the socket... socket must be locked by caller.
	1913	*/
	1914	void
	1915	evsofree(struct socket *sp)
	1916	{
	1917	struct eventqelt evq, next;
	1918	proc_t p;
	1919
	1920	if (sp == NULL)
	1921	return;
	1922
	1923	for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
	1924	next = evq->ee_slist.tqe_next;
	1925	p = evq->ee_proc;
	1926
	1927	if (evq->ee_flags & EV_QUEUED) {
	1928	EVPROCDEQUE(p, evq);
	1929	}
	1930	TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
	1931	FREE(evq, M_TEMP);
	1932	}
	1933	}
	1934
	1935
	1936	/*
	1937	* called upon pipe close. deque and free all events for
	1938	* the pipe... pipe must be locked by caller
	1939	*/
	1940	void
	1941	evpipefree(struct pipe *cpipe)
	1942	{
	1943	struct eventqelt evq, next;
	1944	proc_t p;
	1945
	1946	for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
	1947	next = evq->ee_slist.tqe_next;
	1948	p = evq->ee_proc;
	1949
	1950	EVPROCDEQUE(p, evq);
	1951
	1952	TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
	1953	FREE(evq, M_TEMP);
	1954	}
	1955	}
	1956
	1957
	1958	/*
	1959	* enqueue this event if it's not already queued. wakeup
	1960	* the proc if we do queue this event to it...
	1961	* entered with proc lock held... we drop it before
	1962	* doing the wakeup and return in that state
	1963	*/
	1964	static void
	1965	evprocenque(struct eventqelt *evq)
	1966	{
	1967	proc_t p;
	1968
	1969	assert(evq);
	1970	p = evq->ee_proc;
	1971
	1972	KERNEL_DEBUG(DBG_MISC_ENQUEUE\|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
	1973
	1974	proc_lock(p);
	1975
	1976	if (evq->ee_flags & EV_QUEUED) {
	1977	proc_unlock(p);
	1978
	1979	KERNEL_DEBUG(DBG_MISC_ENQUEUE\|DBG_FUNC_END, 0,0,0,0,0);
	1980	return;
	1981	}
	1982	evq->ee_flags \|= EV_QUEUED;
	1983
	1984	TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
	1985
	1986	proc_unlock(p);
	1987
	1988	wakeup(&p->p_evlist);
	1989
	1990	KERNEL_DEBUG(DBG_MISC_ENQUEUE\|DBG_FUNC_END, 0,0,0,0,0);
	1991	}
	1992
	1993
	1994	/*
	1995	* pipe lock must be taken by the caller
	1996	*/
	1997	void
	1998	postpipeevent(struct pipe *pipep, int event)
	1999	{
	2000	int mask;
	2001	struct eventqelt *evq;
	2002
	2003	if (pipep == NULL)
	2004	return;
	2005	KERNEL_DEBUG(DBG_MISC_POST\|DBG_FUNC_START, event,0,0,1,0);
	2006
	2007	for (evq = pipep->pipe_evlist.tqh_first;
	2008	evq != NULL; evq = evq->ee_slist.tqe_next) {
	2009
	2010	if (evq->ee_eventmask == 0)
	2011	continue;
	2012	mask = 0;
	2013
	2014	switch (event & (EV_RWBYTES \| EV_RCLOSED \| EV_WCLOSED)) {
	2015
	2016	case EV_RWBYTES:
	2017	if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
	2018	mask \|= EV_RE;
	2019	evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
	2020	}
	2021	if ((evq->ee_eventmask & EV_WR) &&
	2022	(MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
	2023
	2024	if (pipep->pipe_state & PIPE_EOF) {
	2025	mask \|= EV_WR\|EV_RESET;
	2026	break;
	2027	}
	2028	mask \|= EV_WR;
	2029	evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
	2030	}
	2031	break;
	2032
	2033	case EV_WCLOSED:
	2034	case EV_RCLOSED:
	2035	if ((evq->ee_eventmask & EV_RE)) {
	2036	mask \|= EV_RE\|EV_RCLOSED;
	2037	}
	2038	if ((evq->ee_eventmask & EV_WR)) {
	2039	mask \|= EV_WR\|EV_WCLOSED;
	2040	}
	2041	break;
	2042
	2043	default:
	2044	return;
	2045	}
	2046	if (mask) {
	2047	/*
	2048	* disarm... postevents are nops until this event is 'read' via
	2049	* waitevent and then re-armed via modwatch
	2050	*/
	2051	evq->ee_eventmask = 0;
	2052
	2053	/*
	2054	* since events are disarmed until after the waitevent
	2055	* the ee_req.er_xxxx fields can't change once we've
	2056	* inserted this event into the proc queue...
	2057	* therefore, the waitevent will see a 'consistent'
	2058	* snapshot of the event, even though it won't hold
	2059	* the pipe lock, and we're updating the event outside
	2060	* of the proc lock, which it will hold
	2061	*/
	2062	evq->ee_req.er_eventbits \|= mask;
	2063
	2064	KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
	2065
	2066	evprocenque(evq);
	2067	}
	2068	}
	2069	KERNEL_DEBUG(DBG_MISC_POST\|DBG_FUNC_END, 0,0,0,1,0);
	2070	}
	2071
	#if SOCKETS
	/*
	 * given either a sockbuf or a socket run down the
	 * event list and queue ready events found...
	 * the socket must be locked by the caller
	 *
	 * When sb is non-NULL the socket is taken from sb->sb_so and the sp
	 * argument is ignored.  A NULL socket is ignored.  An event value not
	 * handled by the switch below ends the walk at the first armed entry.
	 */
	void
	postevent(struct socket *sp, struct sockbuf *sb, int event)
	{
		int mask;
		struct eventqelt *evq;
		struct tcpcb *tp;

		if (sb)
			sp = sb->sb_so;
		if (sp == NULL)
			return;

		KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

		for (evq = sp->so_evlist.tqh_first;
		     evq != NULL; evq = evq->ee_slist.tqe_next) {

			/* disarmed entries are skipped until re-armed */
			if (evq->ee_eventmask == 0)
				continue;
			mask = 0;

			/* ready for reading:
			     - byte cnt >= receive low water mark
			     - read-half of conn closed
			     - conn pending for listening sock
			     - socket error pending

			   ready for writing
			     - byte cnt avail >= send low water mark
			     - write half of conn closed
			     - socket error pending
			     - non-blocking conn completed successfully

			   exception pending
			     - out of band data
			     - sock at out of band mark
			*/

			switch (event & EV_DMASK) {

			case EV_OOB:
				if ((evq->ee_eventmask & EV_EX)) {
					if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
						mask |= EV_EX|EV_OOB;
				}
				break;

			case EV_RWBYTES|EV_OOB:
				if ((evq->ee_eventmask & EV_EX)) {
					if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
						mask |= EV_EX|EV_OOB;
				}
				/*
				 * fall into the next case
				 */
				/* FALLTHROUGH */
			case EV_RWBYTES:
				if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
					/* for AFP/OT purposes; may go away in future */
					if ((SOCK_DOM(sp) == PF_INET ||
					    SOCK_DOM(sp) == PF_INET6) &&
					    SOCK_PROTO(sp) == IPPROTO_TCP &&
					    (sp->so_error == ECONNREFUSED ||
					    sp->so_error == ECONNRESET)) {
						if (sp->so_pcb == NULL ||
						    sotoinpcb(sp)->inp_state ==
						    INPCB_STATE_DEAD ||
						    (tp = sototcpcb(sp)) == NULL ||
						    tp->t_state == TCPS_CLOSED) {
							mask |= EV_RE|EV_RESET;
							break;
						}
					}
					mask |= EV_RE;
					evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;

					if (sp->so_state & SS_CANTRCVMORE) {
						mask |= EV_FIN;
						break;
					}
				}
				if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
					/* for AFP/OT purposes; may go away in future */
					if ((SOCK_DOM(sp) == PF_INET ||
					    SOCK_DOM(sp) == PF_INET6) &&
					    SOCK_PROTO(sp) == IPPROTO_TCP &&
					    (sp->so_error == ECONNREFUSED ||
					    sp->so_error == ECONNRESET)) {
						if (sp->so_pcb == NULL ||
						    sotoinpcb(sp)->inp_state ==
						    INPCB_STATE_DEAD ||
						    (tp = sototcpcb(sp)) == NULL ||
						    tp->t_state == TCPS_CLOSED) {
							mask |= EV_WR|EV_RESET;
							break;
						}
					}
					mask |= EV_WR;
					evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
				}
				break;

			case EV_RCONN:
				if ((evq->ee_eventmask & EV_RE)) {
					mask |= EV_RE|EV_RCONN;
					evq->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
				}
				break;

			case EV_WCONN:
				if ((evq->ee_eventmask & EV_WR)) {
					mask |= EV_WR|EV_WCONN;
				}
				break;

			case EV_RCLOSED:
				if ((evq->ee_eventmask & EV_RE)) {
					mask |= EV_RE|EV_RCLOSED;
				}
				break;

			case EV_WCLOSED:
				if ((evq->ee_eventmask & EV_WR)) {
					mask |= EV_WR|EV_WCLOSED;
				}
				break;

			case EV_FIN:
				if (evq->ee_eventmask & EV_RE) {
					mask |= EV_RE|EV_FIN;
				}
				break;

			case EV_RESET:
			case EV_TIMEOUT:
				if (evq->ee_eventmask & EV_RE) {
					mask |= EV_RE | event;
				}
				if (evq->ee_eventmask & EV_WR) {
					mask |= EV_WR | event;
				}
				break;

			default:
				KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
				return;
			} /* switch */

			KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);

			if (mask) {
				/*
				 * disarm... postevents are nops until this event is 'read' via
				 * waitevent and then re-armed via modwatch
				 */
				evq->ee_eventmask = 0;

				/*
				 * since events are disarmed until after the waitevent
				 * the ee_req.er_xxxx fields can't change once we've
				 * inserted this event into the proc queue...
				 * since waitevent can't see this event until we
				 * enqueue it, waitevent will see a 'consistent'
				 * snapshot of the event, even though it won't hold
				 * the socket lock, and we're updating the event outside
				 * of the proc lock, which it will hold
				 */
				evq->ee_req.er_eventbits |= mask;

				evprocenque(evq);
			}
		}
		KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
	}
	#endif /* SOCKETS */
	2252
	2253
	2254	/*
	2255	* watchevent system call. user passes us an event to watch
	2256	* for. we malloc an event object, initialize it, and queue
	2257	* it to the open socket. when the event occurs, postevent()
	2258	* will enque it back to our proc where we can retrieve it
	2259	* via waitevent().
	2260	*
	2261	* should this prevent duplicate events on same socket?
	2262	*
	2263	* Returns:
	2264	* ENOMEM No memory for operation
	2265	* copyin:EFAULT
	2266	*/
	2267	int
	2268	watchevent(proc_t p, struct watchevent_args uap, __unused int retval)
	2269	{
	2270	struct eventqelt evq = (struct eventqelt )0;
	2271	struct eventqelt *np = NULL;
	2272	struct eventreq64 *erp;
	2273	struct fileproc *fp = NULL;
	2274	int error;
	2275
	2276	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_START, 0,0,0,0,0);
	2277
	2278	// get a qelt and fill with users req
	2279	MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
	2280
	2281	if (evq == NULL)
	2282	return (ENOMEM);
	2283	erp = &evq->ee_req;
	2284
	2285	// get users request pkt
	2286
	2287	if (IS_64BIT_PROCESS(p)) {
	2288	error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
	2289	} else {
	2290	struct eventreq32 er32;
	2291
	2292	error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
	2293	if (error == 0) {
	2294	/*
	2295	* the user only passes in the
	2296	* er_type, er_handle and er_data...
	2297	* the other fields are initialized
	2298	* below, so don't bother to copy
	2299	*/
	2300	erp->er_type = er32.er_type;
	2301	erp->er_handle = er32.er_handle;
	2302	erp->er_data = (user_addr_t)er32.er_data;
	2303	}
	2304	}
	2305	if (error) {
	2306	FREE(evq, M_TEMP);
	2307	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_END, error,0,0,0,0);
	2308
	2309	return(error);
	2310	}
	2311	KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
	2312
	2313	// validate, freeing qelt if errors
	2314	error = 0;
	2315	proc_fdlock(p);
	2316
	2317	if (erp->er_type != EV_FD) {
	2318	error = EINVAL;
	2319	} else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
	2320	error = EBADF;
	2321	#if SOCKETS
	2322	} else if (fp->f_type == DTYPE_SOCKET) {
	2323	socket_lock((struct socket *)fp->f_data, 1);
	2324	np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	2325	#endif /* SOCKETS */
	2326	} else if (fp->f_type == DTYPE_PIPE) {
	2327	PIPE_LOCK((struct pipe *)fp->f_data);
	2328	np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	2329	} else {
	2330	fp_drop(p, erp->er_handle, fp, 1);
	2331	error = EINVAL;
	2332	}
	2333	proc_fdunlock(p);
	2334
	2335	if (error) {
	2336	FREE(evq, M_TEMP);
	2337
	2338	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_END, error,0,0,0,0);
	2339	return(error);
	2340	}
	2341
	2342	/*
	2343	* only allow one watch per file per proc
	2344	*/
	2345	for ( ; np != NULL; np = np->ee_slist.tqe_next) {
	2346	if (np->ee_proc == p) {
	2347	#if SOCKETS
	2348	if (fp->f_type == DTYPE_SOCKET)
	2349	socket_unlock((struct socket *)fp->f_data, 1);
	2350	else
	2351	#endif /* SOCKETS */
	2352	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2353	fp_drop(p, erp->er_handle, fp, 0);
	2354	FREE(evq, M_TEMP);
	2355
	2356	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_END, EINVAL,0,0,0,0);
	2357	return(EINVAL);
	2358	}
	2359	}
	2360	erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
	2361	evq->ee_proc = p;
	2362	evq->ee_eventmask = uap->u_eventmask & EV_MASK;
	2363	evq->ee_flags = 0;
	2364
	2365	#if SOCKETS
	2366	if (fp->f_type == DTYPE_SOCKET) {
	2367	TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	2368	postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
	2369
	2370	socket_unlock((struct socket *)fp->f_data, 1);
	2371	} else
	2372	#endif /* SOCKETS */
	2373	{
	2374	TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	2375	postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
	2376
	2377	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2378	}
	2379	fp_drop_event(p, erp->er_handle, fp);
	2380
	2381	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_END, 0,0,0,0,0);
	2382	return(0);
	2383	}
	2384
	2385
	2386
	2387	/*
	2388	* waitevent system call.
	2389	* grabs the next waiting event for this proc and returns
	2390	* it. if no events, user can request to sleep with timeout
	2391	* or without or poll mode
	2392	* ((tv != NULL && interval == 0) \|\| tv == -1)
	2393	*/
	2394	int
	2395	waitevent(proc_t p, struct waitevent_args uap, int retval)
	2396	{
	2397	int error = 0;
	2398	struct eventqelt *evq;
	2399	struct eventreq64 *erp;
	2400	uint64_t abstime, interval;
	2401	boolean_t fast_poll = FALSE;
	2402	union {
	2403	struct eventreq64 er64;
	2404	struct eventreq32 er32;
	2405	} uer;
	2406
	2407	interval = 0;
	2408
	2409	if (uap->tv) {
	2410	struct timeval atv;
	2411	/*
	2412	* check for fast poll method
	2413	*/
	2414	if (IS_64BIT_PROCESS(p)) {
	2415	if (uap->tv == (user_addr_t)-1)
	2416	fast_poll = TRUE;
	2417	} else if (uap->tv == (user_addr_t)((uint32_t)-1))
	2418	fast_poll = TRUE;
	2419
	2420	if (fast_poll == TRUE) {
	2421	if (p->p_evlist.tqh_first == NULL) {
	2422	KERNEL_DEBUG(DBG_MISC_WAIT\|DBG_FUNC_NONE, -1,0,0,0,0);
	2423	/*
	2424	* poll failed
	2425	*/
	2426	*retval = 1;
	2427	return (0);
	2428	}
	2429	proc_lock(p);
	2430	goto retry;
	2431	}
	2432	if (IS_64BIT_PROCESS(p)) {
	2433	struct user64_timeval atv64;
	2434	error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
	2435	/* Loses resolution - assume timeout < 68 years */
	2436	atv.tv_sec = atv64.tv_sec;
	2437	atv.tv_usec = atv64.tv_usec;
	2438	} else {
	2439	struct user32_timeval atv32;
	2440	error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
	2441	atv.tv_sec = atv32.tv_sec;
	2442	atv.tv_usec = atv32.tv_usec;
	2443	}
	2444
	2445	if (error)
	2446	return(error);
	2447	if (itimerfix(&atv)) {
	2448	error = EINVAL;
	2449	return(error);
	2450	}
	2451	interval = tvtoabstime(&atv);
	2452	}
	2453	KERNEL_DEBUG(DBG_MISC_WAIT\|DBG_FUNC_START, 0,0,0,0,0);
	2454
	2455	proc_lock(p);
	2456	retry:
	2457	if ((evq = p->p_evlist.tqh_first) != NULL) {
	2458	/*
	2459	* found one... make a local copy while it's still on the queue
	2460	* to prevent it from changing while in the midst of copying
	2461	* don't want to hold the proc lock across a copyout because
	2462	* it might block on a page fault at the target in user space
	2463	*/
	2464	erp = &evq->ee_req;
	2465
	2466	if (IS_64BIT_PROCESS(p))
	2467	bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
	2468	else {
	2469	uer.er32.er_type = erp->er_type;
	2470	uer.er32.er_handle = erp->er_handle;
	2471	uer.er32.er_data = (uint32_t)erp->er_data;
	2472	uer.er32.er_ecnt = erp->er_ecnt;
	2473	uer.er32.er_rcnt = erp->er_rcnt;
	2474	uer.er32.er_wcnt = erp->er_wcnt;
	2475	uer.er32.er_eventbits = erp->er_eventbits;
	2476	}
	2477	TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
	2478
	2479	evq->ee_flags &= ~EV_QUEUED;
	2480
	2481	proc_unlock(p);
	2482
	2483	if (IS_64BIT_PROCESS(p))
	2484	error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
	2485	else
	2486	error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
	2487
	2488	KERNEL_DEBUG(DBG_MISC_WAIT\|DBG_FUNC_END, error,
	2489	evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
	2490	return (error);
	2491	}
	2492	else {
	2493	if (uap->tv && interval == 0) {
	2494	proc_unlock(p);
	2495	*retval = 1; // poll failed
	2496
	2497	KERNEL_DEBUG(DBG_MISC_WAIT\|DBG_FUNC_END, error,0,0,0,0);
	2498	return (error);
	2499	}
	2500	if (interval != 0)
	2501	clock_absolutetime_interval_to_deadline(interval, &abstime);
	2502	else
	2503	abstime = 0;
	2504
	2505	KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
	2506
	2507	error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK \| PCATCH), "waitevent", abstime);
	2508
	2509	KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
	2510
	2511	if (error == 0)
	2512	goto retry;
	2513	if (error == ERESTART)
	2514	error = EINTR;
	2515	if (error == EWOULDBLOCK) {
	2516	*retval = 1;
	2517	error = 0;
	2518	}
	2519	}
	2520	proc_unlock(p);
	2521
	2522	KERNEL_DEBUG(DBG_MISC_WAIT\|DBG_FUNC_END, 0,0,0,0,0);
	2523	return (error);
	2524	}
	2525
	2526
	2527	/*
	2528	* modwatch system call. user passes in event to modify.
	2529	* if we find it we reset the event bits and que/deque event
	2530	* it needed.
	2531	*/
	2532	int
	2533	modwatch(proc_t p, struct modwatch_args uap, __unused int retval)
	2534	{
	2535	struct eventreq64 er;
	2536	struct eventreq64 *erp = &er;
	2537	struct eventqelt evq = NULL; / protected by error return */
	2538	int error;
	2539	struct fileproc *fp;
	2540	int flag;
	2541
	2542	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_START, 0,0,0,0,0);
	2543
	2544	/*
	2545	* get user's request pkt
	2546	* just need the er_type and er_handle which sit above the
	2547	* problematic er_data (32/64 issue)... so only copy in
	2548	* those 2 fields
	2549	*/
	2550	if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
	2551	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_END, error,0,0,0,0);
	2552	return(error);
	2553	}
	2554	proc_fdlock(p);
	2555
	2556	if (erp->er_type != EV_FD) {
	2557	error = EINVAL;
	2558	} else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
	2559	error = EBADF;
	2560	#if SOCKETS
	2561	} else if (fp->f_type == DTYPE_SOCKET) {
	2562	socket_lock((struct socket *)fp->f_data, 1);
	2563	evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	2564	#endif /* SOCKETS */
	2565	} else if (fp->f_type == DTYPE_PIPE) {
	2566	PIPE_LOCK((struct pipe *)fp->f_data);
	2567	evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	2568	} else {
	2569	fp_drop(p, erp->er_handle, fp, 1);
	2570	error = EINVAL;
	2571	}
	2572
	2573	if (error) {
	2574	proc_fdunlock(p);
	2575	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_END, error,0,0,0,0);
	2576	return(error);
	2577	}
	2578
	2579	if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
	2580	fp->f_flags &= ~FP_WAITEVENT;
	2581	}
	2582	proc_fdunlock(p);
	2583
	2584	// locate event if possible
	2585	for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
	2586	if (evq->ee_proc == p)
	2587	break;
	2588	}
	2589	if (evq == NULL) {
	2590	#if SOCKETS
	2591	if (fp->f_type == DTYPE_SOCKET)
	2592	socket_unlock((struct socket *)fp->f_data, 1);
	2593	else
	2594	#endif /* SOCKETS */
	2595	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2596	fp_drop(p, erp->er_handle, fp, 0);
	2597	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_END, EINVAL,0,0,0,0);
	2598	return(EINVAL);
	2599	}
	2600	KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
	2601
	2602	if (uap->u_eventmask == EV_RM) {
	2603	EVPROCDEQUE(p, evq);
	2604
	2605	#if SOCKETS
	2606	if (fp->f_type == DTYPE_SOCKET) {
	2607	TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	2608	socket_unlock((struct socket *)fp->f_data, 1);
	2609	} else
	2610	#endif /* SOCKETS */
	2611	{
	2612	TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	2613	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2614	}
	2615	fp_drop(p, erp->er_handle, fp, 0);
	2616	FREE(evq, M_TEMP);
	2617	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_END, 0,0,0,0,0);
	2618	return(0);
	2619	}
	2620	switch (uap->u_eventmask & EV_MASK) {
	2621
	2622	case 0:
	2623	flag = 0;
	2624	break;
	2625
	2626	case EV_RE:
	2627	case EV_WR:
	2628	case EV_RE\|EV_WR:
	2629	flag = EV_RWBYTES;
	2630	break;
	2631
	2632	case EV_EX:
	2633	flag = EV_OOB;
	2634	break;
	2635
	2636	case EV_EX\|EV_RE:
	2637	case EV_EX\|EV_WR:
	2638	case EV_EX\|EV_RE\|EV_WR:
	2639	flag = EV_OOB\|EV_RWBYTES;
	2640	break;
	2641
	2642	default:
	2643	#if SOCKETS
	2644	if (fp->f_type == DTYPE_SOCKET)
	2645	socket_unlock((struct socket *)fp->f_data, 1);
	2646	else
	2647	#endif /* SOCKETS */
	2648	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2649	fp_drop(p, erp->er_handle, fp, 0);
	2650	KERNEL_DEBUG(DBG_MISC_WATCH\|DBG_FUNC_END, EINVAL,0,0,0,0);
	2651	return(EINVAL);
	2652	}
	2653	/*
	2654	* since we're holding the socket/pipe lock, the event
	2655	* cannot go from the unqueued state to the queued state
	2656	* however, it can go from the queued state to the unqueued state
	2657	* since that direction is protected by the proc_lock...
	2658	* so do a quick check for EV_QUEUED w/o holding the proc lock
	2659	* since by far the common case will be NOT EV_QUEUED, this saves
	2660	* us taking the proc_lock the majority of the time
	2661	*/
	2662	if (evq->ee_flags & EV_QUEUED) {
	2663	/*
	2664	* EVPROCDEQUE will recheck the state after it grabs the proc_lock
	2665	*/
	2666	EVPROCDEQUE(p, evq);
	2667	}
	2668	/*
	2669	* while the event is off the proc queue and
	2670	* we're holding the socket/pipe lock
	2671	* it's safe to update these fields...
	2672	*/
	2673	evq->ee_req.er_eventbits = 0;
	2674	evq->ee_eventmask = uap->u_eventmask & EV_MASK;
	2675
	2676	#if SOCKETS
	2677	if (fp->f_type == DTYPE_SOCKET) {
	2678	postevent((struct socket *)fp->f_data, 0, flag);
	2679	socket_unlock((struct socket *)fp->f_data, 1);
	2680	} else
	2681	#endif /* SOCKETS */
	2682	{
	2683	postpipeevent((struct pipe *)fp->f_data, flag);
	2684	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2685	}
	2686	fp_drop(p, erp->er_handle, fp, 0);
	2687	KERNEL_DEBUG(DBG_MISC_MOD\|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
	2688	return(0);
	2689	}
	2690
	2691	/* this routine is called from the close of fd with proc_fdlock held */
	2692	int
	2693	waitevent_close(struct proc p, struct fileproc fp)
	2694	{
	2695	struct eventqelt *evq;
	2696
	2697
	2698	fp->f_flags &= ~FP_WAITEVENT;
	2699
	2700	#if SOCKETS
	2701	if (fp->f_type == DTYPE_SOCKET) {
	2702	socket_lock((struct socket *)fp->f_data, 1);
	2703	evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	2704	} else
	2705	#endif /* SOCKETS */
	2706	if (fp->f_type == DTYPE_PIPE) {
	2707	PIPE_LOCK((struct pipe *)fp->f_data);
	2708	evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	2709	}
	2710	else {
	2711	return(EINVAL);
	2712	}
	2713	proc_fdunlock(p);
	2714
	2715
	2716	// locate event if possible
	2717	for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
	2718	if (evq->ee_proc == p)
	2719	break;
	2720	}
	2721	if (evq == NULL) {
	2722	#if SOCKETS
	2723	if (fp->f_type == DTYPE_SOCKET)
	2724	socket_unlock((struct socket *)fp->f_data, 1);
	2725	else
	2726	#endif /* SOCKETS */
	2727	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2728
	2729	proc_fdlock(p);
	2730
	2731	return(EINVAL);
	2732	}
	2733	EVPROCDEQUE(p, evq);
	2734
	2735	#if SOCKETS
	2736	if (fp->f_type == DTYPE_SOCKET) {
	2737	TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
	2738	socket_unlock((struct socket *)fp->f_data, 1);
	2739	} else
	2740	#endif /* SOCKETS */
	2741	{
	2742	TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
	2743	PIPE_UNLOCK((struct pipe *)fp->f_data);
	2744	}
	2745	FREE(evq, M_TEMP);
	2746
	2747	proc_fdlock(p);
	2748
	2749	return(0);
	2750	}
	2751
	2752
	2753	/*
	2754	* gethostuuid
	2755	*
	2756	* Description: Get the host UUID from IOKit and return it to user space.
	2757	*
	2758	* Parameters: uuid_buf Pointer to buffer to receive UUID
	2759	* timeout Timespec for timout
	2760	* spi SPI, skip sandbox check (temporary)
	2761	*
	2762	* Returns: 0 Success
	2763	* EWOULDBLOCK Timeout is too short
	2764	* copyout:EFAULT Bad user buffer
	2765	*
	2766	* Notes: A timeout seems redundant, since if it's tolerable to not
	2767	* have a system UUID in hand, then why ask for one?
	2768	*/
	2769	int
	2770	gethostuuid(struct proc p, struct gethostuuid_args uap, __unused int32_t *retval)
	2771	{
	2772	kern_return_t kret;
	2773	int error;
	2774	mach_timespec_t mach_ts; /* for IOKit call */
	2775	__darwin_uuid_t uuid_kern; /* for IOKit call */
	2776
	2777	if (!uap->spi) {
	2778	#if 13841988
	2779	uint32_t flags;
	2780	if (temp_debug_13841988 && (0 == proc_get_darwinbgstate(p->task, &flags)) && (flags & PROC_FLAG_IOS_APPLICATION)) {
	2781	printf("Unauthorized access to gethostuuid() by %s(%d)\n", p->p_comm, proc_pid(p));
	2782	return (EPERM);
	2783	}
	2784	#else
	2785	/* Perform sandbox check */
	2786	#endif
	2787	}
	2788
	2789	/* Convert the 32/64 bit timespec into a mach_timespec_t */
	2790	if ( proc_is64bit(p) ) {
	2791	struct user64_timespec ts;
	2792	error = copyin(uap->timeoutp, &ts, sizeof(ts));
	2793	if (error)
	2794	return (error);
	2795	mach_ts.tv_sec = ts.tv_sec;
	2796	mach_ts.tv_nsec = ts.tv_nsec;
	2797	} else {
	2798	struct user32_timespec ts;
	2799	error = copyin(uap->timeoutp, &ts, sizeof(ts) );
	2800	if (error)
	2801	return (error);
	2802	mach_ts.tv_sec = ts.tv_sec;
	2803	mach_ts.tv_nsec = ts.tv_nsec;
	2804	}
	2805
	2806	/* Call IOKit with the stack buffer to get the UUID */
	2807	kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
	2808
	2809	/*
	2810	* If we get it, copy out the data to the user buffer; note that a
	2811	* uuid_t is an array of characters, so this is size invariant for
	2812	* 32 vs. 64 bit.
	2813	*/
	2814	if (kret == KERN_SUCCESS) {
	2815	error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
	2816	} else {
	2817	error = EWOULDBLOCK;
	2818	}
	2819
	2820	return (error);
	2821	}
	2822
	2823	/*
	2824	* ledger
	2825	*
	2826	* Description: Omnibus system call for ledger operations
	2827	*/
	2828	int
	2829	ledger(struct proc p, struct ledger_args args, __unused int32_t *retval)
	2830	{
	2831	#if !CONFIG_MACF
	2832	#pragma unused(p)
	2833	#endif
	2834	int rval, pid, len, error;
	2835	#ifdef LEDGER_DEBUG
	2836	struct ledger_limit_args lla;
	2837	#endif
	2838	task_t task;
	2839	proc_t proc;
	2840
	2841	/* Finish copying in the necessary args before taking the proc lock */
	2842	error = 0;
	2843	len = 0;
	2844	if (args->cmd == LEDGER_ENTRY_INFO)
	2845	error = copyin(args->arg3, (char *)&len, sizeof (len));
	2846	else if (args->cmd == LEDGER_TEMPLATE_INFO)
	2847	error = copyin(args->arg2, (char *)&len, sizeof (len));
	2848	#ifdef LEDGER_DEBUG
	2849	else if (args->cmd == LEDGER_LIMIT)
	2850	error = copyin(args->arg2, (char *)&lla, sizeof (lla));
	2851	#endif
	2852	if (error)
	2853	return (error);
	2854	if (len < 0)
	2855	return (EINVAL);
	2856
	2857	rval = 0;
	2858	if (args->cmd != LEDGER_TEMPLATE_INFO) {
	2859	pid = args->arg1;
	2860	proc = proc_find(pid);
	2861	if (proc == NULL)
	2862	return (ESRCH);
	2863
	2864	#if CONFIG_MACF
	2865	error = mac_proc_check_ledger(p, proc, args->cmd);
	2866	if (error) {
	2867	proc_rele(proc);
	2868	return (error);
	2869	}
	2870	#endif
	2871
	2872	task = proc->task;
	2873	}
	2874
	2875	switch (args->cmd) {
	2876	#ifdef LEDGER_DEBUG
	2877	case LEDGER_LIMIT: {
	2878	if (!kauth_cred_issuser(kauth_cred_get()))
	2879	rval = EPERM;
	2880	rval = ledger_limit(task, &lla);
	2881	proc_rele(proc);
	2882	break;
	2883	}
	2884	#endif
	2885	case LEDGER_INFO: {
	2886	struct ledger_info info;
	2887
	2888	rval = ledger_info(task, &info);
	2889	proc_rele(proc);
	2890	if (rval == 0)
	2891	rval = copyout(&info, args->arg2,
	2892	sizeof (info));
	2893	break;
	2894	}
	2895
	2896	case LEDGER_ENTRY_INFO: {
	2897	void *buf;
	2898	int sz;
	2899
	2900	rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
	2901	proc_rele(proc);
	2902	if ((rval == 0) && (len > 0)) {
	2903	sz = len * sizeof (struct ledger_entry_info);
	2904	rval = copyout(buf, args->arg2, sz);
	2905	kfree(buf, sz);
	2906	}
	2907	if (rval == 0)
	2908	rval = copyout(&len, args->arg3, sizeof (len));
	2909	break;
	2910	}
	2911
	2912	case LEDGER_TEMPLATE_INFO: {
	2913	void *buf;
	2914	int sz;
	2915
	2916	rval = ledger_template_info(&buf, &len);
	2917	if ((rval == 0) && (len > 0)) {
	2918	sz = len * sizeof (struct ledger_template_info);
	2919	rval = copyout(buf, args->arg1, sz);
	2920	kfree(buf, sz);
	2921	}
	2922	if (rval == 0)
	2923	rval = copyout(&len, args->arg2, sizeof (len));
	2924	break;
	2925	}
	2926
	2927	default:
	2928	rval = EINVAL;
	2929	}
	2930
	2931	return (rval);
	2932	}
	2933
	#if CONFIG_TELEMETRY
	/*
	 * telemetry
	 *
	 * Description:	System call entry point for telemetry commands.
	 *
	 * Parameters:	p		calling process (unused)
	 *		args->cmd	command selector; only
	 *				TELEMETRY_CMD_TIMER_EVENT is handled
	 *		args->deadline, args->interval, args->leeway
	 *				passed through to telemetry_timer_event()
	 *		retval		unused
	 *
	 * Returns:	EINVAL		Unknown command
	 *		Otherwise the result of telemetry_timer_event().
	 */
	int
	telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
	{
		int error = 0;

		switch (args->cmd) {
		case TELEMETRY_CMD_TIMER_EVENT:
			error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
			break;
		default:
			error = EINVAL;
			break;
		}

		return (error);
	}
	#endif /* CONFIG_TELEMETRY */