[apple/xnu.git] / bsd / kern / sys_generic.c (xnu-1228.5.20)
1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68 /*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/ioctl.h>
79 #include <sys/file_internal.h>
80 #include <sys/proc_internal.h>
81 #include <sys/socketvar.h>
82 #include <sys/uio_internal.h>
83 #include <sys/kernel.h>
84 #include <sys/stat.h>
85 #include <sys/malloc.h>
86 #include <sys/sysproto.h>
87
88 #include <sys/mount_internal.h>
89 #include <sys/protosw.h>
90 #include <sys/ev.h>
91 #include <sys/user.h>
92 #include <sys/kdebug.h>
93 #include <sys/poll.h>
94 #include <sys/event.h>
95 #include <sys/eventvar.h>
96
97 #include <mach/mach_types.h>
98 #include <kern/kern_types.h>
99 #include <kern/assert.h>
100 #include <kern/kalloc.h>
101 #include <kern/thread.h>
102 #include <kern/clock.h>
103
104 #include <sys/mbuf.h>
105 #include <sys/socket.h>
106 #include <sys/socketvar.h>
107 #include <sys/errno.h>
108 #include <sys/syscall.h>
109 #include <sys/pipe.h>
110
111 #include <bsm/audit_kernel.h>
112
113 #include <net/if.h>
114 #include <net/route.h>
115
116 #include <netinet/in.h>
117 #include <netinet/in_systm.h>
118 #include <netinet/ip.h>
119 #include <netinet/in_pcb.h>
120 #include <netinet/ip_var.h>
121 #include <netinet/ip6.h>
122 #include <netinet/tcp.h>
123 #include <netinet/tcp_fsm.h>
124 #include <netinet/tcp_seq.h>
125 #include <netinet/tcp_timer.h>
126 #include <netinet/tcp_var.h>
127 #include <netinet/tcpip.h>
128 #include <netinet/tcp_debug.h>
129 /* for wait queue based select */
130 #include <kern/wait_queue.h>
131 #include <kern/kalloc.h>
132 #include <sys/vnode_internal.h>
133
134 /* XXX should be in a header file somewhere */
135 void evsofree(struct socket *);
136 void evpipefree(struct pipe *);
137 void postpipeevent(struct pipe *, int);
138 void postevent(struct socket *, struct sockbuf *, int);
139 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
140
141 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
142 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
143 extern void *get_bsduthreadarg(thread_t);
144 extern int *get_bsduthreadrval(thread_t);
145
146 __private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
147 user_addr_t bufp, user_size_t nbyte,
148 off_t offset, int flags, user_ssize_t *retval);
149 __private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
150 user_addr_t bufp, user_size_t nbyte,
151 off_t offset, int flags, user_ssize_t *retval);
152 __private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
153 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
154
155 #if NETAT
156 extern int appletalk_inited;
157 #endif /* NETAT */
158
159 #define f_flag f_fglob->fg_flag
160 #define f_type f_fglob->fg_type
161 #define f_msgcount f_fglob->fg_msgcount
162 #define f_cred f_fglob->fg_cred
163 #define f_ops f_fglob->fg_ops
164 #define f_offset f_fglob->fg_offset
165 #define f_data f_fglob->fg_data
166
167 /*
168 * Read system call.
169 *
170 * Returns: 0 Success
171 * preparefileread:EBADF
172 * preparefileread:ESPIPE
173 * preparefileread:ENXIO
174 * preparefileread:EBADF
175 * dofileread:???
176 */
177 int
178 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
179 {
180 __pthread_testcancel(1);
181 return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
182 }
183
184 int
185 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
186 {
187 struct fileproc *fp;
188 int error;
189 int fd = uap->fd;
190
191 if ( (error = preparefileread(p, &fp, fd, 0)) )
192 return (error);
193
194 error = dofileread(vfs_context_current(), fp, uap->cbuf, uap->nbyte,
195 (off_t)-1, 0, retval);
196
197 donefileread(p, fp, fd);
198
199 return (error);
200 }
201
202 /*
203 * Pread system call
204 *
205 * Returns: 0 Success
206 * preparefileread:EBADF
207 * preparefileread:ESPIPE
208 * preparefileread:ENXIO
209 * preparefileread:EBADF
210 * dofileread:???
211 */
212 int
213 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
214 {
215 __pthread_testcancel(1);
216 return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
217 }
218
219 int
220 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
221 {
222 struct fileproc *fp = NULL; /* fp set by preparefileread() */
223 int fd = uap->fd;
224 int error;
225
226 if ( (error = preparefileread(p, &fp, fd, 1)) )
227 goto out;
228
229 error = dofileread(vfs_context_current(), fp, uap->buf, uap->nbyte,
230 uap->offset, FOF_OFFSET, retval);
231
232 donefileread(p, fp, fd);
233
234 if (!error)
235 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
236 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
237
238 out:
239 return (error);
240 }
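/*
 * pread(2) reads at an explicit offset (FOF_OFFSET above) without
 * moving the descriptor's current file offset.  A minimal user-space
 * sketch (the buffer name and offset are assumptions for
 * illustration):
 *
 *	char hdr[64];
 *	ssize_t n = pread(fd, hdr, sizeof(hdr), (off_t)0);
 */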
241
242 /*
243 * Code common for read and pread
244 */
245
246 void
247 donefileread(struct proc *p, struct fileproc *fp, int fd)
248 {
249 proc_fdlock_spin(p);
250
251 fp->f_flags &= ~FP_INCHRREAD;
252
253 fp_drop(p, fd, fp, 1);
254 proc_fdunlock(p);
255 }
256
257 /*
258 * Returns: 0 Success
259 * EBADF
260 * ESPIPE
261 * ENXIO
262 * fp_lookup:EBADF
263 * fo_read:???
264 */
265 int
266 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
267 {
268 vnode_t vp;
269 int error;
270 struct fileproc *fp;
271
272 proc_fdlock_spin(p);
273
274 error = fp_lookup(p, fd, &fp, 1);
275
276 if (error) {
277 proc_fdunlock(p);
278 return (error);
279 }
280 if ((fp->f_flag & FREAD) == 0) {
281 error = EBADF;
282 goto out;
283 }
284 if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
285 error = ESPIPE;
286 goto out;
287 }
288 if (fp->f_type == DTYPE_VNODE) {
289 vp = (struct vnode *)fp->f_fglob->fg_data;
290
291 if (check_for_pread && (vnode_isfifo(vp))) {
292 error = ESPIPE;
293 goto out;
294 }
295 if (check_for_pread && (vp->v_flag & VISTTY)) {
296 error = ENXIO;
297 goto out;
298 }
299 if (vp->v_type == VCHR)
300 fp->f_flags |= FP_INCHRREAD;
301 }
302
303 *fp_ret = fp;
304
305 proc_fdunlock(p);
306 return (0);
307
308 out:
309 fp_drop(p, fd, fp, 1);
310 proc_fdunlock(p);
311 return (error);
312 }
313
314
315 /*
316 * Returns: 0 Success
317 * EINVAL
318 * fo_read:???
319 */
320 __private_extern__ int
321 dofileread(vfs_context_t ctx, struct fileproc *fp,
322 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
323 user_ssize_t *retval)
324 {
325 uio_t auio;
326 user_ssize_t bytecnt;
327 long error = 0;
328 char uio_buf[ UIO_SIZEOF(1) ];
329
330 // LP64todo - do we want to raise this?
331 if (nbyte > INT_MAX)
332 return (EINVAL);
333
334 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
335 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
336 &uio_buf[0], sizeof(uio_buf));
337 } else {
338 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
339 &uio_buf[0], sizeof(uio_buf));
340 }
341 uio_addiov(auio, bufp, nbyte);
342
343 bytecnt = nbyte;
344
345 if ((error = fo_read(fp, auio, flags, ctx))) {
346 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
347 error == EINTR || error == EWOULDBLOCK))
348 error = 0;
349 }
350 bytecnt -= uio_resid(auio);
351
352 *retval = bytecnt;
353
354 return (error);
355 }
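/*
 * The routine above builds a single-iovec uio in a caller-supplied
 * stack buffer, avoiding a heap allocation on the common path.  A
 * minimal sketch of the same idiom (illustrative only; the address
 * space and direction depend on the caller):
 *
 *	char uio_buf[ UIO_SIZEOF(1) ];
 *	uio_t auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32,
 *	    UIO_READ, &uio_buf[0], sizeof(uio_buf));
 *	uio_addiov(auio, bufp, nbyte);
 *	... fo_read(fp, auio, flags, ctx) then consumes the uio ...
 */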
356
357 /*
358 * Scatter read system call.
359 *
360 * Returns: 0 Success
361 * EINVAL
362 * ENOMEM
363 * copyin:EFAULT
364 * rd_uio:???
365 */
366 int
367 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
368 {
369 __pthread_testcancel(1);
370 return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
371 }
372
373 int
374 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
375 {
376 uio_t auio = NULL;
377 int error;
378 int size_of_iovec;
379 struct user_iovec *iovp;
380
381         /* Verify range before calling uio_create() */
382 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
383 return (EINVAL);
384
385 /* allocate a uio large enough to hold the number of iovecs passed */
386 auio = uio_create(uap->iovcnt, 0,
387 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
388 UIO_READ);
389
390 /* get location of iovecs within the uio. then copyin the iovecs from
391 * user space.
392 */
393 iovp = uio_iovsaddr(auio);
394 if (iovp == NULL) {
395 error = ENOMEM;
396 goto ExitThisRoutine;
397 }
398 size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
399 error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
400 if (error) {
401 goto ExitThisRoutine;
402 }
403
404 /* finalize uio_t for use and do the IO
405 */
406 uio_calculateresid(auio);
407 error = rd_uio(p, uap->fd, auio, retval);
408
409 ExitThisRoutine:
410 if (auio != NULL) {
411 uio_free(auio);
412 }
413 return (error);
414 }
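/*
 * User-space view of the scatter read above (a sketch; the iovec
 * layout is an assumption for illustration):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = body, .iov_len = sizeof(body) },
 *	};
 *	ssize_t n = readv(fd, iov, 2);
 */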
415
416 /*
417 * Write system call
418 *
419 * Returns: 0 Success
420 * EBADF
421 * fp_lookup:EBADF
422 * dofilewrite:???
423 */
424 int
425 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
426 {
427 __pthread_testcancel(1);
428 return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
429
430 }
431
432 int
433 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
434 {
435 struct fileproc *fp;
436 int error;
437 int fd = uap->fd;
438
439 error = fp_lookup(p,fd,&fp,0);
440 if (error)
441 return(error);
442 if ((fp->f_flag & FWRITE) == 0) {
443 error = EBADF;
444 } else {
445 struct vfs_context context = *(vfs_context_current());
446 context.vc_ucred = fp->f_fglob->fg_cred;
447
448 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
449 (off_t)-1, 0, retval);
450 }
451 if (error == 0)
452 fp_drop_written(p, fd, fp);
453 else
454 fp_drop(p, fd, fp, 0);
455 return(error);
456 }
457
458 /*
459 * pwrite system call
460 *
461 * Returns: 0 Success
462 * EBADF
463 * ESPIPE
464 * ENXIO
465 * EINVAL
466 * fp_lookup:EBADF
467 * dofilewrite:???
468 */
469 int
470 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
471 {
472 __pthread_testcancel(1);
473 return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
474 }
475
476 int
477 pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
478 {
479 struct fileproc *fp;
480 int error;
481 int fd = uap->fd;
482 vnode_t vp = (vnode_t)0;
483
484 error = fp_lookup(p,fd,&fp,0);
485 if (error)
486 return(error);
487
488 if ((fp->f_flag & FWRITE) == 0) {
489 error = EBADF;
490 } else {
491 struct vfs_context context = *vfs_context_current();
492 context.vc_ucred = fp->f_fglob->fg_cred;
493
494 if (fp->f_type != DTYPE_VNODE) {
495 error = ESPIPE;
496 goto errout;
497 }
498 vp = (vnode_t)fp->f_fglob->fg_data;
499 if (vnode_isfifo(vp)) {
500 error = ESPIPE;
501 goto errout;
502 }
503 if ((vp->v_flag & VISTTY)) {
504 error = ENXIO;
505 goto errout;
506 }
507 if (uap->offset == (off_t)-1) {
508 error = EINVAL;
509 goto errout;
510 }
511
512 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
513 uap->offset, FOF_OFFSET, retval);
514 }
515 errout:
516 if (error == 0)
517 fp_drop_written(p, fd, fp);
518 else
519 fp_drop(p, fd, fp, 0);
520
521 if (!error)
522 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
523 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
524
525 return(error);
526 }
527
528 /*
529 * Returns: 0 Success
530 * EINVAL
531 * <fo_write>:EPIPE
532 * <fo_write>:??? [indirect through struct fileops]
533 */
534 __private_extern__ int
535 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
536 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
537 user_ssize_t *retval)
538 {
539 uio_t auio;
540 long error = 0;
541 user_ssize_t bytecnt;
542 char uio_buf[ UIO_SIZEOF(1) ];
543
544 // LP64todo - do we want to raise this?
545 if (nbyte > INT_MAX)
546 return (EINVAL);
547
548 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
549 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
550 &uio_buf[0], sizeof(uio_buf));
551 } else {
552 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
553 &uio_buf[0], sizeof(uio_buf));
554 }
555 uio_addiov(auio, bufp, nbyte);
556
557 bytecnt = nbyte;
558 if ((error = fo_write(fp, auio, flags, ctx))) {
559 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
560 error == EINTR || error == EWOULDBLOCK))
561 error = 0;
562 /* The socket layer handles SIGPIPE */
563 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
564 /* XXX Raise the signal on the thread? */
565 psignal(vfs_context_proc(ctx), SIGPIPE);
566 }
567 }
568 bytecnt -= uio_resid(auio);
569 *retval = bytecnt;
570
571 return (error);
572 }
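/*
 * Note the EPIPE handling above: for non-sockets the writing process
 * also gets SIGPIPE.  A user-space sketch of coping with that
 * (illustrative only; handle_closed_reader is a hypothetical helper):
 *
 *	signal(SIGPIPE, SIG_IGN);
 *	if (write(pfd[1], "x", 1) == -1 && errno == EPIPE)
 *		handle_closed_reader();
 */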
573
574 /*
575 * Gather write system call
576 */
577 int
578 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
579 {
580 __pthread_testcancel(1);
581 return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
582 }
583
584 int
585 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
586 {
587 uio_t auio = NULL;
588 int error;
589 int size_of_iovec;
590 struct user_iovec *iovp;
591
592         /* Verify range before calling uio_create() */
593 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
594 return (EINVAL);
595
596 /* allocate a uio large enough to hold the number of iovecs passed */
597 auio = uio_create(uap->iovcnt, 0,
598 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
599 UIO_WRITE);
600
601 /* get location of iovecs within the uio. then copyin the iovecs from
602 * user space.
603 */
604 iovp = uio_iovsaddr(auio);
605 if (iovp == NULL) {
606 error = ENOMEM;
607 goto ExitThisRoutine;
608 }
609 size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
610 error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
611 if (error) {
612 goto ExitThisRoutine;
613 }
614
615 /* finalize uio_t for use and do the IO
616 */
617 uio_calculateresid(auio);
618 error = wr_uio(p, uap->fd, auio, retval);
619
620 ExitThisRoutine:
621 if (auio != NULL) {
622 uio_free(auio);
623 }
624 return (error);
625 }
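/*
 * User-space view of the gather write above (a sketch; the iovec
 * contents are assumptions for illustration):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = (void *)"hello ", .iov_len = 6 },
 *		{ .iov_base = (void *)"world\n", .iov_len = 6 },
 *	};
 *	ssize_t n = writev(fd, iov, 2);
 */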
626
627
628 int
629 wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
630 {
631 struct fileproc *fp;
632 int error;
633 user_ssize_t count;
634 struct vfs_context context = *vfs_context_current();
635
636 error = fp_lookup(p,fdes,&fp,0);
637 if (error)
638 return(error);
639
640 if ((fp->f_flag & FWRITE) == 0) {
641 error = EBADF;
642 goto out;
643 }
644 count = uio_resid(uio);
645
646 context.vc_ucred = fp->f_cred;
647 error = fo_write(fp, uio, 0, &context);
648 if (error) {
649 if (uio_resid(uio) != count && (error == ERESTART ||
650 error == EINTR || error == EWOULDBLOCK))
651 error = 0;
652 /* The socket layer handles SIGPIPE */
653 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
654 psignal(p, SIGPIPE);
655 }
656 *retval = count - uio_resid(uio);
657
658 out:
659 	if (error == 0)
660 fp_drop_written(p, fdes, fp);
661 else
662 fp_drop(p, fdes, fp, 0);
663 return(error);
664 }
665
666
667 int
668 rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
669 {
670 struct fileproc *fp;
671 int error;
672 user_ssize_t count;
673 struct vfs_context context = *vfs_context_current();
674
675 if ( (error = preparefileread(p, &fp, fdes, 0)) )
676 return (error);
677
678 count = uio_resid(uio);
679
680 context.vc_ucred = fp->f_cred;
681
682 error = fo_read(fp, uio, 0, &context);
683
684 if (error) {
685 if (uio_resid(uio) != count && (error == ERESTART ||
686 error == EINTR || error == EWOULDBLOCK))
687 error = 0;
688 }
689 *retval = count - uio_resid(uio);
690
691 donefileread(p, fp, fdes);
692
693 return (error);
694 }
695
696 /*
697 * Ioctl system call
698 *
699 * Returns: 0 Success
700 * EBADF
701 * ENOTTY
702 * ENOMEM
703 * ESRCH
704 * copyin:EFAULT
705  *		copyout:EFAULT
706 * fp_lookup:EBADF Bad file descriptor
707 * fo_ioctl:???
708 */
709 int
710 ioctl(struct proc *p, struct ioctl_args *uap, __unused register_t *retval)
711 {
712 struct fileproc *fp;
713 u_long com;
714 int error = 0;
715 u_int size;
716 caddr_t datap, memp;
717 boolean_t is64bit;
718 int tmp;
719 #define STK_PARAMS 128
720 char stkbuf[STK_PARAMS];
721 int fd = uap->fd;
722 struct vfs_context context = *vfs_context_current();
723
724 AUDIT_ARG(fd, uap->fd);
725 AUDIT_ARG(cmd, CAST_DOWN(int, uap->com)); /* LP64todo: uap->com is a user-land long */
726 AUDIT_ARG(addr, uap->data);
727
728 is64bit = proc_is64bit(p);
729
730 proc_fdlock(p);
731 error = fp_lookup(p,fd,&fp,1);
732 if (error) {
733 proc_fdunlock(p);
734 return(error);
735 }
736
737 AUDIT_ARG(file, p, fp);
738
739 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
740 error = EBADF;
741 goto out;
742 }
743
744 context.vc_ucred = fp->f_fglob->fg_cred;
745
746 #if CONFIG_MACF
747 error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, uap->com);
748 if (error)
749 goto out;
750 #endif
751
752 #if NETAT
753 /*
754 * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
755 * while implementing an ATioctl system call
756 */
757 {
758 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
759 u_long fixed_command;
760
761 #ifdef APPLETALK_DEBUG
762 kprintf("ioctl: special AppleTalk \n");
763 #endif
764 datap = &stkbuf[0];
765 *(user_addr_t *)datap = uap->data;
766 fixed_command = _IOW(0, 0xff99, uap->data);
767 error = fo_ioctl(fp, fixed_command, datap, &context);
768 goto out;
769 }
770 }
771
772 #endif /* NETAT */
773
774
775 switch (com = uap->com) {
776 case FIONCLEX:
777 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
778 error =0;
779 goto out;
780 case FIOCLEX:
781 *fdflags(p, uap->fd) |= UF_EXCLOSE;
782 error =0;
783 goto out;
784 }
785
786 /*
787 * Interpret high order word to find amount of data to be
788 * copied to/from the user's address space.
789 */
790 size = IOCPARM_LEN(com);
791 if (size > IOCPARM_MAX) {
792 error = ENOTTY;
793 goto out;
794 }
795 memp = NULL;
796 if (size > sizeof (stkbuf)) {
797 proc_fdunlock(p);
798 if ((memp = (caddr_t)kalloc(size)) == 0) {
799 proc_fdlock(p);
800 error = ENOMEM;
801 goto out;
802 }
803 proc_fdlock(p);
804 datap = memp;
805 } else
806 datap = &stkbuf[0];
807 if (com&IOC_IN) {
808 if (size) {
809 proc_fdunlock(p);
810 error = copyin(uap->data, datap, size);
811 if (error) {
812 if (memp)
813 kfree(memp, size);
814 proc_fdlock(p);
815 goto out;
816 }
817 proc_fdlock(p);
818 } else {
819 			/* XXX - IOC_IN and no size? we should probably return an error here!! */
820 if (is64bit) {
821 *(user_addr_t *)datap = uap->data;
822 }
823 else {
824 *(uint32_t *)datap = (uint32_t)uap->data;
825 }
826 }
827 } else if ((com&IOC_OUT) && size)
828 /*
829 * Zero the buffer so the user always
830 * gets back something deterministic.
831 */
832 bzero(datap, size);
833 else if (com&IOC_VOID) {
834 /* XXX - this is odd since IOC_VOID means no parameters */
835 if (is64bit) {
836 *(user_addr_t *)datap = uap->data;
837 }
838 else {
839 *(uint32_t *)datap = (uint32_t)uap->data;
840 }
841 }
842
843 switch (com) {
844
845 case FIONBIO:
846 if ( (tmp = *(int *)datap) )
847 fp->f_flag |= FNONBLOCK;
848 else
849 fp->f_flag &= ~FNONBLOCK;
850 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
851 break;
852
853 case FIOASYNC:
854 if ( (tmp = *(int *)datap) )
855 fp->f_flag |= FASYNC;
856 else
857 fp->f_flag &= ~FASYNC;
858 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
859 break;
860
861 case FIOSETOWN:
862 tmp = *(int *)datap;
863 if (fp->f_type == DTYPE_SOCKET) {
864 ((struct socket *)fp->f_data)->so_pgid = tmp;
865 error = 0;
866 break;
867 }
868 if (fp->f_type == DTYPE_PIPE) {
869 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
870 break;
871 }
872 if (tmp <= 0) {
873 tmp = -tmp;
874 } else {
875 struct proc *p1 = proc_find(tmp);
876 if (p1 == 0) {
877 error = ESRCH;
878 break;
879 }
880 tmp = p1->p_pgrpid;
881 proc_rele(p1);
882 }
883 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
884 break;
885
886 case FIOGETOWN:
887 if (fp->f_type == DTYPE_SOCKET) {
888 error = 0;
889 *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
890 break;
891 }
892 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
893 *(int *)datap = -*(int *)datap;
894 break;
895
896 default:
897 error = fo_ioctl(fp, com, datap, &context);
898 /*
899 * Copy any data to user, size was
900 * already set and checked above.
901 */
902 if (error == 0 && (com&IOC_OUT) && size)
903 error = copyout(datap, uap->data, (u_int)size);
904 break;
905 }
906 proc_fdunlock(p);
907 if (memp)
908 kfree(memp, size);
909 proc_fdlock(p);
910 out:
911 fp_drop(p, fd, fp, 1);
912 proc_fdunlock(p);
913 return(error);
914 }
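/*
 * ioctl commands encode their parameter size in the high-order word
 * (IOCPARM_LEN) and their direction in IOC_IN/IOC_OUT/IOC_VOID, which
 * drives the copyin/copyout logic above.  A minimal sketch of a
 * caller (FIONBIO takes an int by reference):
 *
 *	int on = 1;
 *	if (ioctl(fd, FIONBIO, &on) == -1)
 *		perror("ioctl");
 */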
915
916 int selwait, nselcoll;
917 #define SEL_FIRSTPASS 1
918 #define SEL_SECONDPASS 2
919 extern int selcontinue(int error);
920 extern int selprocess(int error, int sel_pass);
921 static int selscan(struct proc *p, struct _select * sel,
922 int nfd, register_t *retval, int sel_pass, wait_queue_sub_t wqsub);
923 static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
924 int nfd, int * count, int *kfcount);
925 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
926 extern uint64_t tvtoabstime(struct timeval *tvp);
927
928 /*
929 * Select system call.
930 *
931 * Returns: 0 Success
932 * EINVAL Invalid argument
933 * EAGAIN Nonconformant error if allocation fails
934 * selprocess:???
935 */
936 int
937 select(struct proc *p, struct select_args *uap, register_t *retval)
938 {
939 __pthread_testcancel(1);
940 return(select_nocancel(p, (struct select_nocancel_args *)uap, retval));
941 }
942
943 int
944 select_nocancel(struct proc *p, struct select_nocancel_args *uap, register_t *retval)
945 {
946 int error = 0;
947 u_int ni, nw, size;
948 thread_t th_act;
949 struct uthread *uth;
950 struct _select *sel;
951 int needzerofill = 1;
952 int count = 0;
953 int kfcount = 0;
954
955 th_act = current_thread();
956 uth = get_bsdthread_info(th_act);
957 sel = &uth->uu_select;
958 retval = (int *)get_bsduthreadrval(th_act);
959 *retval = 0;
960
961 if (uap->nd < 0) {
962 return (EINVAL);
963 }
964
965 /* select on thread of process that already called proc_exit() */
966 if (p->p_fd == NULL) {
967 return (EBADF);
968 }
969
970 if (uap->nd > p->p_fd->fd_nfiles)
971 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
972
973 nw = howmany(uap->nd, NFDBITS);
974 ni = nw * sizeof(fd_mask);
975
976 /*
977 * if the previously allocated space for the bits is smaller than
978 * what is requested or no space has yet been allocated for this
979 * thread, allocate enough space now.
980 *
981 	 * Note: If this allocation fails, select() will return EAGAIN; this
982 	 * is the same thing poll() returns in a no-memory situation, but
983 	 * it is not a POSIX compliant error code for select().
984 */
985 if (sel->nbytes < (3 * ni)) {
986 int nbytes = 3 * ni;
987
988 /* Free previous allocation, if any */
989 if (sel->ibits != NULL)
990 FREE(sel->ibits, M_TEMP);
991 if (sel->obits != NULL) {
992 FREE(sel->obits, M_TEMP);
993 /* NULL out; subsequent ibits allocation may fail */
994 sel->obits = NULL;
995 }
996
997 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
998 if (sel->ibits == NULL)
999 return (EAGAIN);
1000 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1001 if (sel->obits == NULL) {
1002 FREE(sel->ibits, M_TEMP);
1003 sel->ibits = NULL;
1004 return (EAGAIN);
1005 }
1006 sel->nbytes = nbytes;
1007 needzerofill = 0;
1008 }
1009
1010 if (needzerofill) {
1011 bzero((caddr_t)sel->ibits, sel->nbytes);
1012 bzero((caddr_t)sel->obits, sel->nbytes);
1013 }
1014
1015 /*
1016 * get the bits from the user address space
1017 */
1018 #define getbits(name, x) \
1019 do { \
1020 if (uap->name && (error = copyin(uap->name, \
1021 (caddr_t)&sel->ibits[(x) * nw], ni))) \
1022 goto continuation; \
1023 } while (0)
1024
1025 getbits(in, 0);
1026 getbits(ou, 1);
1027 getbits(ex, 2);
1028 #undef getbits
1029
1030 if (uap->tv) {
1031 struct timeval atv;
1032 if (IS_64BIT_PROCESS(p)) {
1033 struct user_timeval atv64;
1034 error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1035 /* Loses resolution - assume timeout < 68 years */
1036 atv.tv_sec = atv64.tv_sec;
1037 atv.tv_usec = atv64.tv_usec;
1038 } else {
1039 error = copyin(uap->tv, (caddr_t)&atv, sizeof(atv));
1040 }
1041 if (error)
1042 goto continuation;
1043 if (itimerfix(&atv)) {
1044 error = EINVAL;
1045 goto continuation;
1046 }
1047
1048 clock_absolutetime_interval_to_deadline(
1049 tvtoabstime(&atv), &sel->abstime);
1050 }
1051 else
1052 sel->abstime = 0;
1053
1054 sel->kfcount = 0;
1055 if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count, &kfcount)) ) {
1056 goto continuation;
1057 }
1058 sel->count = count;
1059 sel->kfcount = kfcount;
1060 size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
1061 if (uth->uu_allocsize) {
1062 if (uth->uu_wqset == 0)
1063 panic("select: wql memory smashed");
1064 /* needed for the select now */
1065 if (size > uth->uu_allocsize) {
1066 kfree(uth->uu_wqset, uth->uu_allocsize);
1067 uth->uu_allocsize = size;
1068 uth->uu_wqset = (wait_queue_set_t)kalloc(size);
1069 if (uth->uu_wqset == (wait_queue_set_t)NULL)
1070 panic("failed to allocate memory for waitqueue\n");
1071 }
1072 } else {
1073 sel->count = count;
1074 uth->uu_allocsize = size;
1075 uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize);
1076 if (uth->uu_wqset == (wait_queue_set_t)NULL)
1077 panic("failed to allocate memory for waitqueue\n");
1078 }
1079 bzero(uth->uu_wqset, size);
1080 sel->wql = (char *)uth->uu_wqset + SIZEOF_WAITQUEUE_SET;
1081 wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
1082
1083 continuation:
1084 return selprocess(error, SEL_FIRSTPASS);
1085 }
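/*
 * User-space view of select(2) (a sketch; the one-second timeout is
 * an assumption for illustration):
 *
 *	fd_set rfds;
 *	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
 *	FD_ZERO(&rfds);
 *	FD_SET(fd, &rfds);
 *	if (select(fd + 1, &rfds, NULL, NULL, &tv) > 0 &&
 *	    FD_ISSET(fd, &rfds))
 *		... fd is readable ...
 */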
1086
1087 int
1088 selcontinue(int error)
1089 {
1090 return selprocess(error, SEL_SECONDPASS);
1091 }
1092
1093 int
1094 selprocess(int error, int sel_pass)
1095 {
1096 int ncoll;
1097 u_int ni, nw;
1098 thread_t th_act;
1099 struct uthread *uth;
1100 struct proc *p;
1101 struct select_args *uap;
1102 int *retval;
1103 struct _select *sel;
1104 int unwind = 1;
1105 int prepost = 0;
1106 int somewakeup = 0;
1107 int doretry = 0;
1108 wait_result_t wait_result;
1109
1110 p = current_proc();
1111 th_act = current_thread();
1112 uap = (struct select_args *)get_bsduthreadarg(th_act);
1113 retval = (int *)get_bsduthreadrval(th_act);
1114 uth = get_bsdthread_info(th_act);
1115 sel = &uth->uu_select;
1116
1117 	/* if it is the first pass, the wait queue is not set up yet */
1118 if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1119 unwind = 0;
1120 if (sel->count == 0)
1121 unwind = 0;
1122 retry:
1123 if (error != 0) {
1124 goto done;
1125 }
1126
1127 ncoll = nselcoll;
1128 OSBitOrAtomic(P_SELECT, (UInt32 *)&p->p_flag);
1129 /* skip scans if the select is just for timeouts */
1130 if (sel->count) {
1131 if (sel_pass == SEL_FIRSTPASS)
1132 wait_queue_sub_clearrefs(uth->uu_wqset);
1133
1134 error = selscan(p, sel, uap->nd, retval, sel_pass, (wait_queue_sub_t)uth->uu_wqset);
1135 if (error || *retval) {
1136 goto done;
1137 }
1138 if (prepost) {
1139 			/* if the select was preposted, then we can wake up and discover
1140 			 * that someone else already read the data; go to select again
1141 			 * if time permits
1142 			 */
1142 prepost = 0;
1143 doretry = 1;
1144 }
1145 if (somewakeup) {
1146 somewakeup = 0;
1147 doretry = 1;
1148 }
1149 }
1150
1151 if (uap->tv) {
1152 uint64_t now;
1153
1154 clock_get_uptime(&now);
1155 if (now >= sel->abstime)
1156 goto done;
1157 }
1158
1159 if (doretry) {
1160 /* cleanup obits and try again */
1161 doretry = 0;
1162 sel_pass = SEL_FIRSTPASS;
1163 goto retry;
1164 }
1165
1166 /*
1167 * To effect a poll, the timeout argument should be
1168 * non-nil, pointing to a zero-valued timeval structure.
1169 */
1170 if (uap->tv && sel->abstime == 0) {
1171 goto done;
1172 }
1173
1174 	/* No spurious wakeups due to collisions, no need to check for them */
1175 if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1176 sel_pass = SEL_FIRSTPASS;
1177 goto retry;
1178 }
1179
1180 OSBitAndAtomic(~((uint32_t)P_SELECT), (UInt32 *)&p->p_flag);
1181
1182 /* if the select is just for timeout skip check */
1183 	if (sel->count && (sel_pass == SEL_SECONDPASS))
1184 panic("selprocess: 2nd pass assertwaiting");
1185
1186 /* Wait Queue Subordinate has waitqueue as first element */
1187 wait_result = wait_queue_assert_wait((wait_queue_t)uth->uu_wqset,
1188 &selwait, THREAD_ABORTSAFE, sel->abstime);
1189 if (wait_result != THREAD_AWAKENED) {
1190 /* there are no preposted events */
1191 error = tsleep1(NULL, PSOCK | PCATCH,
1192 "select", 0, selcontinue);
1193 } else {
1194 prepost = 1;
1195 error = 0;
1196 }
1197
1198 sel_pass = SEL_SECONDPASS;
1199 if (error == 0) {
1200 if (!prepost)
1201 somewakeup =1;
1202 goto retry;
1203 }
1204 done:
1205 if (unwind) {
1206 wait_subqueue_unlink_all(uth->uu_wqset);
1207 seldrop(p, sel->ibits, uap->nd);
1208 }
1209 OSBitAndAtomic(~((uint32_t)P_SELECT), (UInt32 *)&p->p_flag);
1210 /* select is not restarted after signals... */
1211 if (error == ERESTART)
1212 error = EINTR;
1213 if (error == EWOULDBLOCK)
1214 error = 0;
1215 nw = howmany(uap->nd, NFDBITS);
1216 ni = nw * sizeof(fd_mask);
1217
1218 #define putbits(name, x) \
1219 do { \
1220 if (uap->name && (error2 = \
1221 copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1222 error = error2; \
1223 } while (0)
1224
1225 if (error == 0) {
1226 int error2;
1227
1228 putbits(in, 0);
1229 putbits(ou, 1);
1230 putbits(ex, 2);
1231 #undef putbits
1232 }
1233 return(error);
1234 }
1235
1236 static int
1237 selscan(struct proc *p, struct _select *sel, int nfd, register_t *retval,
1238 int sel_pass, wait_queue_sub_t wqsub)
1239 {
1240 struct filedesc *fdp = p->p_fd;
1241 int msk, i, j, fd;
1242 u_int32_t bits;
1243 struct fileproc *fp;
1244 int n = 0;
1245 int nc = 0;
1246 static int flag[3] = { FREAD, FWRITE, 0 };
1247 u_int32_t *iptr, *optr;
1248 u_int nw;
1249 u_int32_t *ibits, *obits;
1250 char * wql;
1251 char * wql_ptr;
1252 int count, kfcount;
1253 boolean_t funnel_state;
1254 vnode_t vp;
1255 struct vfs_context context = *vfs_context_current();
1256
1257 	/*
1258 	 * Problems seen at reboot due to Mac OS X signal handling
1259 	 * (Beaker1C); verify that p->p_fd is valid
1260 	 */
1261 if (fdp == NULL) {
1262 *retval=0;
1263 return(EIO);
1264 }
1265 ibits = sel->ibits;
1266 obits = sel->obits;
1267 wql = sel->wql;
1268
1269 nw = howmany(nfd, NFDBITS);
1270
1271 count = sel->count;
1272 kfcount = sel->kfcount;
1273
1274 if (kfcount > count)
1275 panic("selscan: count < kfcount");
1276
1277 if (kfcount != 0) {
1278 funnel_state = thread_funnel_set(kernel_flock, TRUE);
1279
1280 proc_fdlock(p);
1281 for (msk = 0; msk < 3; msk++) {
1282 iptr = (u_int32_t *)&ibits[msk * nw];
1283 optr = (u_int32_t *)&obits[msk * nw];
1284
1285 for (i = 0; i < nfd; i += NFDBITS) {
1286 bits = iptr[i/NFDBITS];
1287
1288 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1289 bits &= ~(1 << j);
1290 fp = fdp->fd_ofiles[fd];
1291
1292 if (fp == NULL ||
1293 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1294 proc_fdunlock(p);
1295 thread_funnel_set(kernel_flock, funnel_state);
1296 return(EBADF);
1297 }
1298 if (sel_pass == SEL_SECONDPASS) {
1299 wql_ptr = (char *)0;
1300 fp->f_flags &= ~FP_INSELECT;
1301 fp->f_waddr = (void *)0;
1302 } else {
1303 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
1304 fp->f_flags |= FP_INSELECT;
1305 fp->f_waddr = (void *)wqsub;
1306 }
1307
1308 context.vc_ucred = fp->f_cred;
1309
1310 if (fp->f_ops && (fp->f_type == DTYPE_VNODE)
1311 && ((vp = (struct vnode *)fp->f_data) != NULLVP)
1312 && (vp->v_type == VCHR)
1313 && fo_select(fp, flag[msk], wql_ptr, &context)) {
1314 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1315 n++;
1316 }
1317 nc++;
1318 }
1319 }
1320 }
1321 proc_fdunlock(p);
1322 thread_funnel_set(kernel_flock, funnel_state);
1323 }
1324
1325 nc = 0;
1326 if (kfcount != count) {
1327 proc_fdlock(p);
1328 for (msk = 0; msk < 3; msk++) {
1329 iptr = (u_int32_t *)&ibits[msk * nw];
1330 optr = (u_int32_t *)&obits[msk * nw];
1331
1332 for (i = 0; i < nfd; i += NFDBITS) {
1333 bits = iptr[i/NFDBITS];
1334
1335 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1336 bits &= ~(1 << j);
1337 fp = fdp->fd_ofiles[fd];
1338
1339 if (fp == NULL ||
1340 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1341 proc_fdunlock(p);
1342 return(EBADF);
1343 }
1344 if (sel_pass == SEL_SECONDPASS) {
1345 wql_ptr = (char *)0;
1346 fp->f_flags &= ~FP_INSELECT;
1347 fp->f_waddr = (void *)0;
1348 } else {
1349 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
1350 fp->f_flags |= FP_INSELECT;
1351 fp->f_waddr = (void *)wqsub;
1352 }
1353
1354 context.vc_ucred = fp->f_cred;
1355
1356 if ((fp->f_ops &&
1357 ((fp->f_type != DTYPE_VNODE)
1358 || (((vp = (struct vnode *)fp->f_data) != NULLVP)
1359 && (vp->v_type != VCHR))
1360 )
1361 && fo_select(fp, flag[msk], wql_ptr, &context))) {
1362 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1363 n++;
1364 }
1365 nc++;
1366 }
1367 }
1368 }
1369 proc_fdunlock(p);
1370 }
1371 *retval = n;
1372 return (0);
1373 }
1374
1375 int poll_callback(struct kqueue *, struct kevent *, void *);
1376
1377 struct poll_continue_args {
1378 user_addr_t pca_fds;
1379 u_int pca_nfds;
1380 u_int pca_rfds;
1381 };
1382
1383 int
1384 poll(struct proc *p, struct poll_args *uap, register_t *retval)
1385 {
1386 __pthread_testcancel(1);
1387 return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
1388 }
1389
1390
1391 int
1392 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, register_t *retval)
1393 {
1394 struct poll_continue_args *cont;
1395 struct pollfd *fds;
1396 struct kqueue *kq;
1397 struct timeval atv;
1398 int ncoll, error = 0;
1399 u_int nfds = uap->nfds;
1400 u_int rfds = 0;
1401 u_int i;
1402 size_t ni;
1403
1404 /*
1405 * This is kinda bogus. We have fd limits, but that is not
1406 * really related to the size of the pollfd array. Make sure
1407 * we let the process use at least FD_SETSIZE entries and at
1408 * least enough for the current limits. We want to be reasonably
1409 * safe, but not overly restrictive.
1410 */
1411 if (nfds > OPEN_MAX ||
1412 (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1413 return (EINVAL);
1414
1415 kq = kqueue_alloc(p);
1416 if (kq == NULL)
1417 return (EAGAIN);
1418
1419 ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1420 MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1421 if (NULL == cont) {
1422 error = EAGAIN;
1423 goto out;
1424 }
1425
1426 fds = (struct pollfd *)&cont[1];
1427 error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1428 if (error)
1429 goto out;
1430
1431 if (uap->timeout != -1) {
1432 struct timeval rtv;
1433
1434 atv.tv_sec = uap->timeout / 1000;
1435 atv.tv_usec = (uap->timeout % 1000) * 1000;
1436 if (itimerfix(&atv)) {
1437 error = EINVAL;
1438 goto out;
1439 }
1440 getmicrouptime(&rtv);
1441 timevaladd(&atv, &rtv);
1442 } else {
1443 atv.tv_sec = 0;
1444 atv.tv_usec = 0;
1445 }
1446
1447 /* JMM - all this P_SELECT stuff is bogus */
1448 ncoll = nselcoll;
1449 OSBitOrAtomic(P_SELECT, (UInt32 *)&p->p_flag);
1450 for (i = 0; i < nfds; i++) {
1451 short events = fds[i].events;
1452 struct kevent kev;
1453 int kerror = 0;
1454
1455 /* per spec, ignore fd values below zero */
1456 if (fds[i].fd < 0) {
1457 fds[i].revents = 0;
1458 continue;
1459 }
1460
1461 /* convert the poll event into a kqueue kevent */
1462 kev.ident = fds[i].fd;
1463 kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
1464 kev.fflags = NOTE_LOWAT;
1465 kev.data = 1; /* efficiency be damned: any data should trigger */
1466 kev.udata = CAST_USER_ADDR_T(&fds[i]);
1467
1468 /* Handle input events */
1469 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1470 kev.filter = EVFILT_READ;
1471 if (!(events & ( POLLIN | POLLRDNORM )))
1472 kev.flags |= EV_OOBAND;
1473 kerror = kevent_register(kq, &kev, p);
1474 }
1475
1476 /* Handle output events */
1477 if (kerror == 0 &&
1478 events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
1479 kev.filter = EVFILT_WRITE;
1480 kerror = kevent_register(kq, &kev, p);
1481 }
1482
1483 /* Handle BSD extension vnode events */
1484 if (kerror == 0 &&
1485 events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
1486 kev.filter = EVFILT_VNODE;
1487 kev.fflags = 0;
1488 if (events & POLLEXTEND)
1489 kev.fflags |= NOTE_EXTEND;
1490 if (events & POLLATTRIB)
1491 kev.fflags |= NOTE_ATTRIB;
1492 if (events & POLLNLINK)
1493 kev.fflags |= NOTE_LINK;
1494 if (events & POLLWRITE)
1495 kev.fflags |= NOTE_WRITE;
1496 kerror = kevent_register(kq, &kev, p);
1497 }
1498
1499 if (kerror != 0) {
1500 fds[i].revents = POLLNVAL;
1501 rfds++;
1502 } else
1503 fds[i].revents = 0;
1504 }
1505
1506 /* Did we have any trouble registering? */
1507 if (rfds > 0)
1508 goto done;
1509
1510 /* scan for, and possibly wait for, the kevents to trigger */
1511 cont->pca_fds = uap->fds;
1512 cont->pca_nfds = nfds;
1513 cont->pca_rfds = rfds;
1514 error = kevent_scan(kq, poll_callback, NULL, cont, &atv, p);
1515 rfds = cont->pca_rfds;
1516
1517 done:
1518 OSBitAndAtomic(~((uint32_t)P_SELECT), (UInt32 *)&p->p_flag);
1519 /* poll is not restarted after signals... */
1520 if (error == ERESTART)
1521 error = EINTR;
1522 if (error == EWOULDBLOCK)
1523 error = 0;
1524 if (error == 0) {
1525 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1526 *retval = rfds;
1527 }
1528
1529 out:
1530 if (NULL != cont)
1531 FREE(cont, M_TEMP);
1532
1533 kqueue_dealloc(kq);
1534 return (error);
1535 }
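/*
 * The implementation above maps each pollfd onto one-shot kevents on
 * a private kqueue.  A user-space sketch of the interface (timeout in
 * milliseconds; the values are illustrative):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN, .revents = 0 };
 *	int n = poll(&pfd, 1, 1000);
 *	if (n > 0 && (pfd.revents & POLLIN))
 *		... fd is readable ...
 */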
1536
1537 int
1538 poll_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data)
1539 {
1540 struct poll_continue_args *cont = (struct poll_continue_args *)data;
1541 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1542 short mask;
1543
1544 /* convert the results back into revents */
1545 if (kevp->flags & EV_EOF)
1546 fds->revents |= POLLHUP;
1547 if (kevp->flags & EV_ERROR)
1548 fds->revents |= POLLERR;
1549
1550 switch (kevp->filter) {
1551 case EVFILT_READ:
1552 if (fds->revents & POLLHUP)
1553 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1554 else {
1555 mask = 0;
1556 if (kevp->data != 0)
1557 mask |= (POLLIN | POLLRDNORM );
1558 if (kevp->flags & EV_OOBAND)
1559 mask |= ( POLLPRI | POLLRDBAND );
1560 }
1561 fds->revents |= (fds->events & mask);
1562 break;
1563
1564 case EVFILT_WRITE:
1565 if (!(fds->revents & POLLHUP))
1566 fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1567 break;
1568
1569 case EVFILT_VNODE:
1570 if (kevp->fflags & NOTE_EXTEND)
1571 fds->revents |= (fds->events & POLLEXTEND);
1572 if (kevp->fflags & NOTE_ATTRIB)
1573 fds->revents |= (fds->events & POLLATTRIB);
1574 if (kevp->fflags & NOTE_LINK)
1575 fds->revents |= (fds->events & POLLNLINK);
1576 if (kevp->fflags & NOTE_WRITE)
1577 fds->revents |= (fds->events & POLLWRITE);
1578 break;
1579 }
1580
1581 if (fds->revents)
1582 cont->pca_rfds++;
1583
1584 return 0;
1585 }
1586
1587 int
1588 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1589 {
1590
1591 return (1);
1592 }
1593
1594 static int
1595 selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits,
1596 int nfd, int *countp, int * kfcountp)
1597 {
1598 struct filedesc *fdp = p->p_fd;
1599 int msk, i, j, fd;
1600 u_int32_t bits;
1601 struct fileproc *fp;
1602 int n = 0;
1603 u_int32_t *iptr;
1604 u_int nw;
1605 int error=0;
1606 int kfc = 0;
1607 int dropcount;
1608 vnode_t vp;
1609
1610 	/*
1611 	 * Problems seen at reboot due to Mac OS X signal handling
1612 	 * (Beaker1C); verify that p->p_fd is valid
1613 	 */
1614 if (fdp == NULL) {
1615 *countp = 0;
1616 *kfcountp = 0;
1617 return(EIO);
1618 }
1619 nw = howmany(nfd, NFDBITS);
1620
1621 proc_fdlock(p);
1622 for (msk = 0; msk < 3; msk++) {
1623 iptr = (u_int32_t *)&ibits[msk * nw];
1624 for (i = 0; i < nfd; i += NFDBITS) {
1625 bits = iptr[i/NFDBITS];
1626 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1627 bits &= ~(1 << j);
1628 fp = fdp->fd_ofiles[fd];
1629 if (fp == NULL ||
1630 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1631 *countp = 0;
1632 *kfcountp = 0;
1633 error = EBADF;
1634 goto bad;
1635 }
1636 fp->f_iocount++;
1637 if ((fp->f_type == DTYPE_VNODE)
1638 && ((vp = (struct vnode *)fp->f_data) != NULLVP)
1639 && (vp->v_type == VCHR) )
1640 kfc++;
1641
1642 n++;
1643 }
1644 }
1645 }
1646 proc_fdunlock(p);
1647
1648 *countp = n;
1649 *kfcountp = kfc;
1650 return (0);
1651 bad:
1652 dropcount = 0;
1653
1654 	if (n == 0)
1655 goto out;
1656 /* undo the iocounts */
1657 for (msk = 0; msk < 3; msk++) {
1658 iptr = (u_int32_t *)&ibits[msk * nw];
1659 for (i = 0; i < nfd; i += NFDBITS) {
1660 bits = iptr[i/NFDBITS];
1661 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1662 bits &= ~(1 << j);
1663 fp = fdp->fd_ofiles[fd];
1664 if (dropcount >= n)
1665 goto out;
1666 fp->f_iocount--;
1667
1668 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1669 p->p_fpdrainwait = 0;
1670 wakeup(&p->p_fpdrainwait);
1671 }
1672 dropcount++;
1673 }
1674 }
1675 }
1676 out:
1677 proc_fdunlock(p);
1678 return(error);
1679 }
1680
1681 static int
1682 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
1683 {
1684 struct filedesc *fdp = p->p_fd;
1685 int msk, i, j, fd;
1686 u_int32_t bits;
1687 struct fileproc *fp;
1688 int n = 0;
1689 u_int32_t *iptr;
1690 u_int nw;
1691
1692 	/*
1693 	 * Problems seen at reboot due to Mac OS X signal handling
1694 	 * (Beaker1C); verify that p->p_fd is valid
1695 	 */
1696 if (fdp == NULL) {
1697 return(EIO);
1698 }
1699
1700 nw = howmany(nfd, NFDBITS);
1701
1702
1703 proc_fdlock(p);
1704 for (msk = 0; msk < 3; msk++) {
1705 iptr = (u_int32_t *)&ibits[msk * nw];
1706 for (i = 0; i < nfd; i += NFDBITS) {
1707 bits = iptr[i/NFDBITS];
1708 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1709 bits &= ~(1 << j);
1710 fp = fdp->fd_ofiles[fd];
1711 if (fp == NULL
1712 #if 0
1713 /* if you are here then it is being closed */
1714 || (fdp->fd_ofileflags[fd] & UF_RESERVED)
1715 #endif
1716 ) {
1717 proc_fdunlock(p);
1718 return(EBADF);
1719 }
1720 n++;
1721 fp->f_iocount--;
1722 fp->f_flags &= ~FP_INSELECT;
1723
1724 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1725 p->p_fpdrainwait = 0;
1726 wakeup(&p->p_fpdrainwait);
1727 }
1728 }
1729 }
1730 }
1731 proc_fdunlock(p);
1732 return (0);
1733 }
1734
1735 /*
1736 * Record a select request.
1737 */
1738 void
1739 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
1740 {
1741 thread_t cur_act = current_thread();
1742 struct uthread * ut = get_bsdthread_info(cur_act);
1743
1744 /* need to look at collisions */
1745
1746 if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) {
1747 return;
1748 }
1749
1750 	/* do not record if this is the second pass of select */
1751 if((p_wql == (void *)0)) {
1752 return;
1753 }
1754
1755 if ((sip->si_flags & SI_INITED) == 0) {
1756 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
1757 sip->si_flags |= SI_INITED;
1758 sip->si_flags &= ~SI_CLEAR;
1759 }
1760
1761 if (sip->si_flags & SI_RECORDED) {
1762 sip->si_flags |= SI_COLL;
1763 } else
1764 sip->si_flags &= ~SI_COLL;
1765
1766 sip->si_flags |= SI_RECORDED;
1767 if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqset))
1768 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqset,
1769 (wait_queue_link_t)p_wql);
1770
1771 return;
1772 }
1773
1774 void
1775 selwakeup(struct selinfo *sip)
1776 {
1777
1778 if ((sip->si_flags & SI_INITED) == 0) {
1779 return;
1780 }
1781
1782 if (sip->si_flags & SI_COLL) {
1783 nselcoll++;
1784 sip->si_flags &= ~SI_COLL;
1785 #if 0
1786 /* will not support */
1787 //wakeup((caddr_t)&selwait);
1788 #endif
1789 }
1790
1791 if (sip->si_flags & SI_RECORDED) {
1792 wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED);
1793 sip->si_flags &= ~SI_RECORDED;
1794 }
1795
1796 }
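/*
 * Typical usage from a character driver's select entry point (a
 * sketch; the softc layout and the mydev_/data_ready names are
 * illustrative, not from this file):
 *
 *	int
 *	mydev_select(dev_t dev, int which, void *wql, struct proc *p)
 *	{
 *		if (data_ready(dev))
 *			return (1);
 *		selrecord(p, &sc->sc_rsel, wql);
 *		return (0);
 *	}
 *
 * and from the completion/interrupt path: selwakeup(&sc->sc_rsel);
 */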
1797
1798 void
1799 selthreadclear(struct selinfo *sip)
1800 {
1801
1802 if ((sip->si_flags & SI_INITED) == 0) {
1803 return;
1804 }
1805 if (sip->si_flags & SI_RECORDED) {
1806 selwakeup(sip);
1807 sip->si_flags &= ~(SI_RECORDED | SI_COLL);
1808 }
1809 sip->si_flags |= SI_CLEAR;
1810 wait_queue_unlinkall_nofree(&sip->si_wait_queue);
1811 }
1812
1813
1814
1815
1816 #define DBG_POST 0x10
1817 #define DBG_WATCH 0x11
1818 #define DBG_WAIT 0x12
1819 #define DBG_MOD 0x13
1820 #define DBG_EWAKEUP 0x14
1821 #define DBG_ENQUEUE 0x15
1822 #define DBG_DEQUEUE 0x16
1823
1824 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
1825 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
1826 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
1827 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
1828 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
1829 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
1830 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
1831
1832
1833 #define EVPROCDEQUE(p, evq) do { \
1834 proc_lock(p); \
1835 if (evq->ee_flags & EV_QUEUED) { \
1836 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
1837 evq->ee_flags &= ~EV_QUEUED; \
1838 } \
1839 proc_unlock(p); \
1840 } while (0);
1841
1842
1843 /*
1844  * called upon socket close. dequeue and free all events for
1845 * the socket... socket must be locked by caller.
1846 */
1847 void
1848 evsofree(struct socket *sp)
1849 {
1850 struct eventqelt *evq, *next;
1851 proc_t p;
1852
1853 if (sp == NULL)
1854 return;
1855
1856 for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
1857 next = evq->ee_slist.tqe_next;
1858 p = evq->ee_proc;
1859
1860 if (evq->ee_flags & EV_QUEUED) {
1861 EVPROCDEQUE(p, evq);
1862 }
1863 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
1864 FREE(evq, M_TEMP);
1865 }
1866 }
1867
1868
1869 /*
1870  * called upon pipe close. dequeue and free all events for
1871 * the pipe... pipe must be locked by caller
1872 */
1873 void
1874 evpipefree(struct pipe *cpipe)
1875 {
1876 struct eventqelt *evq, *next;
1877 proc_t p;
1878
1879 for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
1880 next = evq->ee_slist.tqe_next;
1881 p = evq->ee_proc;
1882
1883 EVPROCDEQUE(p, evq);
1884
1885 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
1886 FREE(evq, M_TEMP);
1887 }
1888 }
1889
1890
1891 /*
1892 * enqueue this event if it's not already queued. wakeup
1893 * the proc if we do queue this event to it...
1894 * entered with proc lock held... we drop it before
1895 * doing the wakeup and return in that state
1896 */
1897 static void
1898 evprocenque(struct eventqelt *evq)
1899 {
1900 proc_t p;
1901
1902 assert(evq);
1903 p = evq->ee_proc;
1904
1905 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
1906
1907 proc_lock(p);
1908
1909 if (evq->ee_flags & EV_QUEUED) {
1910 proc_unlock(p);
1911
1912 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1913 return;
1914 }
1915 evq->ee_flags |= EV_QUEUED;
1916
1917 TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
1918
1919 proc_unlock(p);
1920
1921 wakeup(&p->p_evlist);
1922
1923 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1924 }
1925
1926
1927 /*
1928 * pipe lock must be taken by the caller
1929 */
1930 void
1931 postpipeevent(struct pipe *pipep, int event)
1932 {
1933 int mask;
1934 struct eventqelt *evq;
1935
1936 if (pipep == NULL)
1937 return;
1938 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
1939
1940 for (evq = pipep->pipe_evlist.tqh_first;
1941 evq != NULL; evq = evq->ee_slist.tqe_next) {
1942
1943 if (evq->ee_eventmask == 0)
1944 continue;
1945 mask = 0;
1946
1947 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
1948
1949 case EV_RWBYTES:
1950 if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
1951 mask |= EV_RE;
1952 evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
1953 }
1954 if ((evq->ee_eventmask & EV_WR) &&
1955 (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
1956
1957 if (pipep->pipe_state & PIPE_EOF) {
1958 mask |= EV_WR|EV_RESET;
1959 break;
1960 }
1961 mask |= EV_WR;
1962 evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt;
1963 }
1964 break;
1965
1966 case EV_WCLOSED:
1967 case EV_RCLOSED:
1968 if ((evq->ee_eventmask & EV_RE)) {
1969 mask |= EV_RE|EV_RCLOSED;
1970 }
1971 if ((evq->ee_eventmask & EV_WR)) {
1972 mask |= EV_WR|EV_WCLOSED;
1973 }
1974 break;
1975
1976 default:
1977 return;
1978 }
1979 if (mask) {
1980 /*
1981 * disarm... postevents are nops until this event is 'read' via
1982 * waitevent and then re-armed via modwatch
1983 */
1984 evq->ee_eventmask = 0;
1985
1986 /*
1987 * since events are disarmed until after the waitevent
1988 * the ee_req.er_xxxx fields can't change once we've
1989 * inserted this event into the proc queue...
1990 * therefore, the waitevent will see a 'consistent'
1991 * snapshot of the event, even though it won't hold
1992 * the pipe lock, and we're updating the event outside
1993 * of the proc lock, which it will hold
1994 */
1995 evq->ee_req.er_eventbits |= mask;
1996
1997 KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
1998
1999 evprocenque(evq);
2000 }
2001 }
2002 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2003 }
2004
2005 #if SOCKETS
2006 /*
2007 * given either a sockbuf or a socket run down the
2008 * event list and queue ready events found...
2009 * the socket must be locked by the caller
2010 */
2011 void
2012 postevent(struct socket *sp, struct sockbuf *sb, int event)
2013 {
2014 int mask;
2015 struct eventqelt *evq;
2016 struct tcpcb *tp;
2017
2018 if (sb)
2019 sp = sb->sb_so;
2020 if (sp == NULL)
2021 return;
2022
2023 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);
2024
2025 for (evq = sp->so_evlist.tqh_first;
2026 evq != NULL; evq = evq->ee_slist.tqe_next) {
2027
2028 if (evq->ee_eventmask == 0)
2029 continue;
2030 mask = 0;
2031
2032 /* ready for reading:
2033 - byte cnt >= receive low water mark
2034 - read-half of conn closed
2035 - conn pending for listening sock
2036 - socket error pending
2037
2038 ready for writing
2039 - byte cnt avail >= send low water mark
2040 - write half of conn closed
2041 - socket error pending
2042 - non-blocking conn completed successfully
2043
2044 exception pending
2045 - out of band data
2046 - sock at out of band mark
2047 */
2048
2049 switch (event & EV_DMASK) {
2050
2051 case EV_OOB:
2052 if ((evq->ee_eventmask & EV_EX)) {
2053 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2054 mask |= EV_EX|EV_OOB;
2055 }
2056 break;
2057
2058 case EV_RWBYTES|EV_OOB:
2059 if ((evq->ee_eventmask & EV_EX)) {
2060 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2061 mask |= EV_EX|EV_OOB;
2062 }
2063 /*
2064 * fall into the next case
2065 */
2066 case EV_RWBYTES:
2067 if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
2068 if (sp->so_error) {
2069 if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
2070 if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
2071 (tp->t_state == TCPS_CLOSED)) {
2072 mask |= EV_RE|EV_RESET;
2073 break;
2074 }
2075 }
2076 }
2077 mask |= EV_RE;
2078 evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
2079
2080 if (sp->so_state & SS_CANTRCVMORE) {
2081 mask |= EV_FIN;
2082 break;
2083 }
2084 }
2085 if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
2086 if (sp->so_error) {
2087 if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
2088 if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
2089 (tp->t_state == TCPS_CLOSED)) {
2090 mask |= EV_WR|EV_RESET;
2091 break;
2092 }
2093 }
2094 }
2095 mask |= EV_WR;
2096 evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
2097 }
2098 break;
2099
2100 case EV_RCONN:
2101 if ((evq->ee_eventmask & EV_RE)) {
2102 mask |= EV_RE|EV_RCONN;
2103 evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
2104 }
2105 break;
2106
2107 case EV_WCONN:
2108 if ((evq->ee_eventmask & EV_WR)) {
2109 mask |= EV_WR|EV_WCONN;
2110 }
2111 break;
2112
2113 case EV_RCLOSED:
2114 if ((evq->ee_eventmask & EV_RE)) {
2115 mask |= EV_RE|EV_RCLOSED;
2116 }
2117 break;
2118
2119 case EV_WCLOSED:
2120 if ((evq->ee_eventmask & EV_WR)) {
2121 mask |= EV_WR|EV_WCLOSED;
2122 }
2123 break;
2124
2125 case EV_FIN:
2126 if (evq->ee_eventmask & EV_RE) {
2127 mask |= EV_RE|EV_FIN;
2128 }
2129 break;
2130
2131 case EV_RESET:
2132 case EV_TIMEOUT:
2133 if (evq->ee_eventmask & EV_RE) {
2134 mask |= EV_RE | event;
2135 }
2136 if (evq->ee_eventmask & EV_WR) {
2137 mask |= EV_WR | event;
2138 }
2139 break;
2140
2141 default:
2142 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
2143 return;
2144 } /* switch */
2145
2146 KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
2147
2148 if (mask) {
2149 /*
2150 * disarm... postevents are nops until this event is 'read' via
2151 * waitevent and then re-armed via modwatch
2152 */
2153 evq->ee_eventmask = 0;
2154
2155 /*
2156 * since events are disarmed until after the waitevent,
2157 * the ee_req.er_xxxx fields can't change once we've
2158 * inserted this event into the proc queue...
2159 * since waitevent can't see this event until we
2160 * enqueue it, waitevent will see a 'consistent'
2161 * snapshot of the event, even though it won't hold
2162 * the socket lock, and we're updating the event outside
2163 * of the proc lock, which it will hold
2164 */
2165 evq->ee_req.er_eventbits |= mask;
2166
2167 evprocenque(evq);
2168 }
2169 }
2170 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
2171 }
2172 #endif /* SOCKETS */
2173
2174
2175 /*
2176 * watchevent system call. user passes us an event to watch
2177 * for. we malloc an event object, initialize it, and queue
2178 * it to the open socket or pipe. when the event occurs,
2179 * postevent() will enqueue it back to our proc where we can
2180 * retrieve it via waitevent().
2181 *
2182 * note: only one watch per file per proc is allowed (enforced below)
2183 *
2184 * Returns:
2185 * ENOMEM No memory for operation
2186 * copyin:EFAULT
2187 */
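/*
 * Hypothetical user-space sketch (assumes struct eventreq and the
 * EV_* constants from <sys/ev.h>, plus a watchevent() syscall
 * wrapper, are visible; illustrative only):
 *
 *	struct eventreq er;
 *
 *	bzero(&er, sizeof (er));
 *	er.er_type = EV_FD;		// only EV_FD is accepted below
 *	er.er_handle = fd;		// must name a socket or a pipe
 *	er.er_data = (void *)cookie;	// echoed back by waitevent()
 *
 *	if (watchevent(&er, EV_RE | EV_WR) == -1)
 *		err(1, "watchevent");
 */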
2188 int
2189 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2190 {
2191 struct eventqelt *evq = (struct eventqelt *)0;
2192 struct eventqelt *np = NULL;
2193 struct eventreq64 *erp;
2194 struct fileproc *fp = NULL;
2195 int error;
2196
2197 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2198
2199 // get a qelt and fill it with the user's req
2200 MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2201
2202 if (evq == NULL)
2203 return (ENOMEM);
2204 erp = &evq->ee_req;
2205
2206 // get the user's request pkt
2207
2208 if (IS_64BIT_PROCESS(p)) {
2209 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2210 } else {
2211 struct eventreq32 er32;
2212
2213 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2214 if (error == 0) {
2215 /*
2216 * the user only passes in the
2217 * er_type, er_handle and er_data...
2218 * the other fields are initialized
2219 * below, so don't bother to copy
2220 */
2221 erp->er_type = er32.er_type;
2222 erp->er_handle = er32.er_handle;
2223 erp->er_data = (user_addr_t)er32.er_data;
2224 }
2225 }
2226 if (error) {
2227 FREE(evq, M_TEMP);
2228 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2229
2230 return(error);
2231 }
2232 KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2233
2234 // validate, freeing the qelt on error
2235 error = 0;
2236 proc_fdlock(p);
2237
2238 if (erp->er_type != EV_FD) {
2239 error = EINVAL;
2240 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2241 error = EBADF;
2242 #if SOCKETS
2243 } else if (fp->f_type == DTYPE_SOCKET) {
2244 socket_lock((struct socket *)fp->f_data, 1);
2245 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2246 #endif /* SOCKETS */
2247 } else if (fp->f_type == DTYPE_PIPE) {
2248 PIPE_LOCK((struct pipe *)fp->f_data);
2249 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2250 } else {
2251 fp_drop(p, erp->er_handle, fp, 1);
2252 error = EINVAL;
2253 }
2254 proc_fdunlock(p);
2255
2256 if (error) {
2257 FREE(evq, M_TEMP);
2258
2259 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2260 return(error);
2261 }
2262
2263 /*
2264 * only allow one watch per file per proc
2265 */
2266 for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2267 if (np->ee_proc == p) {
2268 #if SOCKETS
2269 if (fp->f_type == DTYPE_SOCKET)
2270 socket_unlock((struct socket *)fp->f_data, 1);
2271 else
2272 #endif /* SOCKETS */
2273 PIPE_UNLOCK((struct pipe *)fp->f_data);
2274 fp_drop(p, erp->er_handle, fp, 0);
2275 FREE(evq, M_TEMP);
2276
2277 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2278 return(EINVAL);
2279 }
2280 }
2281 erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2282 evq->ee_proc = p;
2283 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2284 evq->ee_flags = 0;
2285
2286 #if SOCKETS
2287 if (fp->f_type == DTYPE_SOCKET) {
2288 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2289 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2290
2291 socket_unlock((struct socket *)fp->f_data, 1);
2292 } else
2293 #endif /* SOCKETS */
2294 {
2295 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2296 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2297
2298 PIPE_UNLOCK((struct pipe *)fp->f_data);
2299 }
2300 fp_drop_event(p, erp->er_handle, fp);
2301
2302 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2303 return(0);
2304 }
2305
2306
2307
2308 /*
2309 * waitevent system call.
2310 * grabs the next waiting event for this proc and returns
2311 * it. if no events are queued, the user can request to sleep
2312 * with or without a timeout, or to poll; poll mode is selected by
2313 * ((tv != NULL && interval == 0) || tv == -1)
2314 */
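/*
 * Hypothetical user-space sketch of the wait modes (assumes a
 * waitevent() syscall wrapper; illustrative only):
 *
 *	struct eventreq er;
 *	struct timeval tv = { 2, 0 };
 *
 *	waitevent(&er, NULL);		// sleep until an event arrives
 *	waitevent(&er, &tv);		// sleep for at most 2 seconds
 *	tv.tv_sec = tv.tv_usec = 0;
 *	waitevent(&er, &tv);		// poll via the timeout path
 *	waitevent(&er, (void *)-1);	// fast poll: skips the copyin
 *
 * a poll that finds nothing queued returns 1 (via *retval) rather
 * than an error.
 */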
2315 int
2316 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2317 {
2318 int error = 0;
2319 struct eventqelt *evq;
2320 struct eventreq64 *erp;
2321 uint64_t abstime, interval;
2322 boolean_t fast_poll = FALSE;
2323 union {
2324 struct eventreq64 er64;
2325 struct eventreq32 er32;
2326 } uer;
2327
2328 interval = 0;
2329
2330 if (uap->tv) {
2331 struct timeval atv;
2332 /*
2333 * check for fast poll method
2334 */
2335 if (IS_64BIT_PROCESS(p)) {
2336 if (uap->tv == (user_addr_t)-1)
2337 fast_poll = TRUE;
2338 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2339 fast_poll = TRUE;
2340
2341 if (fast_poll == TRUE) {
2342 if (p->p_evlist.tqh_first == NULL) {
2343 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2344 /*
2345 * poll failed
2346 */
2347 *retval = 1;
2348 return (0);
2349 }
2350 proc_lock(p);
2351 goto retry;
2352 }
2353 error = copyin(uap->tv, (caddr_t)&atv, sizeof (atv));
2354
2355 if (error)
2356 return(error);
2357 if (itimerfix(&atv)) {
2358 error = EINVAL;
2359 return(error);
2360 }
2361 interval = tvtoabstime(&atv);
2362 }
2363 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2364
2365 proc_lock(p);
2366 retry:
2367 if ((evq = p->p_evlist.tqh_first) != NULL) {
2368 /*
2369 * found one... make a local copy while it's still on the queue
2370 * to prevent it from changing while in the midst of copying...
2371 * we don't want to hold the proc lock across a copyout because
2372 * it might block on a page fault at the target in user space
2373 */
2374 erp = &evq->ee_req;
2375
2376 if (IS_64BIT_PROCESS(p))
2377 bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2378 else {
2379 uer.er32.er_type = erp->er_type;
2380 uer.er32.er_handle = erp->er_handle;
2381 uer.er32.er_data = (uint32_t)erp->er_data;
2382 uer.er32.er_ecnt = erp->er_ecnt;
2383 uer.er32.er_rcnt = erp->er_rcnt;
2384 uer.er32.er_wcnt = erp->er_wcnt;
2385 uer.er32.er_eventbits = erp->er_eventbits;
2386 }
2387 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2388
2389 evq->ee_flags &= ~EV_QUEUED;
2390
2391 proc_unlock(p);
2392
2393 if (IS_64BIT_PROCESS(p))
2394 error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2395 else
2396 error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2397
2398 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2399 evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2400 return (error);
2401 }
2402 else {
2403 if (uap->tv && interval == 0) {
2404 proc_unlock(p);
2405 *retval = 1; // poll failed
2406
2407 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2408 return (error);
2409 }
2410 if (interval != 0)
2411 clock_absolutetime_interval_to_deadline(interval, &abstime);
2412 else
2413 abstime = 0;
2414
2415 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2416
2417 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2418
2419 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2420
2421 if (error == 0)
2422 goto retry;
2423 if (error == ERESTART)
2424 error = EINTR;
2425 if (error == EWOULDBLOCK) {
2426 *retval = 1;
2427 error = 0;
2428 }
2429 }
2430 proc_unlock(p);
2431
2432 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2433 return (error);
2434 }
2435
2436
2437 /*
2438 * modwatch system call. user passes in event to modify.
2439 * if we find it we reset the event bits and queue/dequeue
2440 * the event as needed.
2441 */
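/*
 * Hypothetical user-space sketch (illustrative): a watch is disarmed
 * once its event is returned by waitevent(); re-arm it with a new
 * mask, or remove it with EV_RM:
 *
 *	modwatch(&er, EV_RE | EV_WR);	// re-arm for read/write
 *	modwatch(&er, EV_RM);		// tear the watch down
 */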
2442 int
2443 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2444 {
2445 struct eventreq64 er;
2446 struct eventreq64 *erp = &er;
2447 struct eventqelt *evq = NULL; /* protected by error return */
2448 int error;
2449 struct fileproc *fp;
2450 int flag;
2451
2452 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2453
2454 /*
2455 * get user's request pkt
2456 * just need the er_type and er_handle which sit above the
2457 * problematic er_data (32/64 issue)... so only copy in
2458 * those 2 fields
2459 */
2460 if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2461 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2462 return(error);
2463 }
2464 proc_fdlock(p);
2465
2466 if (erp->er_type != EV_FD) {
2467 error = EINVAL;
2468 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2469 error = EBADF;
2470 #if SOCKETS
2471 } else if (fp->f_type == DTYPE_SOCKET) {
2472 socket_lock((struct socket *)fp->f_data, 1);
2473 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2474 #endif /* SOCKETS */
2475 } else if (fp->f_type == DTYPE_PIPE) {
2476 PIPE_LOCK((struct pipe *)fp->f_data);
2477 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2478 } else {
2479 fp_drop(p, erp->er_handle, fp, 1);
2480 error = EINVAL;
2481 }
2482
2483 if (error) {
2484 proc_fdunlock(p);
2485 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2486 return(error);
2487 }
2488
2489 if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2490 fp->f_flags &= ~FP_WAITEVENT;
2491 }
2492 proc_fdunlock(p);
2493
2494 // locate event if possible
2495 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2496 if (evq->ee_proc == p)
2497 break;
2498 }
2499 if (evq == NULL) {
2500 #if SOCKETS
2501 if (fp->f_type == DTYPE_SOCKET)
2502 socket_unlock((struct socket *)fp->f_data, 1);
2503 else
2504 #endif /* SOCKETS */
2505 PIPE_UNLOCK((struct pipe *)fp->f_data);
2506 fp_drop(p, erp->er_handle, fp, 0);
2507 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2508 return(EINVAL);
2509 }
2510 KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2511
2512 if (uap->u_eventmask == EV_RM) {
2513 EVPROCDEQUE(p, evq);
2514
2515 #if SOCKETS
2516 if (fp->f_type == DTYPE_SOCKET) {
2517 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2518 socket_unlock((struct socket *)fp->f_data, 1);
2519 } else
2520 #endif /* SOCKETS */
2521 {
2522 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2523 PIPE_UNLOCK((struct pipe *)fp->f_data);
2524 }
2525 fp_drop(p, erp->er_handle, fp, 0);
2526 FREE(evq, M_TEMP);
2527 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2528 return(0);
2529 }
2530 switch (uap->u_eventmask & EV_MASK) {
2531
2532 case 0:
2533 flag = 0;
2534 break;
2535
2536 case EV_RE:
2537 case EV_WR:
2538 case EV_RE|EV_WR:
2539 flag = EV_RWBYTES;
2540 break;
2541
2542 case EV_EX:
2543 flag = EV_OOB;
2544 break;
2545
2546 case EV_EX|EV_RE:
2547 case EV_EX|EV_WR:
2548 case EV_EX|EV_RE|EV_WR:
2549 flag = EV_OOB|EV_RWBYTES;
2550 break;
2551
2552 default:
2553 #if SOCKETS
2554 if (fp->f_type == DTYPE_SOCKET)
2555 socket_unlock((struct socket *)fp->f_data, 1);
2556 else
2557 #endif /* SOCKETS */
2558 PIPE_UNLOCK((struct pipe *)fp->f_data);
2559 fp_drop(p, erp->er_handle, fp, 0);
2560 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2561 return(EINVAL);
2562 }
2563 /*
2564 * since we're holding the socket/pipe lock, the event
2565 * cannot go from the unqueued state to the queued state;
2566 * however, it can go from the queued state to the unqueued state
2567 * since that direction is protected by the proc_lock...
2568 * so do a quick check for EV_QUEUED w/o holding the proc lock
2569 * since by far the common case will be NOT EV_QUEUED, this saves
2570 * us taking the proc_lock the majority of the time
2571 */
2572 if (evq->ee_flags & EV_QUEUED) {
2573 /*
2574 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
2575 */
2576 EVPROCDEQUE(p, evq);
2577 }
2578 /*
2579 * while the event is off the proc queue and
2580 * we're holding the socket/pipe lock
2581 * it's safe to update these fields...
2582 */
2583 evq->ee_req.er_eventbits = 0;
2584 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2585
2586 #if SOCKETS
2587 if (fp->f_type == DTYPE_SOCKET) {
2588 postevent((struct socket *)fp->f_data, 0, flag);
2589 socket_unlock((struct socket *)fp->f_data, 1);
2590 } else
2591 #endif /* SOCKETS */
2592 {
2593 postpipeevent((struct pipe *)fp->f_data, flag);
2594 PIPE_UNLOCK((struct pipe *)fp->f_data);
2595 }
2596 fp_drop(p, erp->er_handle, fp, 0);
2597 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
2598 return(0);
2599 }
2600
2601 /* this routine is called from the close of fd with proc_fdlock held */
2602 int
2603 waitevent_close(struct proc *p, struct fileproc *fp)
2604 {
2605 struct eventqelt *evq;
2606
2607
2608 fp->f_flags &= ~FP_WAITEVENT;
2609
2610 #if SOCKETS
2611 if (fp->f_type == DTYPE_SOCKET) {
2612 socket_lock((struct socket *)fp->f_data, 1);
2613 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2614 } else
2615 #endif /* SOCKETS */
2616 if (fp->f_type == DTYPE_PIPE) {
2617 PIPE_LOCK((struct pipe *)fp->f_data);
2618 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2619 }
2620 else {
2621 return(EINVAL);
2622 }
2623 proc_fdunlock(p);
2624
2625
2626 // locate event if possible
2627 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2628 if (evq->ee_proc == p)
2629 break;
2630 }
2631 if (evq == NULL) {
2632 #if SOCKETS
2633 if (fp->f_type == DTYPE_SOCKET)
2634 socket_unlock((struct socket *)fp->f_data, 1);
2635 else
2636 #endif /* SOCKETS */
2637 PIPE_UNLOCK((struct pipe *)fp->f_data);
2638
2639 proc_fdlock(p);
2640
2641 return(EINVAL);
2642 }
2643 EVPROCDEQUE(p, evq);
2644
2645 #if SOCKETS
2646 if (fp->f_type == DTYPE_SOCKET) {
2647 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2648 socket_unlock((struct socket *)fp->f_data, 1);
2649 } else
2650 #endif /* SOCKETS */
2651 {
2652 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2653 PIPE_UNLOCK((struct pipe *)fp->f_data);
2654 }
2655 FREE(evq, M_TEMP);
2656
2657 proc_fdlock(p);
2658
2659 return(0);
2660 }
2661
2662
2663 /*
2664 * gethostuuid
2665 *
2666 * Description: Get the host UUID from IOKit and return it to user space.
2667 *
2668 * Parameters: uuid_buf Pointer to buffer to receive UUID
2669 * timeout Timespec for timeout
2670 *
2671 * Returns: 0 Success
2672 * EWOULDBLOCK Timeout is too short
2673 * copyin/copyout:EFAULT Bad user pointer
2674 *
2675 * Notes: A timeout seems redundant, since if it's tolerable to not
2676 * have a system UUID in hand, then why ask for one?
2677 */
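/*
 * User-space sketch (gethostuuid() is wrapped by libc; the 5 second
 * timeout is illustrative):
 *
 *	uuid_t uuid;
 *	struct timespec ts = { 5, 0 };
 *
 *	if (gethostuuid(uuid, &ts) != 0)
 *		err(1, "gethostuuid");
 */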
2678 int
2679 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused register_t *retval)
2680 {
2681 kern_return_t kret;
2682 int error;
2683 mach_timespec_t mach_ts; /* for IOKit call */
2684 __darwin_uuid_t uuid_kern; /* for IOKit call */
2685
2686 /* Convert the 32/64 bit timespec into a mach_timespec_t */
2687 if ( proc_is64bit(p) ) {
2688 struct user_timespec ts;
2689 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2690 if (error)
2691 return (error);
2692 mach_ts.tv_sec = ts.tv_sec;
2693 mach_ts.tv_nsec = ts.tv_nsec;
2694 } else {
2695 struct timespec ts;
2696 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
2697 if (error)
2698 return (error);
2699 mach_ts.tv_sec = ts.tv_sec;
2700 mach_ts.tv_nsec = ts.tv_nsec;
2701 }
2702
2703 /* Call IOKit with the stack buffer to get the UUID */
2704 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2705
2706 /*
2707 * If we get it, copy out the data to the user buffer; note that a
2708 * uuid_t is an array of characters, so this is size invariant for
2709 * 32 vs. 64 bit.
2710 */
2711 if (kret == KERN_SUCCESS) {
2712 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2713 } else {
2714 error = EWOULDBLOCK;
2715 }
2716
2717 return (error);
2718 }