bsd/kern/sys_generic.c

   1 /*
   2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/ioctl.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/proc_internal.h>
  81 #include <sys/socketvar.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/kernel.h>
  84 #include <sys/stat.h>
  85 #include <sys/malloc.h>
  86 #include <sys/sysproto.h>
  87
  88 #include <sys/mount_internal.h>
  89 #include <sys/protosw.h>
  90 #include <sys/ev.h>
  91 #include <sys/user.h>
  92 #include <sys/kdebug.h>
  93 #include <sys/poll.h>
  94 #include <sys/event.h>
  95 #include <sys/eventvar.h>
  96 #include <sys/proc.h>
  97 #include <sys/kauth.h>
  98
  99 #include <mach/mach_types.h>
 100 #include <kern/kern_types.h>
 101 #include <kern/assert.h>
 102 #include <kern/kalloc.h>
 103 #include <kern/thread.h>
 104 #include <kern/clock.h>
 105 #include <kern/ledger.h>
 106 #include <kern/task.h>
 107 #if CONFIG_TELEMETRY
 108 #include <kern/telemetry.h>
 109 #endif
 110
 111 #include <sys/mbuf.h>
 112 #include <sys/domain.h>
 113 #include <sys/socket.h>
 114 #include <sys/socketvar.h>
 115 #include <sys/errno.h>
 116 #include <sys/syscall.h>
 117 #include <sys/pipe.h>
 118
 119 #include <security/audit/audit.h>
 120
 121 #include <net/if.h>
 122 #include <net/route.h>
 123
 124 #include <netinet/in.h>
 125 #include <netinet/in_systm.h>
 126 #include <netinet/ip.h>
 127 #include <netinet/in_pcb.h>
 128 #include <netinet/ip_var.h>
 129 #include <netinet/ip6.h>
 130 #include <netinet/tcp.h>
 131 #include <netinet/tcp_fsm.h>
 132 #include <netinet/tcp_seq.h>
 133 #include <netinet/tcp_timer.h>
 134 #include <netinet/tcp_var.h>
 135 #include <netinet/tcpip.h>
 136 #include <netinet/tcp_debug.h>
 137 /* for wait queue based select */
 138 #include <kern/wait_queue.h>
 139 #include <kern/kalloc.h>
 140 #include <sys/vnode_internal.h>
 141
 142 #include <pexpert/pexpert.h>
 143
 144 /* XXX should be in a header file somewhere */
 145 void evsofree(struct socket *);
 146 void evpipefree(struct pipe *);
 147 void postpipeevent(struct pipe *, int);
 148 void postevent(struct socket *, struct sockbuf *, int);
 149 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
 150
 151 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 152 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 153 extern void     *get_bsduthreadarg(thread_t);
 154 extern int      *get_bsduthreadrval(thread_t);
 155
 156 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
 157                                                                    user_addr_t bufp, user_size_t nbyte,
 158                                                                    off_t offset, int flags, user_ssize_t *retval);
 159 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
 160                                                                         user_addr_t bufp, user_size_t nbyte,
 161                                                                         off_t offset, int flags, user_ssize_t *retval);
 162 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
 163 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
 164
 165
 166 /* Conflict wait queue for when selects collide (opaque type) */
 167 struct wait_queue select_conflict_queue;
 168
 169 #if 13841988
 170 int temp_debug_13841988 = 0;
 171 #endif
 172
 173 /*
 174  * Init routine called from bsd_init.c
 175  */
 176 void select_wait_queue_init(void);
 177 void
 178 select_wait_queue_init(void)
 179 {
 180         wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO);
 181 #if 13841988
 182         if (PE_parse_boot_argn("temp_debug_13841988", &temp_debug_13841988, sizeof(temp_debug_13841988))) {
 183                 kprintf("Temporary debugging for 13841988 enabled\n");
 184         }
 185 #endif
 186 }
 187
 188 #define f_flag f_fglob->fg_flag
 189 #define f_type f_fglob->fg_ops->fo_type
 190 #define f_msgcount f_fglob->fg_msgcount
 191 #define f_cred f_fglob->fg_cred
 192 #define f_ops f_fglob->fg_ops
 193 #define f_offset f_fglob->fg_offset
 194 #define f_data f_fglob->fg_data
 195
 196 /*
 197  * Read system call.
 198  *
 199  * Returns:     0                       Success
 200  *      preparefileread:EBADF
 201  *      preparefileread:ESPIPE
 202  *      preparefileread:ENXIO
 203  *      preparefileread:EBADF
 204  *      dofileread:???
 205  */
 206 int
 207 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
 208 {
 209         __pthread_testcancel(1);
 210         return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
 211 }
 212
 213 int
 214 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
 215 {
 216         struct fileproc *fp;
 217         int error;
 218         int fd = uap->fd;
 219         struct vfs_context context;
 220
 221         if ( (error = preparefileread(p, &fp, fd, 0)) )
 222                 return (error);
 223
 224         context = *(vfs_context_current());
 225         context.vc_ucred = fp->f_fglob->fg_cred;
 226
 227         error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
 228                            (off_t)-1, 0, retval);
 229
 230         donefileread(p, fp, fd);
 231
 232         return (error);
 233 }
 234
 235 /*
 236  * Pread system call
 237  *
 238  * Returns:     0                       Success
 239  *      preparefileread:EBADF
 240  *      preparefileread:ESPIPE
 241  *      preparefileread:ENXIO
 242  *      preparefileread:EBADF
 243  *      dofileread:???
 244  */
 245 int
 246 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
 247 {
 248         __pthread_testcancel(1);
 249         return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
 250 }
 251
 252 int
 253 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
 254 {
 255         struct fileproc *fp = NULL;     /* fp set by preparefileread() */
 256         int fd = uap->fd;
 257         int error;
 258         struct vfs_context context;
 259
 260         if ( (error = preparefileread(p, &fp, fd, 1)) )
 261                 goto out;
 262
 263         context = *(vfs_context_current());
 264         context.vc_ucred = fp->f_fglob->fg_cred;
 265
 266         error = dofileread(&context, fp, uap->buf, uap->nbyte,
 267                         uap->offset, FOF_OFFSET, retval);
 268
 269         donefileread(p, fp, fd);
 270
 271         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
 272               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 273
 274 out:
 275         return (error);
 276 }
 277
 278 /*
 279  * Code common for read and pread
 280  */
 281
 282 void
 283 donefileread(struct proc *p, struct fileproc *fp, int fd)
 284 {
 285         proc_fdlock_spin(p);
 286         fp_drop(p, fd, fp, 1);
 287         proc_fdunlock(p);
 288 }
 289
 290 /*
 291  * Returns:     0                       Success
 292  *              EBADF
 293  *              ESPIPE
 294  *              ENXIO
 295  *      fp_lookup:EBADF
 296  *      fo_read:???
 297  */
 298 int
 299 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
 300 {
 301         vnode_t vp;
 302         int     error;
 303         struct fileproc *fp;
 304
 305         AUDIT_ARG(fd, fd);
 306
 307         proc_fdlock_spin(p);
 308
 309         error = fp_lookup(p, fd, &fp, 1);
 310
 311         if (error) {
 312                 proc_fdunlock(p);
 313                 return (error);
 314         }
 315         if ((fp->f_flag & FREAD) == 0) {
 316                 error = EBADF;
 317                 goto out;
 318         }
 319         if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
 320                 error = ESPIPE;
 321                 goto out;
 322         }
 323         if (fp->f_type == DTYPE_VNODE) {
 324                 vp = (struct vnode *)fp->f_fglob->fg_data;
 325
 326                 if (check_for_pread && (vnode_isfifo(vp))) {
 327                         error = ESPIPE;
 328                         goto out;
 329                 }
 330                 if (check_for_pread && (vp->v_flag & VISTTY)) {
 331                         error = ENXIO;
 332                         goto out;
 333                 }
 334         }
 335
 336         *fp_ret = fp;
 337
 338         proc_fdunlock(p);
 339         return (0);
 340
 341 out:
 342         fp_drop(p, fd, fp, 1);
 343         proc_fdunlock(p);
 344         return (error);
 345 }
 346
 347
 348 /*
 349  * Returns:     0                       Success
 350  *              EINVAL
 351  *      fo_read:???
 352  */
 353 __private_extern__ int
 354 dofileread(vfs_context_t ctx, struct fileproc *fp,
 355            user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
 356            user_ssize_t *retval)
 357 {
 358         uio_t auio;
 359         user_ssize_t bytecnt;
 360         long error = 0;
 361         char uio_buf[ UIO_SIZEOF(1) ];
 362
 363         if (nbyte > INT_MAX)
 364                 return (EINVAL);
 365
 366         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
 367                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 368                                                                           &uio_buf[0], sizeof(uio_buf));
 369         } else {
 370                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 371                                                                           &uio_buf[0], sizeof(uio_buf));
 372         }
 373         uio_addiov(auio, bufp, nbyte);
 374
 375         bytecnt = nbyte;
 376
 377         if ((error = fo_read(fp, auio, flags, ctx))) {
 378                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 379                         error == EINTR || error == EWOULDBLOCK))
 380                         error = 0;
 381         }
 382         bytecnt -= uio_resid(auio);
 383
 384         *retval = bytecnt;
 385
 386         return (error);
 387 }
 388
 389 /*
 390  * Scatter read system call.
 391  *
 392  * Returns:     0                       Success
 393  *              EINVAL
 394  *              ENOMEM
 395  *      copyin:EFAULT
 396  *      rd_uio:???
 397  */
 398 int
 399 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
 400 {
 401         __pthread_testcancel(1);
 402         return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
 403 }
 404
 405 int
 406 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
 407 {
 408         uio_t auio = NULL;
 409         int error;
 410         struct user_iovec *iovp;
 411
 412         /* Verify range bedfore calling uio_create() */
 413         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 414                 return (EINVAL);
 415
 416         /* allocate a uio large enough to hold the number of iovecs passed */
 417         auio = uio_create(uap->iovcnt, 0,
 418                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 419                                   UIO_READ);
 420
 421         /* get location of iovecs within the uio.  then copyin the iovecs from
 422          * user space.
 423          */
 424         iovp = uio_iovsaddr(auio);
 425         if (iovp == NULL) {
 426                 error = ENOMEM;
 427                 goto ExitThisRoutine;
 428         }
 429         error = copyin_user_iovec_array(uap->iovp,
 430                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
 431                 uap->iovcnt, iovp);
 432         if (error) {
 433                 goto ExitThisRoutine;
 434         }
 435
 436         /* finalize uio_t for use and do the IO
 437          */
 438         error = uio_calculateresid(auio);
 439         if (error) {
 440                 goto ExitThisRoutine;
 441         }
 442         error = rd_uio(p, uap->fd, auio, retval);
 443
 444 ExitThisRoutine:
 445         if (auio != NULL) {
 446                 uio_free(auio);
 447         }
 448         return (error);
 449 }
 450
 451 /*
 452  * Write system call
 453  *
 454  * Returns:     0                       Success
 455  *              EBADF
 456  *      fp_lookup:EBADF
 457  *      dofilewrite:???
 458  */
 459 int
 460 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
 461 {
 462         __pthread_testcancel(1);
 463         return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
 464
 465 }
 466
 467 int
 468 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
 469 {
 470         struct fileproc *fp;
 471         int error;
 472         int fd = uap->fd;
 473
 474         AUDIT_ARG(fd, fd);
 475
 476         error = fp_lookup(p,fd,&fp,0);
 477         if (error)
 478                 return(error);
 479         if ((fp->f_flag & FWRITE) == 0) {
 480                 error = EBADF;
 481         } else {
 482                 struct vfs_context context = *(vfs_context_current());
 483                 context.vc_ucred = fp->f_fglob->fg_cred;
 484
 485                 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
 486                         (off_t)-1, 0, retval);
 487         }
 488         if (error == 0)
 489                 fp_drop_written(p, fd, fp);
 490         else
 491                 fp_drop(p, fd, fp, 0);
 492         return(error);
 493 }
 494
 495 /*
 496  * pwrite system call
 497  *
 498  * Returns:     0                       Success
 499  *              EBADF
 500  *              ESPIPE
 501  *              ENXIO
 502  *              EINVAL
 503  *      fp_lookup:EBADF
 504  *      dofilewrite:???
 505  */
 506 int
 507 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
 508 {
 509         __pthread_testcancel(1);
 510         return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
 511 }
 512
 513 int
 514 pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
 515 {
 516         struct fileproc *fp;
 517         int error;
 518         int fd = uap->fd;
 519         vnode_t vp  = (vnode_t)0;
 520
 521         AUDIT_ARG(fd, fd);
 522
 523         error = fp_lookup(p,fd,&fp,0);
 524         if (error)
 525                 return(error);
 526
 527         if ((fp->f_flag & FWRITE) == 0) {
 528                 error = EBADF;
 529         } else {
 530                 struct vfs_context context = *vfs_context_current();
 531                 context.vc_ucred = fp->f_fglob->fg_cred;
 532
 533                 if (fp->f_type != DTYPE_VNODE) {
 534                         error = ESPIPE;
 535                         goto errout;
 536                 }
 537                 vp = (vnode_t)fp->f_fglob->fg_data;
 538                 if (vnode_isfifo(vp)) {
 539                         error = ESPIPE;
 540                         goto errout;
 541                 }
 542                 if ((vp->v_flag & VISTTY)) {
 543                         error = ENXIO;
 544                         goto errout;
 545                 }
 546                 if (uap->offset == (off_t)-1) {
 547                         error = EINVAL;
 548                         goto errout;
 549                 }
 550
 551                     error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
 552                         uap->offset, FOF_OFFSET, retval);
 553         }
 554 errout:
 555         if (error == 0)
 556                 fp_drop_written(p, fd, fp);
 557         else
 558                 fp_drop(p, fd, fp, 0);
 559
 560         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
 561               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 562
 563         return(error);
 564 }
 565
 566 /*
 567  * Returns:     0                       Success
 568  *              EINVAL
 569  *      <fo_write>:EPIPE
 570  *      <fo_write>:???                  [indirect through struct fileops]
 571  */
 572 __private_extern__ int
 573 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
 574             user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
 575             user_ssize_t *retval)
 576 {
 577         uio_t auio;
 578         long error = 0;
 579         user_ssize_t bytecnt;
 580         char uio_buf[ UIO_SIZEOF(1) ];
 581
 582         if (nbyte > INT_MAX)
 583                 return (EINVAL);
 584
 585         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
 586                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 587                                                                           &uio_buf[0], sizeof(uio_buf));
 588         } else {
 589                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 590                                                                           &uio_buf[0], sizeof(uio_buf));
 591         }
 592         uio_addiov(auio, bufp, nbyte);
 593
 594         bytecnt = nbyte;
 595         if ((error = fo_write(fp, auio, flags, ctx))) {
 596                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 597                         error == EINTR || error == EWOULDBLOCK))
 598                         error = 0;
 599                 /* The socket layer handles SIGPIPE */
 600                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
 601                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
 602                         /* XXX Raise the signal on the thread? */
 603                         psignal(vfs_context_proc(ctx), SIGPIPE);
 604                 }
 605         }
 606         bytecnt -= uio_resid(auio);
 607         *retval = bytecnt;
 608
 609         return (error);
 610 }
 611
 612 /*
 613  * Gather write system call
 614  */
 615 int
 616 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
 617 {
 618         __pthread_testcancel(1);
 619         return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
 620 }
 621
 622 int
 623 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
 624 {
 625         uio_t auio = NULL;
 626         int error;
 627         struct user_iovec *iovp;
 628
 629         AUDIT_ARG(fd, uap->fd);
 630
 631         /* Verify range bedfore calling uio_create() */
 632         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 633                 return (EINVAL);
 634
 635         /* allocate a uio large enough to hold the number of iovecs passed */
 636         auio = uio_create(uap->iovcnt, 0,
 637                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 638                                   UIO_WRITE);
 639
 640         /* get location of iovecs within the uio.  then copyin the iovecs from
 641          * user space.
 642          */
 643         iovp = uio_iovsaddr(auio);
 644         if (iovp == NULL) {
 645                 error = ENOMEM;
 646                 goto ExitThisRoutine;
 647         }
 648         error = copyin_user_iovec_array(uap->iovp,
 649                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
 650                 uap->iovcnt, iovp);
 651         if (error) {
 652                 goto ExitThisRoutine;
 653         }
 654
 655         /* finalize uio_t for use and do the IO
 656          */
 657         error = uio_calculateresid(auio);
 658         if (error) {
 659                 goto ExitThisRoutine;
 660         }
 661         error = wr_uio(p, uap->fd, auio, retval);
 662
 663 ExitThisRoutine:
 664         if (auio != NULL) {
 665                 uio_free(auio);
 666         }
 667         return (error);
 668 }
 669
 670
 671 int
 672 wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
 673 {
 674         struct fileproc *fp;
 675         int error;
 676         user_ssize_t count;
 677         struct vfs_context context = *vfs_context_current();
 678
 679         error = fp_lookup(p,fdes,&fp,0);
 680         if (error)
 681                 return(error);
 682
 683         if ((fp->f_flag & FWRITE) == 0) {
 684                 error = EBADF;
 685                 goto out;
 686         }
 687         count = uio_resid(uio);
 688
 689         context.vc_ucred = fp->f_cred;
 690         error = fo_write(fp, uio, 0, &context);
 691         if (error) {
 692                 if (uio_resid(uio) != count && (error == ERESTART ||
 693                                                 error == EINTR || error == EWOULDBLOCK))
 694                         error = 0;
 695                 /* The socket layer handles SIGPIPE */
 696                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
 697                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
 698                         psignal(p, SIGPIPE);
 699         }
 700         *retval = count - uio_resid(uio);
 701
 702 out:
 703         if (error == 0)
 704                 fp_drop_written(p, fdes, fp);
 705         else
 706                 fp_drop(p, fdes, fp, 0);
 707         return(error);
 708 }
 709
 710
 711 int
 712 rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
 713 {
 714         struct fileproc *fp;
 715         int error;
 716         user_ssize_t count;
 717         struct vfs_context context = *vfs_context_current();
 718
 719         if ( (error = preparefileread(p, &fp, fdes, 0)) )
 720                 return (error);
 721
 722         count = uio_resid(uio);
 723
 724         context.vc_ucred = fp->f_cred;
 725
 726         error = fo_read(fp, uio, 0, &context);
 727
 728         if (error) {
 729                 if (uio_resid(uio) != count && (error == ERESTART ||
 730                                                 error == EINTR || error == EWOULDBLOCK))
 731                         error = 0;
 732         }
 733         *retval = count - uio_resid(uio);
 734
 735         donefileread(p, fp, fdes);
 736
 737         return (error);
 738 }
 739
 740 /*
 741  * Ioctl system call
 742  *
 743  * Returns:     0                       Success
 744  *              EBADF
 745  *              ENOTTY
 746  *              ENOMEM
 747  *              ESRCH
 748  *      copyin:EFAULT
 749  *      copyoutEFAULT
 750  *      fp_lookup:EBADF                 Bad file descriptor
 751  *      fo_ioctl:???
 752  */
 753 int
 754 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
 755 {
 756         struct fileproc *fp = NULL;
 757         int error = 0;
 758         u_int size = 0;
 759         caddr_t datap = NULL, memp = NULL;
 760         boolean_t is64bit = FALSE;
 761         int tmp = 0;
 762 #define STK_PARAMS      128
 763         char stkbuf[STK_PARAMS];
 764         int fd = uap->fd;
 765         u_long com = uap->com;
 766         struct vfs_context context = *vfs_context_current();
 767
 768         AUDIT_ARG(fd, uap->fd);
 769         AUDIT_ARG(addr, uap->data);
 770
 771         is64bit = proc_is64bit(p);
 772 #if CONFIG_AUDIT
 773         if (is64bit)
 774                 AUDIT_ARG(value64, com);
 775         else
 776                 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
 777 #endif /* CONFIG_AUDIT */
 778
 779         /*
 780          * Interpret high order word to find amount of data to be
 781          * copied to/from the user's address space.
 782          */
 783         size = IOCPARM_LEN(com);
 784         if (size > IOCPARM_MAX)
 785                         return ENOTTY;
 786         if (size > sizeof (stkbuf)) {
 787                 if ((memp = (caddr_t)kalloc(size)) == 0)
 788                         return ENOMEM;
 789                 datap = memp;
 790         } else
 791                 datap = &stkbuf[0];
 792         if (com & IOC_IN) {
 793                 if (size) {
 794                         error = copyin(uap->data, datap, size);
 795                         if (error)
 796                                 goto out_nofp;
 797                 } else {
 798                         /* XXX - IOC_IN and no size?  we should proably return an error here!! */
 799                         if (is64bit) {
 800                                 *(user_addr_t *)datap = uap->data;
 801                         }
 802                         else {
 803                                 *(uint32_t *)datap = (uint32_t)uap->data;
 804                         }
 805                 }
 806         } else if ((com & IOC_OUT) && size)
 807                 /*
 808                  * Zero the buffer so the user always
 809                  * gets back something deterministic.
 810                  */
 811                 bzero(datap, size);
 812         else if (com & IOC_VOID) {
 813                 /* XXX - this is odd since IOC_VOID means no parameters */
 814                 if (is64bit) {
 815                         *(user_addr_t *)datap = uap->data;
 816                 }
 817                 else {
 818                         *(uint32_t *)datap = (uint32_t)uap->data;
 819                 }
 820         }
 821
 822         proc_fdlock(p);
 823         error = fp_lookup(p,fd,&fp,1);
 824         if (error)  {
 825                 proc_fdunlock(p);
 826                 goto out_nofp;
 827         }
 828
 829         AUDIT_ARG(file, p, fp);
 830
 831         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 832                         error = EBADF;
 833                         goto out;
 834         }
 835
 836         context.vc_ucred = fp->f_fglob->fg_cred;
 837
 838 #if CONFIG_MACF
 839         error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
 840         if (error)
 841                 goto out;
 842 #endif
 843
 844         switch (com) {
 845         case FIONCLEX:
 846                 *fdflags(p, fd) &= ~UF_EXCLOSE;
 847                 break;
 848
 849         case FIOCLEX:
 850                 *fdflags(p, fd) |= UF_EXCLOSE;
 851                 break;
 852
 853         case FIONBIO:
 854                 if ( (tmp = *(int *)datap) )
 855                         fp->f_flag |= FNONBLOCK;
 856                 else
 857                         fp->f_flag &= ~FNONBLOCK;
 858                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
 859                 break;
 860
 861         case FIOASYNC:
 862                 if ( (tmp = *(int *)datap) )
 863                         fp->f_flag |= FASYNC;
 864                 else
 865                         fp->f_flag &= ~FASYNC;
 866                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
 867                 break;
 868
 869         case FIOSETOWN:
 870                 tmp = *(int *)datap;
 871                 if (fp->f_type == DTYPE_SOCKET) {
 872                         ((struct socket *)fp->f_data)->so_pgid = tmp;
 873                         break;
 874                 }
 875                 if (fp->f_type == DTYPE_PIPE) {
 876                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
 877                         break;
 878                 }
 879                 if (tmp <= 0) {
 880                         tmp = -tmp;
 881                 } else {
 882                         struct proc *p1 = proc_find(tmp);
 883                         if (p1 == 0) {
 884                                 error = ESRCH;
 885                                 break;
 886                         }
 887                         tmp = p1->p_pgrpid;
 888                         proc_rele(p1);
 889                 }
 890                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
 891                 break;
 892
 893         case FIOGETOWN:
 894                 if (fp->f_type == DTYPE_SOCKET) {
 895                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
 896                         break;
 897                 }
 898                 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
 899                 *(int *)datap = -*(int *)datap;
 900                 break;
 901
 902         default:
 903                 error = fo_ioctl(fp, com, datap, &context);
 904                 /*
 905                  * Copy any data to user, size was
 906                  * already set and checked above.
 907                  */
 908                 if (error == 0 && (com & IOC_OUT) && size)
 909                         error = copyout(datap, uap->data, (u_int)size);
 910                 break;
 911         }
 912 out:
 913         fp_drop(p, fd, fp, 1);
 914         proc_fdunlock(p);
 915
 916 out_nofp:
 917         if (memp)
 918                 kfree(memp, size);
 919         return(error);
 920 }
 921
 922 int     selwait, nselcoll;      /* nselcoll is sampled (but not otherwise used) by selprocess() and poll_nocancel() */
     /*
      * Pass identifiers handed to selprocess() and selscan():
      * select_nocancel() enters with SEL_FIRSTPASS, selcontinue() re-enters
      * with SEL_SECONDPASS.
      */
 923 #define SEL_FIRSTPASS 1
 924 #define SEL_SECONDPASS 2
     /* selprocess() passes selcontinue to tsleep1() as its continuation argument. */
 925 extern int selcontinue(int error);
 926 extern int selprocess(int error, int sel_pass);
     /* File-local helpers used by the select path. */
 927 static int selscan(struct proc *p, struct _select * sel,
 928                         int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub);
 929 static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
 930 static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
 931 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
 932
 933 /*
 934  * Select system call.
 935  *
 936  * Returns:     0                       Success
 937  *              EINVAL                  Invalid argument
 938  *              EAGAIN                  Nonconformant error if allocation fails
 939  *      selprocess:???
 940  */
 941 int
 942 select(struct proc *p, struct select_args *uap, int32_t *retval)
 943 {
 944         __pthread_testcancel(1);
 945         return(select_nocancel(p, (struct select_nocancel_args *)uap, retval));
 946 }
 947
 948 int
 949 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
 950 {
 951         int error = 0;
 952         u_int ni, nw, size;
 953         thread_t th_act;
 954         struct uthread  *uth;
 955         struct _select *sel;
 956         int needzerofill = 1;
 957         int count = 0;
 958
 959         th_act = current_thread();
 960         uth = get_bsdthread_info(th_act);
 961         sel = &uth->uu_select;
 962         sel->data = &uth->uu_kevent.ss_select_data;
 963         retval = (int *)get_bsduthreadrval(th_act);
 964         *retval = 0;
 965
 966         if (uap->nd < 0) {
 967                 return (EINVAL);
 968         }
 969
 970         /* select on thread of process that already called proc_exit() */
 971         if (p->p_fd == NULL) {
 972                 return (EBADF);
 973         }
 974
 975         if (uap->nd > p->p_fd->fd_nfiles)
 976                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
 977
 978         nw = howmany(uap->nd, NFDBITS);
 979         ni = nw * sizeof(fd_mask);
 980
 981         /*
 982          * if the previously allocated space for the bits is smaller than
 983          * what is requested or no space has yet been allocated for this
 984          * thread, allocate enough space now.
 985          *
 986          * Note: If this process fails, select() will return EAGAIN; this
 987          * is the same thing pool() returns in a no-memory situation, but
 988          * it is not a POSIX compliant error code for select().
 989          */
 990         if (sel->nbytes < (3 * ni)) {
 991                 int nbytes = 3 * ni;
 992
 993                 /* Free previous allocation, if any */
 994                 if (sel->ibits != NULL)
 995                         FREE(sel->ibits, M_TEMP);
 996                 if (sel->obits != NULL) {
 997                         FREE(sel->obits, M_TEMP);
 998                         /* NULL out; subsequent ibits allocation may fail */
 999                         sel->obits = NULL;
1000                 }
1001
1002                 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1003                 if (sel->ibits == NULL)
1004                         return (EAGAIN);
1005                 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1006                 if (sel->obits == NULL) {
1007                         FREE(sel->ibits, M_TEMP);
1008                         sel->ibits = NULL;
1009                         return (EAGAIN);
1010                 }
1011                 sel->nbytes = nbytes;
1012                 needzerofill = 0;
1013         }
1014
1015         if (needzerofill) {
1016                 bzero((caddr_t)sel->ibits, sel->nbytes);
1017                 bzero((caddr_t)sel->obits, sel->nbytes);
1018         }
1019
1020         /*
1021          * get the bits from the user address space
1022          */
1023 #define getbits(name, x) \
1024         do { \
1025                 if (uap->name && (error = copyin(uap->name, \
1026                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
1027                         goto continuation; \
1028         } while (0)
1029
1030         getbits(in, 0);
1031         getbits(ou, 1);
1032         getbits(ex, 2);
1033 #undef  getbits
1034
1035         if (uap->tv) {
1036                 struct timeval atv;
1037                 if (IS_64BIT_PROCESS(p)) {
1038                         struct user64_timeval atv64;
1039                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1040                         /* Loses resolution - assume timeout < 68 years */
1041                         atv.tv_sec = atv64.tv_sec;
1042                         atv.tv_usec = atv64.tv_usec;
1043                 } else {
1044                         struct user32_timeval atv32;
1045                         error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
1046                         atv.tv_sec = atv32.tv_sec;
1047                         atv.tv_usec = atv32.tv_usec;
1048                 }
1049                 if (error)
1050                         goto continuation;
1051                 if (itimerfix(&atv)) {
1052                         error = EINVAL;
1053                         goto continuation;
1054                 }
1055
1056                 clock_absolutetime_interval_to_deadline(
1057                                                                                 tvtoabstime(&atv), &sel->data->abstime);
1058         }
1059         else
1060                 sel->data->abstime = 0;
1061
1062         if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
1063                         goto continuation;
1064         }
1065
1066         sel->data->count = count;
1067         size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
1068         if (uth->uu_allocsize) {
1069                 if (uth->uu_wqset == 0)
1070                         panic("select: wql memory smashed");
1071                 /* needed for the select now */
1072                 if (size > uth->uu_allocsize) {
1073                         kfree(uth->uu_wqset,  uth->uu_allocsize);
1074                         uth->uu_allocsize = size;
1075                         uth->uu_wqset = (wait_queue_set_t)kalloc(size);
1076                         if (uth->uu_wqset == (wait_queue_set_t)NULL)
1077                                 panic("failed to allocate memory for waitqueue\n");
1078                 }
1079         } else {
1080                 uth->uu_allocsize = size;
1081                 uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize);
1082                 if (uth->uu_wqset == (wait_queue_set_t)NULL)
1083                         panic("failed to allocate memory for waitqueue\n");
1084         }
1085         bzero(uth->uu_wqset, size);
1086         sel->data->wql = (char *)uth->uu_wqset + SIZEOF_WAITQUEUE_SET;
1087         wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
1088
1089 continuation:
1090
1091         if (error) {
1092                 /*
1093                  * We have already cleaned up any state we established,
1094                  * either locally or as a result of selcount().  We don't
1095                  * need to wait_subqueue_unlink_all(), since we haven't set
1096                  * anything at this point.
1097                  */
1098                 return (error);
1099         }
1100
1101         return selprocess(0, SEL_FIRSTPASS);
1102 }
1103
1104 int
1105 selcontinue(int error)
1106 {
1107         return selprocess(error, SEL_SECONDPASS);
1108 }
1109
1110
1111 /*
1112  * selprocess
1113  *
      * Scan-and-wait loop of select().  Entered with SEL_FIRSTPASS from
      * select_nocancel() and with SEL_SECONDPASS from selcontinue().  Only
      * the error code and the pass number are passed in; the process, the
      * system call arguments, the return value area and the per-thread
      * select state are all re-read from the current thread.
      *
1114  * Parameters:  error                   The error code from our caller
1115  *              sel_pass                The pass we are on
      *
      * Returns:     0                       Success; EWOULDBLOCK is also
      *                                      reported as 0
      *              EINTR                   ERESTART is reported as EINTR
      *                                      (select is not restarted after
      *                                      signals)
      *      selscan:???
      *      tsleep1:???
      *      copyout:???
1116  */
1117 int
1118 selprocess(int error, int sel_pass)
1119 {
1120         int ncoll;
1121         u_int ni, nw;
1122         thread_t th_act;
1123         struct uthread  *uth;
1124         struct proc *p;
1125         struct select_args *uap;
1126         int *retval;
1127         struct _select *sel;
1128         int unwind = 1;
1129         int prepost = 0;
1130         int somewakeup = 0;
1131         int doretry = 0;
1132         wait_result_t wait_result;
1133
             /* Recover the call context from the current thread. */
1134         p = current_proc();
1135         th_act = current_thread();
1136         uap = (struct select_args *)get_bsduthreadarg(th_act);
1137         retval = (int *)get_bsduthreadrval(th_act);
1138         uth = get_bsdthread_info(th_act);
1139         sel = &uth->uu_select;
1140
             /*
              * 'unwind' decides whether the done: path calls
              * wait_subqueue_unlink_all() and seldrop().  Both are skipped
              * when we are entered on the first pass with an error, or when
              * no descriptors were counted for this select.
              */
1141         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1142                         unwind = 0;
1143         if (sel->data->count == 0)
1144                         unwind = 0;
1145 retry:
1146         if (error != 0) {
1147                 sel_pass = SEL_FIRSTPASS;       /* Reset for seldrop */
1148                 goto done;
1149         }
1150
1151         ncoll = nselcoll;       /* NOTE(review): ncoll is never read in this function */
1152         OSBitOrAtomic(P_SELECT, &p->p_flag);
1153         /* skip scans if the select is just for timeouts */
1154         if (sel->data->count) {
1155                 /*
1156                  * Clear out any dangling refs from prior calls; technically
1157                  * there should not be any.
1158                  */
1159                 if (sel_pass == SEL_FIRSTPASS)
1160                         wait_queue_sub_clearrefs(uth->uu_wqset);
1161
1162                 error = selscan(p, sel, uap->nd, retval, sel_pass, (wait_queue_sub_t)uth->uu_wqset);
                     /* finished on a scan error or as soon as any descriptor is ready */
1163                 if (error || *retval) {
1164                         goto done;
1165                 }
1166                 if (prepost) {
1167                         /* We can be woken up and then discover that someone
1168                         * else already read the data; select again if time permits.
1169                         */
1170                         prepost = 0;
1171                         doretry = 1;
1172                 }
1173                 if (somewakeup) {
1174                         somewakeup = 0;
1175                         doretry = 1;
1176                 }
1177         }
1178
             /* With a timeout supplied, stop once the deadline has been reached. */
1179         if (uap->tv) {
1180                 uint64_t        now;
1181
1182                 clock_get_uptime(&now);
1183                 if (now >= sel->data->abstime)
1184                         goto done;
1185         }
1186
1187         if (doretry) {
1188                 /* cleanup obits and try again */
1189                 doretry = 0;
1190                 sel_pass = SEL_FIRSTPASS;
1191                 goto retry;
1192         }
1193
1194         /*
1195          * To effect a poll, the timeout argument should be
1196          * non-nil, pointing to a zero-valued timeval structure.
1197          */
1198         if (uap->tv && sel->data->abstime == 0) {
1199                 goto done;
1200         }
1201
1202         /* No spurious wakeups due to colls,no need to check for them */
             /*
              * A second pass that found nothing, or a P_SELECT flag that is no
              * longer set, restarts the scan as a first pass instead of
              * going to sleep.
              */
1203          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1204                 sel_pass = SEL_FIRSTPASS;
1205                 goto retry;
1206         }
1207
1208         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1209
1210         /* if the select is just for timeout skip check */
             /*
              * sel_pass cannot be SEL_SECONDPASS here (the check above already
              * jumped to retry), so this panic serves as an assertion.
              */
1211         if (sel->data->count &&(sel_pass == SEL_SECONDPASS))
1212                 panic("selprocess: 2nd pass assertwaiting");
1213
1214         /* Wait Queue Subordinate has waitqueue as first element */
1215         wait_result = wait_queue_assert_wait_with_leeway((wait_queue_t)uth->uu_wqset,
1216                                              NULL, THREAD_ABORTSAFE,
1217                                              TIMEOUT_URGENCY_USER_NORMAL, sel->data->abstime, 0);
1218         if (wait_result != THREAD_AWAKENED) {
1219                 /* there are no preposted events */
                     /*
                      * selcontinue is handed to tsleep1() as its last argument.
                      * NOTE(review): presumably a thread that really blocks resumes
                      * in selcontinue() instead of returning here - confirm against
                      * tsleep1().
                      */
1220                 error = tsleep1(NULL, PSOCK | PCATCH,
1221                                 "select", 0, selcontinue);
1222         } else  {
1223                 prepost = 1;
1224                 error = 0;
1225         }
1226
             /* Woken without error: rescan as a second pass. */
1227         if (error == 0) {
1228                 sel_pass = SEL_SECONDPASS;
1229                 if (!prepost)
1230                         somewakeup = 1;
1231                 goto retry;
1232         }
1233 done:
             /* Undo the wait queue links and drop the per-descriptor references. */
1234         if (unwind) {
1235                 wait_subqueue_unlink_all(uth->uu_wqset);
1236                 seldrop(p, sel->ibits, uap->nd);
1237         }
1238         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1239         /* select is not restarted after signals... */
1240         if (error == ERESTART)
1241                 error = EINTR;
1242         if (error == EWOULDBLOCK)
1243                 error = 0;
             /* nw: words per descriptor set; ni: bytes per descriptor set */
1244         nw = howmany(uap->nd, NFDBITS);
1245         ni = nw * sizeof(fd_mask);
1246
             /*
              * Copy result set x (0 = in, 1 = ou, 2 = ex) back to the caller if a
              * pointer was supplied for it.  The macro relies on 'error2' being
              * declared in the block that expands it; a failing copyout()
              * replaces 'error'.
              */
1247 #define putbits(name, x) \
1248         do { \
1249                 if (uap->name && (error2 = \
1250                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1251                         error = error2; \
1252         } while (0)
1253
1254         if (error == 0) {
1255                 int error2;
1256
1257                 putbits(in, 0);
1258                 putbits(ou, 1);
1259                 putbits(ex, 2);
1260 #undef putbits
1261         }
1262         return(error);
1263 }
1264
1265
1266 /*
1267  * selscan
1268  *
1269  * Parameters:  p                       Process performing the select
1270  *              sel                     The per-thread select context structure
1271  *              nfd                     The number of file descriptors to scan
1272  *              retval                  The per thread system call return area
1273  *              sel_pass                Which pass this is; allowed values are
1274  *                                              SEL_FIRSTPASS and SEL_SECONDPASS
1275  *              wqsub                   The per thread wait queue set
1276  *
1277  * Returns:     0                       Success
1278  *              EIO                     Invalid p->p_fd field XXX Obsolete?
1279  *              EBADF                   One of the files in the bit vector is
1280  *                                              invalid.
1281  */
1282 static int
1283 selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval,
1284         int sel_pass, wait_queue_sub_t wqsub)
1285 {
1286         struct filedesc *fdp = p->p_fd;
1287         int msk, i, j, fd;
1288         u_int32_t bits;
1289         struct fileproc *fp;
1290         int n = 0;              /* count of bits */
1291         int nc = 0;             /* bit vector offset (nc'th bit) */
1292         static int flag[3] = { FREAD, FWRITE, 0 };
1293         u_int32_t *iptr, *optr;
1294         u_int nw;
1295         u_int32_t *ibits, *obits;
1296         char * wql;
1297         char * wql_ptr;
1298         int count;
1299         struct vfs_context context = *vfs_context_current();
1300
1301         /*
1302          * Problems when reboot; due to MacOSX signal probs
1303          * in Beaker1C ; verify that the p->p_fd is valid
1304          */
1305         if (fdp == NULL) {
1306                 *retval=0;
1307                 return(EIO);
1308         }
1309         ibits = sel->ibits;
1310         obits = sel->obits;
1311         wql = sel->data->wql;
1312
1313         nw = howmany(nfd, NFDBITS);
1314
1315         count = sel->data->count;
1316
1317         nc = 0;
1318         if (count) {
1319                 proc_fdlock(p);
1320                 for (msk = 0; msk < 3; msk++) {
1321                         iptr = (u_int32_t *)&ibits[msk * nw];
1322                         optr = (u_int32_t *)&obits[msk * nw];
1323
1324                         for (i = 0; i < nfd; i += NFDBITS) {
1325                                 bits = iptr[i/NFDBITS];
1326
1327                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1328                                         bits &= ~(1 << j);
1329                                         fp = fdp->fd_ofiles[fd];
1330
1331                                         if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1332                                                 /*
1333                                                  * If we abort because of a bad
1334                                                  * fd, let the caller unwind...
1335                                                  */
1336                                                 proc_fdunlock(p);
1337                                                 return(EBADF);
1338                                         }
1339                                         if (sel_pass == SEL_SECONDPASS) {
1340                                                 wql_ptr = (char *)0;
1341                                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) {
1342                                                         fp->f_flags &= ~FP_INSELECT;
1343                                                         fp->f_waddr = (void *)0;
1344                                                 }
1345                                         } else {
1346                                                 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
1347                                                 if (fp->f_flags & FP_INSELECT) {
1348                                                         /* someone is already in select on this fp */
1349                                                         fp->f_flags |= FP_SELCONFLICT;
1350                                                         wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub);
1351                                                 } else {
1352                                                         fp->f_flags |= FP_INSELECT;
1353                                                         fp->f_waddr = (void *)wqsub;
1354                                                 }
1355                                         }
1356
1357                                         context.vc_ucred = fp->f_cred;
1358
1359                                         /* The select; set the bit, if true */
1360                                         if (fp->f_ops && fp->f_type
1361                                                 && fo_select(fp, flag[msk], wql_ptr, &context)) {
1362                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1363                                                 n++;
1364                                         }
1365                                         nc++;
1366                                 }
1367                         }
1368                 }
1369                 proc_fdunlock(p);
1370         }
1371         *retval = n;
1372         return (0);
1373 }
1374
1375 int poll_callback(struct kqueue *, struct kevent64_s *, void *);        /* callback poll_nocancel() passes to kqueue_scan() */
1376
     /*
      * State that poll_nocancel() passes to kqueue_scan() and that
      * poll_callback() receives as its data argument.  poll_nocancel()
      * allocates it in one buffer with the in-kernel pollfd array placed
      * directly behind it.
      */
1377 struct poll_continue_args {
1378         user_addr_t pca_fds;    /* set from uap->fds by poll_nocancel() */
1379         u_int pca_nfds;         /* number of pollfd entries (uap->nfds) */
1380         u_int pca_rfds;         /* incremented by poll_callback() when an entry's revents first becomes non-zero */
1381 };
1382
1383 int
1384 poll(struct proc *p, struct poll_args *uap, int32_t *retval)
1385 {
1386         __pthread_testcancel(1);
1387         return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
1388 }
1389
1390
1391 int
1392 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1393 {
1394         struct poll_continue_args *cont;
1395         struct pollfd *fds;
1396         struct kqueue *kq;
1397         struct timeval atv;
1398         int ncoll, error = 0;
1399         u_int nfds = uap->nfds;
1400         u_int rfds = 0;
1401         u_int i;
1402         size_t ni;
1403
1404         /*
1405          * This is kinda bogus.  We have fd limits, but that is not
1406          * really related to the size of the pollfd array.  Make sure
1407          * we let the process use at least FD_SETSIZE entries and at
1408          * least enough for the current limits.  We want to be reasonably
1409          * safe, but not overly restrictive.
1410          */
1411         if (nfds > OPEN_MAX ||
1412             (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1413                 return (EINVAL);
1414
1415         kq = kqueue_alloc(p);
1416         if (kq == NULL)
1417                 return (EAGAIN);
1418
1419         ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1420         MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1421         if (NULL == cont) {
1422                 error = EAGAIN;
1423                 goto out;
1424         }
1425
1426         fds = (struct pollfd *)&cont[1];
1427         error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1428         if (error)
1429                 goto out;
1430
1431         if (uap->timeout != -1) {
1432                 struct timeval rtv;
1433
1434                 atv.tv_sec = uap->timeout / 1000;
1435                 atv.tv_usec = (uap->timeout % 1000) * 1000;
1436                 if (itimerfix(&atv)) {
1437                         error = EINVAL;
1438                         goto out;
1439                 }
1440                 getmicrouptime(&rtv);
1441                 timevaladd(&atv, &rtv);
1442         } else {
1443                 atv.tv_sec = 0;
1444                 atv.tv_usec = 0;
1445         }
1446
1447         /* JMM - all this P_SELECT stuff is bogus */
1448         ncoll = nselcoll;
1449         OSBitOrAtomic(P_SELECT, &p->p_flag);
1450         for (i = 0; i < nfds; i++) {
1451                 short events = fds[i].events;
1452                 struct kevent64_s kev;
1453                 int kerror = 0;
1454
1455                 /* per spec, ignore fd values below zero */
1456                 if (fds[i].fd < 0) {
1457                         fds[i].revents = 0;
1458                         continue;
1459                 }
1460
1461                 /* convert the poll event into a kqueue kevent */
1462                 kev.ident = fds[i].fd;
1463                 kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
1464                 kev.udata = CAST_USER_ADDR_T(&fds[i]);
1465                 kev.fflags = 0;
1466                 kev.data = 0;
1467                 kev.ext[0] = 0;
1468                 kev.ext[1] = 0;
1469
1470                 /* Handle input events */
1471                 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1472                         kev.filter = EVFILT_READ;
1473                         if (!(events & ( POLLIN | POLLRDNORM )))
1474                                 kev.flags |= EV_OOBAND;
1475                         kerror = kevent_register(kq, &kev, p);
1476                 }
1477
1478                 /* Handle output events */
1479                 if (kerror == 0 &&
1480                     events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
1481                         kev.filter = EVFILT_WRITE;
1482                         kerror = kevent_register(kq, &kev, p);
1483                 }
1484
1485                 /* Handle BSD extension vnode events */
1486                 if (kerror == 0 &&
1487                     events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
1488                         kev.filter = EVFILT_VNODE;
1489                         kev.fflags = 0;
1490                         if (events & POLLEXTEND)
1491                                 kev.fflags |= NOTE_EXTEND;
1492                         if (events & POLLATTRIB)
1493                                 kev.fflags |= NOTE_ATTRIB;
1494                         if (events & POLLNLINK)
1495                                 kev.fflags |= NOTE_LINK;
1496                         if (events & POLLWRITE)
1497                                 kev.fflags |= NOTE_WRITE;
1498                         kerror = kevent_register(kq, &kev, p);
1499                 }
1500
1501                 if (kerror != 0) {
1502                         fds[i].revents = POLLNVAL;
1503                         rfds++;
1504                 } else
1505                         fds[i].revents = 0;
1506         }
1507
1508         /* Did we have any trouble registering? */
1509         if (rfds > 0)
1510                 goto done;
1511
1512         /* scan for, and possibly wait for, the kevents to trigger */
1513         cont->pca_fds = uap->fds;
1514         cont->pca_nfds = nfds;
1515         cont->pca_rfds = rfds;
1516         error = kqueue_scan(kq, poll_callback, NULL, cont, &atv, p);
1517         rfds = cont->pca_rfds;
1518
1519  done:
1520         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1521         /* poll is not restarted after signals... */
1522         if (error == ERESTART)
1523                 error = EINTR;
1524         if (error == EWOULDBLOCK)
1525                 error = 0;
1526         if (error == 0) {
1527                 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1528                 *retval = rfds;
1529         }
1530
1531  out:
1532         if (NULL != cont)
1533                 FREE(cont, M_TEMP);
1534
1535         kqueue_dealloc(kq);
1536         return (error);
1537 }
1538
1539 int
1540 poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
1541 {
1542         struct poll_continue_args *cont = (struct poll_continue_args *)data;
1543         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1544         short prev_revents = fds->revents;
1545         short mask;
1546
1547         /* convert the results back into revents */
1548         if (kevp->flags & EV_EOF)
1549                 fds->revents |= POLLHUP;
1550         if (kevp->flags & EV_ERROR)
1551                 fds->revents |= POLLERR;
1552
1553         switch (kevp->filter) {
1554         case EVFILT_READ:
1555                 if (fds->revents & POLLHUP)
1556                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1557                 else {
1558                         mask = 0;
1559                         if (kevp->data != 0)
1560                                 mask |= (POLLIN | POLLRDNORM );
1561                         if (kevp->flags & EV_OOBAND)
1562                                 mask |= ( POLLPRI | POLLRDBAND );
1563                 }
1564                 fds->revents |= (fds->events & mask);
1565                 break;
1566
1567         case EVFILT_WRITE:
1568                 if (!(fds->revents & POLLHUP))
1569                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1570                 break;
1571
1572         case EVFILT_VNODE:
1573                 if (kevp->fflags & NOTE_EXTEND)
1574                         fds->revents |= (fds->events & POLLEXTEND);
1575                 if (kevp->fflags & NOTE_ATTRIB)
1576                         fds->revents |= (fds->events & POLLATTRIB);
1577                 if (kevp->fflags & NOTE_LINK)
1578                         fds->revents |= (fds->events & POLLNLINK);
1579                 if (kevp->fflags & NOTE_WRITE)
1580                         fds->revents |= (fds->events & POLLWRITE);
1581                 break;
1582         }
1583
1584         if (fds->revents != 0 && prev_revents == 0)
1585                 cont->pca_rfds++;
1586
1587         return 0;
1588 }
1589
1590 int
1591 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1592 {
1593
1594         return (1);
1595 }
1596
1597 /*
1598  * selcount
1599  *
1600  * Count the number of bits set in the input bit vector, and establish an
1601  * outstanding fp->f_iocount for each of the descriptors which will be in
1602  * use in the select operation.
1603  *
1604  * Parameters:  p                       The process doing the select
1605  *              ibits                   The input bit vector
1606  *              nfd                     The number of fd's in the vector
1607  *              countp                  Pointer to where to store the bit count
1608  *
1609  * Returns:     0                       Success
1610  *              EIO                     Bad per process open file table
1611  *              EBADF                   One of the bits in the input bit vector
1612  *                                              references an invalid fd
1613  *
1614  * Implicit:    *countp (modified)      Count of fd's
1615  *
1616  * Notes:       This function is the first pass under the proc_fdlock() that
1617  *              permits us to recognize invalid descriptors in the bit vector;
1618  *              the may, however, not remain valid through the drop and
1619  *              later reacquisition of the proc_fdlock().
1620  */
1621 static int
1622 selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
1623 {
1624         struct filedesc *fdp = p->p_fd;
1625         int msk, i, j, fd;
1626         u_int32_t bits;
1627         struct fileproc *fp;
1628         int n = 0;
1629         u_int32_t *iptr;
1630         u_int nw;
1631         int error=0;
1632         int dropcount;
1633         int need_wakeup = 0;
1634
1635         /*
1636          * Problems when reboot; due to MacOSX signal probs
1637          * in Beaker1C ; verify that the p->p_fd is valid
1638          */
1639         if (fdp == NULL) {
1640                 *countp = 0;
1641                 return(EIO);
1642         }
1643         nw = howmany(nfd, NFDBITS);
1644
1645         proc_fdlock(p);
1646         for (msk = 0; msk < 3; msk++) {
1647                 iptr = (u_int32_t *)&ibits[msk * nw];
1648                 for (i = 0; i < nfd; i += NFDBITS) {
1649                         bits = iptr[i/NFDBITS];
1650                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1651                                 bits &= ~(1 << j);
1652                                 fp = fdp->fd_ofiles[fd];
1653                                 if (fp == NULL ||
1654                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1655                                                 *countp = 0;
1656                                                 error = EBADF;
1657                                                 goto bad;
1658                                 }
1659                                 fp->f_iocount++;
1660                                 n++;
1661                         }
1662                 }
1663         }
1664         proc_fdunlock(p);
1665
1666         *countp = n;
1667         return (0);
1668
1669 bad:
1670         dropcount = 0;
1671
1672         if (n== 0)
1673                 goto out;
1674         /* Ignore error return; it's already EBADF */
1675         (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
1676
1677 out:
1678         proc_fdunlock(p);
1679         if (need_wakeup) {
1680                 wakeup(&p->p_fpdrainwait);
1681         }
1682         return(error);
1683 }
1684
1685
1686 /*
1687  * seldrop_locked
1688  *
1689  * Drop outstanding wait queue references set up during selscan(); drop the
1690  * outstanding per fileproc f_iocount() picked up during the selcount().
1691  *
1692  * Parameters:  p                       Process performing the select
1693  *              ibits                   Input pit bector of fd's
1694  *              nfd                     Number of fd's
1695  *              lim                     Limit to number of vector entries to
1696  *                                              consider, or -1 for "all"
1697  *              inselect                True if
1698  *              need_wakeup             Pointer to flag to set to do a wakeup
1699  *                                      if f_iocont on any descriptor goes to 0
1700  *
1701  * Returns:     0                       Success
1702  *              EBADF                   One or more fds in the bit vector
1703  *                                              were invalid, but the rest
1704  *                                              were successfully dropped
1705  *
1706  * Notes:       An fd make become bad while the proc_fdlock() is not held,
1707  *              if a multithreaded application closes the fd out from under
1708  *              the in progress select.  In this case, we still have to
1709  *              clean up after the set up on the remaining fds.
1710  */
1711 static int
1712 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
1713 {
1714         struct filedesc *fdp = p->p_fd;
1715         int msk, i, j, fd;
1716         u_int32_t bits;
1717         struct fileproc *fp;
1718         u_int32_t *iptr;
1719         u_int nw;
1720         int error = 0;
1721         int dropcount = 0;
1722         uthread_t uth = get_bsdthread_info(current_thread());
1723
1724         *need_wakeup = 0;
1725
1726         /*
1727          * Problems when reboot; due to MacOSX signal probs
1728          * in Beaker1C ; verify that the p->p_fd is valid
1729          */
1730         if (fdp == NULL) {
1731                 return(EIO);
1732         }
1733
1734         nw = howmany(nfd, NFDBITS);
1735
1736         for (msk = 0; msk < 3; msk++) {
1737                 iptr = (u_int32_t *)&ibits[msk * nw];
1738                 for (i = 0; i < nfd; i += NFDBITS) {
1739                         bits = iptr[i/NFDBITS];
1740                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1741                                 bits &= ~(1 << j);
1742                                 fp = fdp->fd_ofiles[fd];
1743                                 /*
1744                                  * If we've already dropped as many as were
1745                                  * counted/scanned, then we are done.
1746                                  */
1747                                 if ((fromselcount != 0) && (++dropcount > lim))
1748                                         goto done;
1749
1750                                 if (fp == NULL) {
1751                                         /* skip (now) bad fds */
1752                                         error = EBADF;
1753                                         continue;
1754                                 }
1755                                 /*
1756                                  * Only clear the flag if we set it.  We'll
1757                                  * only find that we set it if we had made
1758                                  * at least one [partial] pass through selscan().
1759                                  */
1760                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) {
1761                                         fp->f_flags &= ~FP_INSELECT;
1762                                         fp->f_waddr = (void *)0;
1763                                 }
1764
1765                                 fp->f_iocount--;
1766                                 if (fp->f_iocount < 0)
1767                                         panic("f_iocount overdecrement!");
1768
1769                                 if (fp->f_iocount == 0) {
1770                                         /*
1771                                          * The last iocount is responsible for clearing
1772                                          * selconfict flag - even if we didn't set it -
1773                                          * and is also responsible for waking up anyone
1774                                          * waiting on iocounts to drain.
1775                                          */
1776                                         if (fp->f_flags & FP_SELCONFLICT)
1777                                                 fp->f_flags &= ~FP_SELCONFLICT;
1778                                         if (p->p_fpdrainwait) {
1779                                                 p->p_fpdrainwait = 0;
1780                                                 *need_wakeup = 1;
1781                                         }
1782                                 }
1783                         }
1784                 }
1785         }
1786 done:
1787         return (error);
1788 }
1789
1790
1791 static int
1792 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
1793 {
1794         int error;
1795         int need_wakeup = 0;
1796
1797         proc_fdlock(p);
1798         error =  seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
1799         proc_fdunlock(p);
1800         if (need_wakeup) {
1801                 wakeup(&p->p_fpdrainwait);
1802         }
1803         return (error);
1804 }
1805
1806 /*
1807  * Record a select request.
1808  */
1809 void
1810 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
1811 {
1812         thread_t        cur_act = current_thread();
1813         struct uthread * ut = get_bsdthread_info(cur_act);
1814
1815         /* need to look at collisions */
1816
1817         /*do not record if this is second pass of select */
1818         if(p_wql == (void *)0) {
1819                 return;
1820         }
1821
1822         if ((sip->si_flags & SI_INITED) == 0) {
1823                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
1824                 sip->si_flags |= SI_INITED;
1825                 sip->si_flags &= ~SI_CLEAR;
1826         }
1827
1828         if (sip->si_flags & SI_RECORDED) {
1829                 sip->si_flags |= SI_COLL;
1830         } else
1831                 sip->si_flags &= ~SI_COLL;
1832
1833         sip->si_flags |= SI_RECORDED;
1834         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqset))
1835                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqset,
1836                                         (wait_queue_link_t)p_wql);
1837
1838         return;
1839 }
1840
1841 void
1842 selwakeup(struct selinfo *sip)
1843 {
1844
1845         if ((sip->si_flags & SI_INITED) == 0) {
1846                 return;
1847         }
1848
1849         if (sip->si_flags & SI_COLL) {
1850                 nselcoll++;
1851                 sip->si_flags &= ~SI_COLL;
1852 #if 0
1853                 /* will not  support */
1854                 //wakeup((caddr_t)&selwait);
1855 #endif
1856         }
1857
1858         if (sip->si_flags & SI_RECORDED) {
1859                 wait_queue_wakeup_all(&sip->si_wait_queue, NULL, THREAD_AWAKENED);
1860                 sip->si_flags &= ~SI_RECORDED;
1861         }
1862
1863 }
1864
1865 void
1866 selthreadclear(struct selinfo *sip)
1867 {
1868
1869         if ((sip->si_flags & SI_INITED) == 0) {
1870                 return;
1871         }
1872         if (sip->si_flags & SI_RECORDED) {
1873                         selwakeup(sip);
1874                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
1875         }
1876         sip->si_flags |= SI_CLEAR;
1877         wait_queue_unlink_all(&sip->si_wait_queue);
1878 }
1879
1880
1881
1882
/*
 * Trace codes handed to KERNEL_DEBUG() by the event routines below.
 * Each sub-code is listed next to the full code that MISCDBG_CODE()
 * builds from it in the DBG_EVENT class.
 */
#define DBG_POST                0x10
#define DBG_MISC_POST           MISCDBG_CODE(DBG_EVENT, DBG_POST)

#define DBG_WATCH               0x11
#define DBG_MISC_WATCH          MISCDBG_CODE(DBG_EVENT, DBG_WATCH)

#define DBG_WAIT                0x12
#define DBG_MISC_WAIT           MISCDBG_CODE(DBG_EVENT, DBG_WAIT)

#define DBG_MOD                 0x13
#define DBG_MISC_MOD            MISCDBG_CODE(DBG_EVENT, DBG_MOD)

#define DBG_EWAKEUP             0x14
#define DBG_MISC_EWAKEUP        MISCDBG_CODE(DBG_EVENT, DBG_EWAKEUP)

#define DBG_ENQUEUE             0x15
#define DBG_MISC_ENQUEUE        MISCDBG_CODE(DBG_EVENT, DBG_ENQUEUE)

#define DBG_DEQUEUE             0x16
#define DBG_MISC_DEQUEUE        MISCDBG_CODE(DBG_EVENT, DBG_DEQUEUE)
1898
1899
/*
 * EVPROCDEQUE(p, evq)
 *
 * With the proc lock of 'p' held for the duration, remove 'evq' from
 * p's p_evlist if it is marked EV_QUEUED and clear that flag.
 *
 * Both arguments are expanded more than once, so pass expressions
 * without side effects.  They are parenthesized in the expansion so
 * that compound expressions bind as intended.
 *
 * NOTE: the expansion ends with its own ';' (kept so existing call
 * sites are unaffected); an invocation therefore cannot be used as the
 * unbraced body of an 'if' that is followed by an 'else'.
 */
#define EVPROCDEQUE(p, evq)     do {                            \
        proc_lock(p);                                           \
        if ((evq)->ee_flags & EV_QUEUED) {                      \
                TAILQ_REMOVE(&(p)->p_evlist, (evq), ee_plist);  \
                (evq)->ee_flags &= ~EV_QUEUED;                  \
        }                                                       \
        proc_unlock(p);                                         \
} while (0);
1908
1909
1910 /*
1911  * called upon socket close. deque and free all events for
1912  * the socket...  socket must be locked by caller.
1913  */
1914 void
1915 evsofree(struct socket *sp)
1916 {
1917         struct eventqelt *evq, *next;
1918         proc_t  p;
1919
1920         if (sp == NULL)
1921                 return;
1922
1923         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
1924                 next = evq->ee_slist.tqe_next;
1925                 p = evq->ee_proc;
1926
1927                 if (evq->ee_flags & EV_QUEUED) {
1928                         EVPROCDEQUE(p, evq);
1929                 }
1930                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
1931                 FREE(evq, M_TEMP);
1932         }
1933 }
1934
1935
1936 /*
1937  * called upon pipe close. deque and free all events for
1938  * the pipe... pipe must be locked by caller
1939  */
1940 void
1941 evpipefree(struct pipe *cpipe)
1942 {
1943         struct eventqelt *evq, *next;
1944         proc_t  p;
1945
1946         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
1947                 next = evq->ee_slist.tqe_next;
1948                 p = evq->ee_proc;
1949
1950                 EVPROCDEQUE(p, evq);
1951
1952                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
1953                 FREE(evq, M_TEMP);
1954         }
1955 }
1956
1957
1958 /*
1959  * enqueue this event if it's not already queued. wakeup
1960  * the proc if we do queue this event to it...
1961  * entered with proc lock held... we drop it before
1962  * doing the wakeup and return in that state
1963  */
1964 static void
1965 evprocenque(struct eventqelt *evq)
1966 {
1967         proc_t  p;
1968
1969         assert(evq);
1970         p = evq->ee_proc;
1971
1972         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
1973
1974         proc_lock(p);
1975
1976         if (evq->ee_flags & EV_QUEUED) {
1977                 proc_unlock(p);
1978
1979                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1980                 return;
1981         }
1982         evq->ee_flags |= EV_QUEUED;
1983
1984         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
1985
1986         proc_unlock(p);
1987
1988         wakeup(&p->p_evlist);
1989
1990         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1991 }
1992
1993
1994 /*
1995  * pipe lock must be taken by the caller
1996  */
1997 void
1998 postpipeevent(struct pipe *pipep, int event)
1999 {
2000         int     mask;
2001         struct eventqelt *evq;
2002
2003         if (pipep == NULL)
2004                 return;
2005         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
2006
2007         for (evq = pipep->pipe_evlist.tqh_first;
2008              evq != NULL; evq = evq->ee_slist.tqe_next) {
2009
2010                 if (evq->ee_eventmask == 0)
2011                         continue;
2012                 mask = 0;
2013
2014                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
2015
2016                 case EV_RWBYTES:
2017                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
2018                           mask |= EV_RE;
2019                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
2020                   }
2021                   if ((evq->ee_eventmask & EV_WR) &&
2022                       (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
2023
2024                           if (pipep->pipe_state & PIPE_EOF) {
2025                                   mask |= EV_WR|EV_RESET;
2026                                   break;
2027                           }
2028                           mask |= EV_WR;
2029                           evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
2030                   }
2031                   break;
2032
2033                 case EV_WCLOSED:
2034                 case EV_RCLOSED:
2035                   if ((evq->ee_eventmask & EV_RE)) {
2036                           mask |= EV_RE|EV_RCLOSED;
2037                   }
2038                   if ((evq->ee_eventmask & EV_WR)) {
2039                           mask |= EV_WR|EV_WCLOSED;
2040                   }
2041                   break;
2042
2043                 default:
2044                   return;
2045                 }
2046                 if (mask) {
2047                         /*
2048                          * disarm... postevents are nops until this event is 'read' via
2049                          * waitevent and then re-armed via modwatch
2050                          */
2051                         evq->ee_eventmask = 0;
2052
2053                         /*
2054                          * since events are disarmed until after the waitevent
2055                          * the ee_req.er_xxxx fields can't change once we've
2056                          * inserted this event into the proc queue...
2057                          * therefore, the waitevent will see a 'consistent'
2058                          * snapshot of the event, even though it won't hold
2059                          * the pipe lock, and we're updating the event outside
2060                          * of the proc lock, which it will hold
2061                          */
2062                         evq->ee_req.er_eventbits |= mask;
2063
2064                         KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
2065
2066                         evprocenque(evq);
2067                 }
2068         }
2069         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2070 }
2071
2072 #if SOCKETS
/*
 * postevent_conn_dead
 *
 * Helper for postevent() (for AFP/OT purposes; may go away in future).
 * Returns non-zero when 'sp' is a PF_INET/PF_INET6 TCP socket whose
 * so_error is ECONNREFUSED or ECONNRESET and whose protocol state is
 * gone: no pcb, inpcb in INPCB_STATE_DEAD, no tcpcb, or tcpcb in
 * TCPS_CLOSED.
 */
static int
postevent_conn_dead(struct socket *sp)
{
        struct tcpcb *tp;

        if (SOCK_DOM(sp) != PF_INET && SOCK_DOM(sp) != PF_INET6)
                return (0);
        if (SOCK_PROTO(sp) != IPPROTO_TCP)
                return (0);
        if (sp->so_error != ECONNREFUSED && sp->so_error != ECONNRESET)
                return (0);

        return (sp->so_pcb == NULL ||
            sotoinpcb(sp)->inp_state == INPCB_STATE_DEAD ||
            (tp = sototcpcb(sp)) == NULL ||
            tp->t_state == TCPS_CLOSED);
}

/*
 * postevent
 *
 * Given either a sockbuf or a socket, run down the socket's event list
 * and queue the ready events found.  When 'sb' is non-NULL the socket
 * is taken from sb->sb_so and 'sp' is ignored.
 *
 * The socket must be locked by the caller.
 *
 * Readiness rules applied below:
 *
 *   ready for reading
 *      - byte cnt >= receive low water mark
 *      - read-half of conn closed
 *      - conn pending for listening sock
 *      - socket error pending
 *
 *   ready for writing
 *      - byte cnt avail >= send low water mark
 *      - write half of conn closed
 *      - socket error pending
 *      - non-blocking conn completed successfully
 *
 *   exception pending
 *      - out of band data
 *      - sock at out of band mark
 */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
        struct eventqelt *ev;
        int     ready;

        if (sb)
                sp = sb->sb_so;
        if (sp == NULL)
                return;

        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

        for (ev = sp->so_evlist.tqh_first;
             ev != NULL; ev = ev->ee_slist.tqe_next) {

                /* disarmed watches are skipped until re-armed */
                if (ev->ee_eventmask == 0)
                        continue;
                ready = 0;

                switch (event & EV_DMASK) {

                case EV_OOB:
                        if ((ev->ee_eventmask & EV_EX) &&
                            (sp->so_oobmark || (sp->so_state & SS_RCVATMARK)))
                                ready |= EV_EX|EV_OOB;
                        break;

                case EV_RWBYTES|EV_OOB:
                        if ((ev->ee_eventmask & EV_EX) &&
                            (sp->so_oobmark || (sp->so_state & SS_RCVATMARK)))
                                ready |= EV_EX|EV_OOB;
                        /*
                         * fall into the next case
                         */
                case EV_RWBYTES:
                        if ((ev->ee_eventmask & EV_RE) && soreadable(sp)) {
                                if (postevent_conn_dead(sp)) {
                                        ready |= EV_RE|EV_RESET;
                                        break;
                                }
                                ready |= EV_RE;
                                ev->ee_req.er_rcnt = sp->so_rcv.sb_cc;

                                if (sp->so_state & SS_CANTRCVMORE) {
                                        ready |= EV_FIN;
                                        break;
                                }
                        }
                        if ((ev->ee_eventmask & EV_WR) && sowriteable(sp)) {
                                if (postevent_conn_dead(sp)) {
                                        ready |= EV_WR|EV_RESET;
                                        break;
                                }
                                ready |= EV_WR;
                                ev->ee_req.er_wcnt = sbspace(&sp->so_snd);
                        }
                        break;

                case EV_RCONN:
                        if (ev->ee_eventmask & EV_RE) {
                                ready |= EV_RE|EV_RCONN;
                                /* incl this one */
                                ev->ee_req.er_rcnt = sp->so_qlen + 1;
                        }
                        break;

                case EV_WCONN:
                        if (ev->ee_eventmask & EV_WR)
                                ready |= EV_WR|EV_WCONN;
                        break;

                case EV_RCLOSED:
                        if (ev->ee_eventmask & EV_RE)
                                ready |= EV_RE|EV_RCLOSED;
                        break;

                case EV_WCLOSED:
                        if (ev->ee_eventmask & EV_WR)
                                ready |= EV_WR|EV_WCLOSED;
                        break;

                case EV_FIN:
                        if (ev->ee_eventmask & EV_RE)
                                ready |= EV_RE|EV_FIN;
                        break;

                case EV_RESET:
                case EV_TIMEOUT:
                        if (ev->ee_eventmask & EV_RE)
                                ready |= EV_RE | event;
                        if (ev->ee_eventmask & EV_WR)
                                ready |= EV_WR | event;
                        break;

                default:
                        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
                        return;
                } /* switch */

                KERNEL_DEBUG(DBG_MISC_POST, (int)ev, ev->ee_eventmask, ev->ee_req.er_eventbits, ready, 0);

                if (ready == 0)
                        continue;

                /*
                 * disarm... postevents are nops until this event is 'read' via
                 * waitevent and then re-armed via modwatch
                 */
                ev->ee_eventmask = 0;

                /*
                 * since events are disarmed until after the waitevent
                 * the ee_req.er_xxxx fields can't change once we've
                 * inserted this event into the proc queue...
                 * since waitevent can't see this event until we
                 * enqueue it, waitevent will see a 'consistent'
                 * snapshot of the event, even though it won't hold
                 * the socket lock, and we're updating the event outside
                 * of the proc lock, which it will hold
                 */
                ev->ee_req.er_eventbits |= ready;

                evprocenque(ev);
        }
        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}
2251 #endif /* SOCKETS */
2252
2253
2254 /*
2255  * watchevent system call. user passes us an event to watch
2256  * for. we malloc an event object, initialize it, and queue
2257  * it to the open socket. when the event occurs, postevent()
2258  * will enque it back to our proc where we can retrieve it
2259  * via waitevent().
2260  *
2261  * should this prevent duplicate events on same socket?
2262  *
2263  * Returns:
2264  *              ENOMEM                  No memory for operation
2265  *      copyin:EFAULT
2266  */
2267 int
2268 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2269 {
2270         struct eventqelt *evq = (struct eventqelt *)0;
2271         struct eventqelt *np = NULL;
2272         struct eventreq64 *erp;
2273         struct fileproc *fp = NULL;
2274         int error;
2275
2276         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2277
2278         // get a qelt and fill with users req
2279         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2280
2281         if (evq == NULL)
2282                 return (ENOMEM);
2283         erp = &evq->ee_req;
2284
2285         // get users request pkt
2286
2287         if (IS_64BIT_PROCESS(p)) {
2288                 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2289         } else {
2290                 struct eventreq32 er32;
2291
2292                 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2293                 if (error == 0) {
2294                        /*
2295                         * the user only passes in the
2296                         * er_type, er_handle and er_data...
2297                         * the other fields are initialized
2298                         * below, so don't bother to copy
2299                         */
2300                         erp->er_type = er32.er_type;
2301                         erp->er_handle = er32.er_handle;
2302                         erp->er_data = (user_addr_t)er32.er_data;
2303                 }
2304         }
2305         if (error) {
2306                 FREE(evq, M_TEMP);
2307                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2308
2309                 return(error);
2310         }
2311         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2312
2313         // validate, freeing qelt if errors
2314         error = 0;
2315         proc_fdlock(p);
2316
2317         if (erp->er_type != EV_FD) {
2318                 error = EINVAL;
2319         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2320                 error = EBADF;
2321 #if SOCKETS
2322         } else if (fp->f_type == DTYPE_SOCKET) {
2323                 socket_lock((struct socket *)fp->f_data, 1);
2324                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2325 #endif /* SOCKETS */
2326         } else if (fp->f_type == DTYPE_PIPE) {
2327                 PIPE_LOCK((struct pipe *)fp->f_data);
2328                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2329         } else {
2330                 fp_drop(p, erp->er_handle, fp, 1);
2331                 error = EINVAL;
2332         }
2333         proc_fdunlock(p);
2334
2335         if (error) {
2336                 FREE(evq, M_TEMP);
2337
2338                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2339                 return(error);
2340         }
2341
2342         /*
2343          * only allow one watch per file per proc
2344          */
2345         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2346                 if (np->ee_proc == p) {
2347 #if SOCKETS
2348                         if (fp->f_type == DTYPE_SOCKET)
2349                                 socket_unlock((struct socket *)fp->f_data, 1);
2350                         else
2351 #endif /* SOCKETS */
2352                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2353                         fp_drop(p, erp->er_handle, fp, 0);
2354                         FREE(evq, M_TEMP);
2355
2356                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2357                         return(EINVAL);
2358                 }
2359         }
2360         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2361         evq->ee_proc = p;
2362         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2363         evq->ee_flags = 0;
2364
2365 #if SOCKETS
2366         if (fp->f_type == DTYPE_SOCKET) {
2367                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2368                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2369
2370                 socket_unlock((struct socket *)fp->f_data, 1);
2371         } else
2372 #endif /* SOCKETS */
2373         {
2374                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2375                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2376
2377                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2378         }
2379         fp_drop_event(p, erp->er_handle, fp);
2380
2381         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2382         return(0);
2383 }
2384
2385
2386
2387 /*
2388  * waitevent system call.
2389  * grabs the next waiting event for this proc and returns
2390  * it. if no events, user can request to sleep with timeout
2391  * or without or poll mode
2392  *    ((tv != NULL && interval == 0) || tv == -1)
2393  */
2394 int
2395 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2396 {
2397         int error = 0;
2398         struct eventqelt *evq;
2399         struct eventreq64 *erp;
2400         uint64_t abstime, interval;
2401         boolean_t fast_poll = FALSE;
2402         union {
2403                 struct eventreq64 er64;
2404                 struct eventreq32 er32;
2405         } uer;
2406
2407         interval = 0;
2408
2409         if (uap->tv) {
2410                 struct timeval atv;
2411                 /*
2412                  * check for fast poll method
2413                  */
2414                 if (IS_64BIT_PROCESS(p)) {
2415                         if (uap->tv == (user_addr_t)-1)
2416                                 fast_poll = TRUE;
2417                 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2418                         fast_poll = TRUE;
2419
2420                 if (fast_poll == TRUE) {
2421                         if (p->p_evlist.tqh_first == NULL) {
2422                                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2423                                 /*
2424                                  * poll failed
2425                                  */
2426                                 *retval = 1;
2427                                 return (0);
2428                         }
2429                         proc_lock(p);
2430                         goto retry;
2431                 }
2432                 if (IS_64BIT_PROCESS(p)) {
2433                         struct user64_timeval atv64;
2434                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
2435                         /* Loses resolution - assume timeout < 68 years */
2436                         atv.tv_sec = atv64.tv_sec;
2437                         atv.tv_usec = atv64.tv_usec;
2438                 } else {
2439                         struct user32_timeval atv32;
2440                         error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
2441                         atv.tv_sec = atv32.tv_sec;
2442                         atv.tv_usec = atv32.tv_usec;
2443                 }
2444
2445                 if (error)
2446                         return(error);
2447                 if (itimerfix(&atv)) {
2448                         error = EINVAL;
2449                         return(error);
2450                 }
2451                 interval = tvtoabstime(&atv);
2452         }
2453         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2454
2455         proc_lock(p);
2456 retry:
2457         if ((evq = p->p_evlist.tqh_first) != NULL) {
2458                 /*
2459                  * found one... make a local copy while it's still on the queue
2460                  * to prevent it from changing while in the midst of copying
2461                  * don't want to hold the proc lock across a copyout because
2462                  * it might block on a page fault at the target in user space
2463                  */
2464                 erp = &evq->ee_req;
2465
2466                 if (IS_64BIT_PROCESS(p))
2467                         bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2468                 else {
2469                         uer.er32.er_type  = erp->er_type;
2470                         uer.er32.er_handle  = erp->er_handle;
2471                         uer.er32.er_data  = (uint32_t)erp->er_data;
2472                         uer.er32.er_ecnt  = erp->er_ecnt;
2473                         uer.er32.er_rcnt  = erp->er_rcnt;
2474                         uer.er32.er_wcnt  = erp->er_wcnt;
2475                         uer.er32.er_eventbits = erp->er_eventbits;
2476                 }
2477                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2478
2479                 evq->ee_flags &= ~EV_QUEUED;
2480
2481                 proc_unlock(p);
2482
2483                 if (IS_64BIT_PROCESS(p))
2484                         error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2485                 else
2486                         error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2487
2488                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2489                              evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2490                 return (error);
2491         }
2492         else {
2493                 if (uap->tv && interval == 0) {
2494                         proc_unlock(p);
2495                         *retval = 1;  // poll failed
2496
2497                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2498                         return (error);
2499                 }
2500                 if (interval != 0)
2501                         clock_absolutetime_interval_to_deadline(interval, &abstime);
2502                 else
2503                         abstime = 0;
2504
2505                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2506
2507                 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2508
2509                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2510
2511                 if (error == 0)
2512                         goto retry;
2513                 if (error == ERESTART)
2514                         error = EINTR;
2515                 if (error == EWOULDBLOCK) {
2516                         *retval = 1;
2517                         error = 0;
2518                 }
2519         }
2520         proc_unlock(p);
2521
2522         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2523         return (error);
2524 }
2525
2526
2527 /*
2528  * modwatch system call. user passes in event to modify.
2529  * if we find it we reset the event bits and que/deque event
2530  * it needed.
2531  */
2532 int
2533 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2534 {
2535         struct eventreq64 er;
2536         struct eventreq64 *erp = &er;
2537         struct eventqelt *evq = NULL;   /* protected by error return */
2538         int error;
2539         struct fileproc *fp;
2540         int flag;
2541
2542         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2543
2544         /*
2545          * get user's request pkt
2546          * just need the er_type and er_handle which sit above the
2547          * problematic er_data (32/64 issue)... so only copy in
2548          * those 2 fields
2549          */
2550         if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2551                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2552                 return(error);
2553         }
2554         proc_fdlock(p);
2555
2556         if (erp->er_type != EV_FD) {
2557                 error = EINVAL;
2558         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2559                 error = EBADF;
2560 #if SOCKETS
2561         } else if (fp->f_type == DTYPE_SOCKET) {
2562                 socket_lock((struct socket *)fp->f_data, 1);
2563                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2564 #endif /* SOCKETS */
2565         } else if (fp->f_type == DTYPE_PIPE) {
2566                 PIPE_LOCK((struct pipe *)fp->f_data);
2567                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2568         } else {
2569                 fp_drop(p, erp->er_handle, fp, 1);
2570                 error = EINVAL;
2571         }
2572
2573         if (error) {
2574                 proc_fdunlock(p);
2575                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2576                 return(error);
2577         }
2578
2579         if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2580                 fp->f_flags &= ~FP_WAITEVENT;
2581         }
2582         proc_fdunlock(p);
2583
2584         // locate event if possible
2585         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2586                 if (evq->ee_proc == p)
2587                         break;
2588         }
2589         if (evq == NULL) {
2590 #if SOCKETS
2591                 if (fp->f_type == DTYPE_SOCKET)
2592                         socket_unlock((struct socket *)fp->f_data, 1);
2593                 else
2594 #endif /* SOCKETS */
2595                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2596                 fp_drop(p, erp->er_handle, fp, 0);
2597                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2598                 return(EINVAL);
2599         }
2600         KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2601
2602         if (uap->u_eventmask == EV_RM) {
2603                 EVPROCDEQUE(p, evq);
2604
2605 #if SOCKETS
2606                 if (fp->f_type == DTYPE_SOCKET) {
2607                         TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2608                         socket_unlock((struct socket *)fp->f_data, 1);
2609                 } else
2610 #endif /* SOCKETS */
2611                 {
2612                         TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2613                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2614                 }
2615                 fp_drop(p, erp->er_handle, fp, 0);
2616                 FREE(evq, M_TEMP);
2617                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2618                 return(0);
2619         }
2620         switch (uap->u_eventmask & EV_MASK) {
2621
2622         case 0:
2623                 flag = 0;
2624                 break;
2625
2626         case EV_RE:
2627         case EV_WR:
2628         case EV_RE|EV_WR:
2629                 flag = EV_RWBYTES;
2630                 break;
2631
2632         case EV_EX:
2633                 flag = EV_OOB;
2634                 break;
2635
2636         case EV_EX|EV_RE:
2637         case EV_EX|EV_WR:
2638         case EV_EX|EV_RE|EV_WR:
2639                 flag = EV_OOB|EV_RWBYTES;
2640                 break;
2641
2642         default:
2643 #if SOCKETS
2644                 if (fp->f_type == DTYPE_SOCKET)
2645                         socket_unlock((struct socket *)fp->f_data, 1);
2646                 else
2647 #endif /* SOCKETS */
2648                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2649                 fp_drop(p, erp->er_handle, fp, 0);
2650                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2651                 return(EINVAL);
2652         }
2653         /*
2654          * since we're holding the socket/pipe lock, the event
2655          * cannot go from the unqueued state to the queued state
2656          * however, it can go from the queued state to the unqueued state
2657          * since that direction is protected by the proc_lock...
2658          * so do a quick check for EV_QUEUED w/o holding the proc lock
2659          * since by far the common case will be NOT EV_QUEUED, this saves
2660          * us taking the proc_lock the majority of the time
2661          */
2662         if (evq->ee_flags & EV_QUEUED) {
2663                 /*
2664                  * EVPROCDEQUE will recheck the state after it grabs the proc_lock
2665                  */
2666                 EVPROCDEQUE(p, evq);
2667         }
2668         /*
2669          * while the event is off the proc queue and
2670          * we're holding the socket/pipe lock
2671          * it's safe to update these fields...
2672          */
2673         evq->ee_req.er_eventbits = 0;
2674         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2675
2676 #if SOCKETS
2677         if (fp->f_type == DTYPE_SOCKET) {
2678                 postevent((struct socket *)fp->f_data, 0, flag);
2679                 socket_unlock((struct socket *)fp->f_data, 1);
2680         } else
2681 #endif /* SOCKETS */
2682         {
2683                 postpipeevent((struct pipe *)fp->f_data, flag);
2684                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2685         }
2686         fp_drop(p, erp->er_handle, fp, 0);
2687         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
2688         return(0);
2689 }
2690
2691 /* this routine is called from the close of fd with proc_fdlock held */
2692 int
2693 waitevent_close(struct proc *p, struct fileproc *fp)
2694 {
2695         struct eventqelt *evq;
2696
2697
2698         fp->f_flags &= ~FP_WAITEVENT;
2699
2700 #if SOCKETS
2701         if (fp->f_type == DTYPE_SOCKET) {
2702                 socket_lock((struct socket *)fp->f_data, 1);
2703                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2704         } else
2705 #endif /* SOCKETS */
2706         if (fp->f_type == DTYPE_PIPE) {
2707                 PIPE_LOCK((struct pipe *)fp->f_data);
2708                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2709         }
2710         else {
2711                 return(EINVAL);
2712         }
2713         proc_fdunlock(p);
2714
2715
2716         // locate event if possible
2717         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2718                 if (evq->ee_proc == p)
2719                         break;
2720         }
2721         if (evq == NULL) {
2722 #if SOCKETS
2723                 if (fp->f_type == DTYPE_SOCKET)
2724                         socket_unlock((struct socket *)fp->f_data, 1);
2725                 else
2726 #endif /* SOCKETS */
2727                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2728
2729                 proc_fdlock(p);
2730
2731                 return(EINVAL);
2732         }
2733         EVPROCDEQUE(p, evq);
2734
2735 #if SOCKETS
2736         if (fp->f_type == DTYPE_SOCKET) {
2737                 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2738                 socket_unlock((struct socket *)fp->f_data, 1);
2739         } else
2740 #endif /* SOCKETS */
2741         {
2742                 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2743                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2744         }
2745         FREE(evq, M_TEMP);
2746
2747         proc_fdlock(p);
2748
2749         return(0);
2750 }
2751
2752
2753 /*
2754  * gethostuuid
2755  *
2756  * Description: Get the host UUID from IOKit and return it to user space.
2757  *
2758  * Parameters:  uuid_buf                Pointer to buffer to receive UUID
2759  *              timeout                 Timespec for timout
2760  *              spi                             SPI, skip sandbox check (temporary)
2761  *
2762  * Returns:     0                       Success
2763  *              EWOULDBLOCK             Timeout is too short
2764  *              copyout:EFAULT          Bad user buffer
2765  *
2766  * Notes:       A timeout seems redundant, since if it's tolerable to not
2767  *              have a system UUID in hand, then why ask for one?
2768  */
2769 int
2770 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
2771 {
2772         kern_return_t kret;
2773         int error;
2774         mach_timespec_t mach_ts;        /* for IOKit call */
2775         __darwin_uuid_t uuid_kern;      /* for IOKit call */
2776
2777         if (!uap->spi) {
2778 #if 13841988
2779                 uint32_t flags;
2780                 if (temp_debug_13841988 && (0 == proc_get_darwinbgstate(p->task, &flags)) && (flags & PROC_FLAG_IOS_APPLICATION)) {
2781                         printf("Unauthorized access to gethostuuid() by %s(%d)\n", p->p_comm, proc_pid(p));
2782                         return (EPERM);
2783                 }
2784 #else
2785                 /* Perform sandbox check */
2786 #endif
2787         }
2788
2789         /* Convert the 32/64 bit timespec into a mach_timespec_t */
2790         if ( proc_is64bit(p) ) {
2791                 struct user64_timespec ts;
2792                 error = copyin(uap->timeoutp, &ts, sizeof(ts));
2793                 if (error)
2794                         return (error);
2795                 mach_ts.tv_sec = ts.tv_sec;
2796                 mach_ts.tv_nsec = ts.tv_nsec;
2797         } else {
2798                 struct user32_timespec ts;
2799                 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
2800                 if (error)
2801                         return (error);
2802                 mach_ts.tv_sec = ts.tv_sec;
2803                 mach_ts.tv_nsec = ts.tv_nsec;
2804         }
2805
2806         /* Call IOKit with the stack buffer to get the UUID */
2807         kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
2808
2809         /*
2810          * If we get it, copy out the data to the user buffer; note that a
2811          * uuid_t is an array of characters, so this is size invariant for
2812          * 32 vs. 64 bit.
2813          */
2814         if (kret == KERN_SUCCESS) {
2815                 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
2816         } else {
2817                 error = EWOULDBLOCK;
2818         }
2819
2820         return (error);
2821 }
2822
2823 /*
2824  * ledger
2825  *
2826  * Description: Omnibus system call for ledger operations
2827  */
2828 int
2829 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
2830 {
2831 #if !CONFIG_MACF
2832 #pragma unused(p)
2833 #endif
2834         int rval, pid, len, error;
2835 #ifdef LEDGER_DEBUG
2836         struct ledger_limit_args lla;
2837 #endif
2838         task_t task;
2839         proc_t proc;
2840
2841         /* Finish copying in the necessary args before taking the proc lock */
2842         error = 0;
2843         len = 0;
2844         if (args->cmd == LEDGER_ENTRY_INFO)
2845                 error = copyin(args->arg3, (char *)&len, sizeof (len));
2846         else if (args->cmd == LEDGER_TEMPLATE_INFO)
2847                 error = copyin(args->arg2, (char *)&len, sizeof (len));
2848 #ifdef LEDGER_DEBUG
2849         else if (args->cmd == LEDGER_LIMIT)
2850                 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
2851 #endif
2852         if (error)
2853                 return (error);
2854         if (len < 0)
2855                 return (EINVAL);
2856
2857         rval = 0;
2858         if (args->cmd != LEDGER_TEMPLATE_INFO) {
2859                 pid = args->arg1;
2860                 proc = proc_find(pid);
2861                 if (proc == NULL)
2862                         return (ESRCH);
2863
2864 #if CONFIG_MACF
2865                 error = mac_proc_check_ledger(p, proc, args->cmd);
2866                 if (error) {
2867                         proc_rele(proc);
2868                         return (error);
2869                 }
2870 #endif
2871
2872                 task = proc->task;
2873         }
2874
2875         switch (args->cmd) {
2876 #ifdef LEDGER_DEBUG
2877                 case LEDGER_LIMIT: {
2878                         if (!kauth_cred_issuser(kauth_cred_get()))
2879                                 rval = EPERM;
2880                         rval = ledger_limit(task, &lla);
2881                         proc_rele(proc);
2882                         break;
2883                 }
2884 #endif
2885                 case LEDGER_INFO: {
2886                         struct ledger_info info;
2887
2888                         rval = ledger_info(task, &info);
2889                         proc_rele(proc);
2890                         if (rval == 0)
2891                                 rval = copyout(&info, args->arg2,
2892                                     sizeof (info));
2893                         break;
2894                 }
2895
2896                 case LEDGER_ENTRY_INFO: {
2897                         void *buf;
2898                         int sz;
2899
2900                         rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
2901                         proc_rele(proc);
2902                         if ((rval == 0) && (len > 0)) {
2903                                 sz = len * sizeof (struct ledger_entry_info);
2904                                 rval = copyout(buf, args->arg2, sz);
2905                                 kfree(buf, sz);
2906                         }
2907                         if (rval == 0)
2908                                 rval = copyout(&len, args->arg3, sizeof (len));
2909                         break;
2910                 }
2911
2912                 case LEDGER_TEMPLATE_INFO: {
2913                         void *buf;
2914                         int sz;
2915
2916                         rval = ledger_template_info(&buf, &len);
2917                         if ((rval == 0) && (len > 0)) {
2918                                 sz = len * sizeof (struct ledger_template_info);
2919                                 rval = copyout(buf, args->arg1, sz);
2920                                 kfree(buf, sz);
2921                         }
2922                         if (rval == 0)
2923                                 rval = copyout(&len, args->arg2, sizeof (len));
2924                         break;
2925                 }
2926
2927                 default:
2928                         rval = EINVAL;
2929         }
2930
2931         return (rval);
2932 }
2933
#if CONFIG_TELEMETRY
/*
 * telemetry
 *
 * Description: System call entry point for telemetry commands.
 *
 * Parameters:  p                       Unused
 *              args->cmd               Command selector; only
 *                                      TELEMETRY_CMD_TIMER_EVENT is handled
 *              args->deadline          Passed to telemetry_timer_event()
 *              args->interval          Passed to telemetry_timer_event()
 *              args->leeway            Passed to telemetry_timer_event()
 *              retval                  Unused
 *
 * Returns:     EINVAL                  Unrecognized command
 *              telemetry_timer_event:???
 */
int
telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
{
        int result;

        if (args->cmd != TELEMETRY_CMD_TIMER_EVENT) {
                return (EINVAL);
        }

        result = telemetry_timer_event(args->deadline, args->interval, args->leeway);
        return (result);
}
#endif /* CONFIG_TELEMETRY */