bsd/kern/sys_generic.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
  67  */
  68 /*
  69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
  70  * support for mandatory and extensible security protections.  This notice
  71  * is included in support of clause 2.2 (b) of the Apple Public License,
  72  * Version 2.0.
  73  */
  74
  75 #include <sys/param.h>
  76 #include <sys/systm.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/ioctl.h>
  79 #include <sys/file_internal.h>
  80 #include <sys/proc_internal.h>
  81 #include <sys/socketvar.h>
  82 #include <sys/uio_internal.h>
  83 #include <sys/kernel.h>
  84 #include <sys/guarded.h>
  85 #include <sys/stat.h>
  86 #include <sys/malloc.h>
  87 #include <sys/sysproto.h>
  88
  89 #include <sys/mount_internal.h>
  90 #include <sys/protosw.h>
  91 #include <sys/ev.h>
  92 #include <sys/user.h>
  93 #include <sys/kdebug.h>
  94 #include <sys/poll.h>
  95 #include <sys/event.h>
  96 #include <sys/eventvar.h>
  97 #include <sys/proc.h>
  98 #include <sys/kauth.h>
  99
 100 #include <mach/mach_types.h>
 101 #include <kern/kern_types.h>
 102 #include <kern/assert.h>
 103 #include <kern/kalloc.h>
 104 #include <kern/thread.h>
 105 #include <kern/clock.h>
 106 #include <kern/ledger.h>
 107 #include <kern/task.h>
 108 #include <kern/telemetry.h>
 109 #include <kern/waitq.h>
 110 #include <kern/sched_prim.h>
 111
 112 #include <sys/mbuf.h>
 113 #include <sys/domain.h>
 114 #include <sys/socket.h>
 115 #include <sys/socketvar.h>
 116 #include <sys/errno.h>
 117 #include <sys/syscall.h>
 118 #include <sys/pipe.h>
 119
 120 #include <security/audit/audit.h>
 121
 122 #include <net/if.h>
 123 #include <net/route.h>
 124
 125 #include <netinet/in.h>
 126 #include <netinet/in_systm.h>
 127 #include <netinet/ip.h>
 128 #include <netinet/in_pcb.h>
 129 #include <netinet/ip_var.h>
 130 #include <netinet/ip6.h>
 131 #include <netinet/tcp.h>
 132 #include <netinet/tcp_fsm.h>
 133 #include <netinet/tcp_seq.h>
 134 #include <netinet/tcp_timer.h>
 135 #include <netinet/tcp_var.h>
 136 #include <netinet/tcpip.h>
 137 #include <netinet/tcp_debug.h>
 138 /* for wait queue based select */
 139 #include <kern/waitq.h>
 140 #include <kern/kalloc.h>
 141 #include <sys/vnode_internal.h>
 142
 143 /* XXX these event/platform prototypes should live in a shared header file */
 144 void evsofree(struct socket *);
 145 void evpipefree(struct pipe *);
 146 void postpipeevent(struct pipe *, int);
 147 void postevent(struct socket *, struct sockbuf *, int);
 148 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
 149
     /* Vectored I/O workers used by readv_nocancel()/writev_nocancel(); defined later in this file. */
 150 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 151 int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval);
 152
     /*
      * Helpers shared by the read/pread/write/pwrite paths; defined later in
      * this file.  NOTE(review): the last parameter of preparefileread() is
      * named check_for_vnode here but check_for_pread in the definition.
      */
 153 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
 154                                                                    user_addr_t bufp, user_size_t nbyte,
 155                                                                    off_t offset, int flags, user_ssize_t *retval);
 156 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
 157                                                                         user_addr_t bufp, user_size_t nbyte,
 158                                                                         off_t offset, int flags, user_ssize_t *retval);
 159 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
 160 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
 161
 162
 163 /* Conflict wait queue for when selects collide (opaque type) */
 164 struct waitq select_conflict_queue;
 165
 166 /*
 167  * Init routine called from bsd_init.c
 168  */
 169 void select_waitq_init(void);
 170 void
 171 select_waitq_init(void)
 172 {
 173         waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
 174 }
 175
 176 #define f_flag f_fglob->fg_flag	/* shorthand: fp->f_flag reaches through fp->f_fglob (tested against FREAD/FWRITE etc. below) */
 177 #define f_type f_fglob->fg_ops->fo_type	/* descriptor type taken from the ops vector (compared with DTYPE_* below) */
 178 #define f_msgcount f_fglob->fg_msgcount	/* not referenced in the visible part of this file */
 179 #define f_cred f_fglob->fg_cred	/* credential copied into vfs_context.vc_ucred before I/O */
 180 #define f_ops f_fglob->fg_ops	/* the ops vector itself */
 181 #define f_offset f_fglob->fg_offset	/* not referenced in the visible part of this file */
 182 #define f_data f_fglob->fg_data	/* type-specific payload (cast to struct socket * in ioctl()) */
 183
 184 /*
 185  * Read system call.
 186  *
 187  * Returns:     0                       Success
 188  *      preparefileread:EBADF
 189  *      preparefileread:ESPIPE
 190  *      preparefileread:ENXIO
 191  *      preparefileread:EBADF
 192  *      dofileread:???
 193  */
 194 int
 195 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
 196 {
 197         __pthread_testcancel(1);
 198         return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
 199 }
 200
 201 int
 202 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
 203 {
 204         struct fileproc *fp;
 205         int error;
 206         int fd = uap->fd;
 207         struct vfs_context context;
 208
 209         if ( (error = preparefileread(p, &fp, fd, 0)) )
 210                 return (error);
 211
 212         context = *(vfs_context_current());
 213         context.vc_ucred = fp->f_fglob->fg_cred;
 214
 215         error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
 216                            (off_t)-1, 0, retval);
 217
 218         donefileread(p, fp, fd);
 219
 220         return (error);
 221 }
 222
 223 /*
 224  * Pread system call
 225  *
 226  * Returns:     0                       Success
 227  *      preparefileread:EBADF
 228  *      preparefileread:ESPIPE
 229  *      preparefileread:ENXIO
 230  *      preparefileread:EBADF
 231  *      dofileread:???
 232  */
 233 int
 234 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
 235 {
 236         __pthread_testcancel(1);
 237         return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
 238 }
 239
 240 int
 241 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
 242 {
 243         struct fileproc *fp = NULL;     /* fp set by preparefileread() */
 244         int fd = uap->fd;
 245         int error;
 246         struct vfs_context context;
 247
 248         if ( (error = preparefileread(p, &fp, fd, 1)) )
 249                 goto out;
 250
 251         context = *(vfs_context_current());
 252         context.vc_ucred = fp->f_fglob->fg_cred;
 253
 254         error = dofileread(&context, fp, uap->buf, uap->nbyte,
 255                         uap->offset, FOF_OFFSET, retval);
 256
 257         donefileread(p, fp, fd);
 258
 259         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
 260               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 261
 262 out:
 263         return (error);
 264 }
 265
 266 /*
 267  * Code common for read and pread
 268  */
 269
 270 void
 271 donefileread(struct proc *p, struct fileproc *fp, int fd)
 272 {
 273         proc_fdlock_spin(p);
 274         fp_drop(p, fd, fp, 1);
 275         proc_fdunlock(p);
 276 }
 277
 278 /*
 279  * Returns:     0                       Success
 280  *              EBADF
 281  *              ESPIPE
 282  *              ENXIO
 283  *      fp_lookup:EBADF
 284  *      fo_read:???
 285  */
 286 int
 287 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
 288 {
 289         vnode_t vp;
 290         int     error;
 291         struct fileproc *fp;
 292
 293         AUDIT_ARG(fd, fd);
 294
 295         proc_fdlock_spin(p);
 296
 297         error = fp_lookup(p, fd, &fp, 1);
 298
 299         if (error) {
 300                 proc_fdunlock(p);
 301                 return (error);
 302         }
 303         if ((fp->f_flag & FREAD) == 0) {
 304                 error = EBADF;
 305                 goto out;
 306         }
 307         if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
 308                 error = ESPIPE;
 309                 goto out;
 310         }
 311         if (fp->f_type == DTYPE_VNODE) {
 312                 vp = (struct vnode *)fp->f_fglob->fg_data;
 313
 314                 if (check_for_pread && (vnode_isfifo(vp))) {
 315                         error = ESPIPE;
 316                         goto out;
 317                 }
 318                 if (check_for_pread && (vp->v_flag & VISTTY)) {
 319                         error = ENXIO;
 320                         goto out;
 321                 }
 322         }
 323
 324         *fp_ret = fp;
 325
 326         proc_fdunlock(p);
 327         return (0);
 328
 329 out:
 330         fp_drop(p, fd, fp, 1);
 331         proc_fdunlock(p);
 332         return (error);
 333 }
 334
 335
 336 /*
 337  * Returns:     0                       Success
 338  *              EINVAL
 339  *      fo_read:???
 340  */
 341 __private_extern__ int
 342 dofileread(vfs_context_t ctx, struct fileproc *fp,
 343            user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
 344            user_ssize_t *retval)
 345 {
 346         uio_t auio;
 347         user_ssize_t bytecnt;
 348         long error = 0;
 349         char uio_buf[ UIO_SIZEOF(1) ];
 350
 351         if (nbyte > INT_MAX)
 352                 return (EINVAL);
 353
 354         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
 355                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 356                                                                           &uio_buf[0], sizeof(uio_buf));
 357         } else {
 358                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 359                                                                           &uio_buf[0], sizeof(uio_buf));
 360         }
 361         uio_addiov(auio, bufp, nbyte);
 362
 363         bytecnt = nbyte;
 364
 365         if ((error = fo_read(fp, auio, flags, ctx))) {
 366                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 367                         error == EINTR || error == EWOULDBLOCK))
 368                         error = 0;
 369         }
 370         bytecnt -= uio_resid(auio);
 371
 372         *retval = bytecnt;
 373
 374         return (error);
 375 }
 376
 377 /*
 378  * Scatter read system call.
 379  *
 380  * Returns:     0                       Success
 381  *              EINVAL
 382  *              ENOMEM
 383  *      copyin:EFAULT
 384  *      rd_uio:???
 385  */
 386 int
 387 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
 388 {
 389         __pthread_testcancel(1);
 390         return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
 391 }
 392
 393 int
 394 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
 395 {
 396         uio_t auio = NULL;
 397         int error;
 398         struct user_iovec *iovp;
 399
 400         /* Verify range bedfore calling uio_create() */
 401         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 402                 return (EINVAL);
 403
 404         /* allocate a uio large enough to hold the number of iovecs passed */
 405         auio = uio_create(uap->iovcnt, 0,
 406                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 407                                   UIO_READ);
 408
 409         /* get location of iovecs within the uio.  then copyin the iovecs from
 410          * user space.
 411          */
 412         iovp = uio_iovsaddr(auio);
 413         if (iovp == NULL) {
 414                 error = ENOMEM;
 415                 goto ExitThisRoutine;
 416         }
 417         error = copyin_user_iovec_array(uap->iovp,
 418                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
 419                 uap->iovcnt, iovp);
 420         if (error) {
 421                 goto ExitThisRoutine;
 422         }
 423
 424         /* finalize uio_t for use and do the IO
 425          */
 426         error = uio_calculateresid(auio);
 427         if (error) {
 428                 goto ExitThisRoutine;
 429         }
 430         error = rd_uio(p, uap->fd, auio, retval);
 431
 432 ExitThisRoutine:
 433         if (auio != NULL) {
 434                 uio_free(auio);
 435         }
 436         return (error);
 437 }
 438
 439 /*
 440  * Write system call
 441  *
 442  * Returns:     0                       Success
 443  *              EBADF
 444  *      fp_lookup:EBADF
 445  *      dofilewrite:???
 446  */
 447 int
 448 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
 449 {
 450         __pthread_testcancel(1);
 451         return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
 452
 453 }
 454
 455 int
 456 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
 457 {
 458         struct fileproc *fp;
 459         int error;
 460         int fd = uap->fd;
 461         bool wrote_some = false;
 462
 463         AUDIT_ARG(fd, fd);
 464
 465         error = fp_lookup(p,fd,&fp,0);
 466         if (error)
 467                 return(error);
 468         if ((fp->f_flag & FWRITE) == 0) {
 469                 error = EBADF;
 470         } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
 471                 proc_fdlock(p);
 472                 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
 473                 proc_fdunlock(p);
 474         } else {
 475                 struct vfs_context context = *(vfs_context_current());
 476                 context.vc_ucred = fp->f_fglob->fg_cred;
 477
 478                 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
 479                         (off_t)-1, 0, retval);
 480
 481                 wrote_some = *retval > 0;
 482         }
 483         if (wrote_some)
 484                 fp_drop_written(p, fd, fp);
 485         else
 486                 fp_drop(p, fd, fp, 0);
 487         return(error);
 488 }
 489
 490 /*
 491  * pwrite system call
 492  *
 493  * Returns:     0                       Success
 494  *              EBADF
 495  *              ESPIPE
 496  *              ENXIO
 497  *              EINVAL
 498  *      fp_lookup:EBADF
 499  *      dofilewrite:???
 500  */
 501 int
 502 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
 503 {
 504         __pthread_testcancel(1);
 505         return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
 506 }
 507
 508 int
 509 pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
 510 {
 511         struct fileproc *fp;
 512         int error;
 513         int fd = uap->fd;
 514         vnode_t vp  = (vnode_t)0;
 515         bool wrote_some = false;
 516
 517         AUDIT_ARG(fd, fd);
 518
 519         error = fp_lookup(p,fd,&fp,0);
 520         if (error)
 521                 return(error);
 522
 523         if ((fp->f_flag & FWRITE) == 0) {
 524                 error = EBADF;
 525         } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
 526                 proc_fdlock(p);
 527                 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
 528                 proc_fdunlock(p);
 529         } else {
 530                 struct vfs_context context = *vfs_context_current();
 531                 context.vc_ucred = fp->f_fglob->fg_cred;
 532
 533                 if (fp->f_type != DTYPE_VNODE) {
 534                         error = ESPIPE;
 535                         goto errout;
 536                 }
 537                 vp = (vnode_t)fp->f_fglob->fg_data;
 538                 if (vnode_isfifo(vp)) {
 539                         error = ESPIPE;
 540                         goto errout;
 541                 }
 542                 if ((vp->v_flag & VISTTY)) {
 543                         error = ENXIO;
 544                         goto errout;
 545                 }
 546                 if (uap->offset == (off_t)-1) {
 547                         error = EINVAL;
 548                         goto errout;
 549                 }
 550
 551                     error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
 552                         uap->offset, FOF_OFFSET, retval);
 553                         wrote_some = *retval > 0;
 554         }
 555 errout:
 556         if (wrote_some)
 557                 fp_drop_written(p, fd, fp);
 558         else
 559                 fp_drop(p, fd, fp, 0);
 560
 561         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
 562               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 563
 564         return(error);
 565 }
 566
 567 /*
 568  * Returns:     0                       Success
 569  *              EINVAL
 570  *      <fo_write>:EPIPE
 571  *      <fo_write>:???                  [indirect through struct fileops]
 572  */
 573 __private_extern__ int
 574 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
 575             user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
 576             user_ssize_t *retval)
 577 {
 578         uio_t auio;
 579         long error = 0;
 580         user_ssize_t bytecnt;
 581         char uio_buf[ UIO_SIZEOF(1) ];
 582
 583         if (nbyte > INT_MAX) {
 584                 *retval = 0;
 585                 return (EINVAL);
 586         }
 587
 588         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
 589                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 590                                                                           &uio_buf[0], sizeof(uio_buf));
 591         } else {
 592                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 593                                                                           &uio_buf[0], sizeof(uio_buf));
 594         }
 595         uio_addiov(auio, bufp, nbyte);
 596
 597         bytecnt = nbyte;
 598         if ((error = fo_write(fp, auio, flags, ctx))) {
 599                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 600                         error == EINTR || error == EWOULDBLOCK))
 601                         error = 0;
 602                 /* The socket layer handles SIGPIPE */
 603                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
 604                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
 605                         /* XXX Raise the signal on the thread? */
 606                         psignal(vfs_context_proc(ctx), SIGPIPE);
 607                 }
 608         }
 609         bytecnt -= uio_resid(auio);
 610         *retval = bytecnt;
 611
 612         return (error);
 613 }
 614
 615 /*
 616  * Gather write system call
 617  */
 618 int
 619 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
 620 {
 621         __pthread_testcancel(1);
 622         return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
 623 }
 624
 625 int
 626 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
 627 {
 628         uio_t auio = NULL;
 629         int error;
 630         struct fileproc *fp;
 631         struct user_iovec *iovp;
 632         bool wrote_some = false;
 633
 634         AUDIT_ARG(fd, uap->fd);
 635
 636         /* Verify range bedfore calling uio_create() */
 637         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 638                 return (EINVAL);
 639
 640         /* allocate a uio large enough to hold the number of iovecs passed */
 641         auio = uio_create(uap->iovcnt, 0,
 642                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 643                                   UIO_WRITE);
 644
 645         /* get location of iovecs within the uio.  then copyin the iovecs from
 646          * user space.
 647          */
 648         iovp = uio_iovsaddr(auio);
 649         if (iovp == NULL) {
 650                 error = ENOMEM;
 651                 goto ExitThisRoutine;
 652         }
 653         error = copyin_user_iovec_array(uap->iovp,
 654                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
 655                 uap->iovcnt, iovp);
 656         if (error) {
 657                 goto ExitThisRoutine;
 658         }
 659
 660         /* finalize uio_t for use and do the IO
 661          */
 662         error = uio_calculateresid(auio);
 663         if (error) {
 664                 goto ExitThisRoutine;
 665         }
 666
 667         error = fp_lookup(p, uap->fd, &fp, 0);
 668         if (error)
 669                 goto ExitThisRoutine;
 670
 671         if ((fp->f_flag & FWRITE) == 0) {
 672                 error = EBADF;
 673         } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
 674                 proc_fdlock(p);
 675                 error = fp_guard_exception(p, uap->fd, fp, kGUARD_EXC_WRITE);
 676                 proc_fdunlock(p);
 677         } else {
 678                 error = wr_uio(p, fp, auio, retval);
 679                 wrote_some = *retval > 0;
 680         }
 681
 682         if (wrote_some)
 683                 fp_drop_written(p, uap->fd, fp);
 684         else
 685                 fp_drop(p, uap->fd, fp, 0);
 686
 687 ExitThisRoutine:
 688         if (auio != NULL) {
 689                 uio_free(auio);
 690         }
 691         return (error);
 692 }
 693
 694
 695 int
 696 wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval)
 697 {
 698         int error;
 699         user_ssize_t count;
 700         struct vfs_context context = *vfs_context_current();
 701
 702         count = uio_resid(uio);
 703
 704         context.vc_ucred = fp->f_cred;
 705         error = fo_write(fp, uio, 0, &context);
 706         if (error) {
 707                 if (uio_resid(uio) != count && (error == ERESTART ||
 708                                                 error == EINTR || error == EWOULDBLOCK))
 709                         error = 0;
 710                 /* The socket layer handles SIGPIPE */
 711                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
 712                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
 713                         psignal(p, SIGPIPE);
 714         }
 715         *retval = count - uio_resid(uio);
 716
 717         return(error);
 718 }
 719
 720
 721 int
 722 rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
 723 {
 724         struct fileproc *fp;
 725         int error;
 726         user_ssize_t count;
 727         struct vfs_context context = *vfs_context_current();
 728
 729         if ( (error = preparefileread(p, &fp, fdes, 0)) )
 730                 return (error);
 731
 732         count = uio_resid(uio);
 733
 734         context.vc_ucred = fp->f_cred;
 735
 736         error = fo_read(fp, uio, 0, &context);
 737
 738         if (error) {
 739                 if (uio_resid(uio) != count && (error == ERESTART ||
 740                                                 error == EINTR || error == EWOULDBLOCK))
 741                         error = 0;
 742         }
 743         *retval = count - uio_resid(uio);
 744
 745         donefileread(p, fp, fdes);
 746
 747         return (error);
 748 }
 749
 750 /*
 751  * Ioctl system call
 752  *
 753  * Returns:     0                       Success
 754  *              EBADF
 755  *              ENOTTY
 756  *              ENOMEM
 757  *              ESRCH
 758  *      copyin:EFAULT
 759  *      copyoutEFAULT
 760  *      fp_lookup:EBADF                 Bad file descriptor
 761  *      fo_ioctl:???
 762  */
 763 int
 764 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
 765 {
 766         struct fileproc *fp = NULL;
 767         int error = 0;
 768         u_int size = 0;
 769         caddr_t datap = NULL, memp = NULL;
 770         boolean_t is64bit = FALSE;
 771         int tmp = 0;
 772 #define STK_PARAMS      128
 773         char stkbuf[STK_PARAMS];
 774         int fd = uap->fd;
 775         u_long com = uap->com;
 776         struct vfs_context context = *vfs_context_current();
 777
 778         AUDIT_ARG(fd, uap->fd);
 779         AUDIT_ARG(addr, uap->data);
 780
 781         is64bit = proc_is64bit(p);
 782 #if CONFIG_AUDIT
 783         if (is64bit)
 784                 AUDIT_ARG(value64, com);
 785         else
 786                 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
 787 #endif /* CONFIG_AUDIT */
 788
 789         /*
 790          * Interpret high order word to find amount of data to be
 791          * copied to/from the user's address space.
 792          */
 793         size = IOCPARM_LEN(com);
 794         if (size > IOCPARM_MAX)
 795                         return ENOTTY;
 796         if (size > sizeof (stkbuf)) {
 797                 if ((memp = (caddr_t)kalloc(size)) == 0)
 798                         return ENOMEM;
 799                 datap = memp;
 800         } else
 801                 datap = &stkbuf[0];
 802         if (com & IOC_IN) {
 803                 if (size) {
 804                         error = copyin(uap->data, datap, size);
 805                         if (error)
 806                                 goto out_nofp;
 807                 } else {
 808                         /* XXX - IOC_IN and no size?  we should proably return an error here!! */
 809                         if (is64bit) {
 810                                 *(user_addr_t *)datap = uap->data;
 811                         }
 812                         else {
 813                                 *(uint32_t *)datap = (uint32_t)uap->data;
 814                         }
 815                 }
 816         } else if ((com & IOC_OUT) && size)
 817                 /*
 818                  * Zero the buffer so the user always
 819                  * gets back something deterministic.
 820                  */
 821                 bzero(datap, size);
 822         else if (com & IOC_VOID) {
 823                 /* XXX - this is odd since IOC_VOID means no parameters */
 824                 if (is64bit) {
 825                         *(user_addr_t *)datap = uap->data;
 826                 }
 827                 else {
 828                         *(uint32_t *)datap = (uint32_t)uap->data;
 829                 }
 830         }
 831
 832         proc_fdlock(p);
 833         error = fp_lookup(p,fd,&fp,1);
 834         if (error)  {
 835                 proc_fdunlock(p);
 836                 goto out_nofp;
 837         }
 838
 839         AUDIT_ARG(file, p, fp);
 840
 841         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 842                         error = EBADF;
 843                         goto out;
 844         }
 845
 846         context.vc_ucred = fp->f_fglob->fg_cred;
 847
 848 #if CONFIG_MACF
 849         error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
 850         if (error)
 851                 goto out;
 852 #endif
 853
 854         switch (com) {
 855         case FIONCLEX:
 856                 *fdflags(p, fd) &= ~UF_EXCLOSE;
 857                 break;
 858
 859         case FIOCLEX:
 860                 *fdflags(p, fd) |= UF_EXCLOSE;
 861                 break;
 862
 863         case FIONBIO:
 864                 if ( (tmp = *(int *)datap) )
 865                         fp->f_flag |= FNONBLOCK;
 866                 else
 867                         fp->f_flag &= ~FNONBLOCK;
 868                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
 869                 break;
 870
 871         case FIOASYNC:
 872                 if ( (tmp = *(int *)datap) )
 873                         fp->f_flag |= FASYNC;
 874                 else
 875                         fp->f_flag &= ~FASYNC;
 876                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
 877                 break;
 878
 879         case FIOSETOWN:
 880                 tmp = *(int *)datap;
 881                 if (fp->f_type == DTYPE_SOCKET) {
 882                         ((struct socket *)fp->f_data)->so_pgid = tmp;
 883                         break;
 884                 }
 885                 if (fp->f_type == DTYPE_PIPE) {
 886                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
 887                         break;
 888                 }
 889                 if (tmp <= 0) {
 890                         tmp = -tmp;
 891                 } else {
 892                         struct proc *p1 = proc_find(tmp);
 893                         if (p1 == 0) {
 894                                 error = ESRCH;
 895                                 break;
 896                         }
 897                         tmp = p1->p_pgrpid;
 898                         proc_rele(p1);
 899                 }
 900                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
 901                 break;
 902
 903         case FIOGETOWN:
 904                 if (fp->f_type == DTYPE_SOCKET) {
 905                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
 906                         break;
 907                 }
 908                 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
 909                 *(int *)datap = -*(int *)datap;
 910                 break;
 911
 912         default:
 913                 error = fo_ioctl(fp, com, datap, &context);
 914                 /*
 915                  * Copy any data to user, size was
 916                  * already set and checked above.
 917                  */
 918                 if (error == 0 && (com & IOC_OUT) && size)
 919                         error = copyout(datap, uap->data, (u_int)size);
 920                 break;
 921         }
 922 out:
 923         fp_drop(p, fd, fp, 1);
 924         proc_fdunlock(p);
 925
 926 out_nofp:
 927         if (memp)
 928                 kfree(memp, size);
 929         return(error);
 930 }
 931
     /*
      * Globals associated with select.  In the select path visible in this part
      * of the file only nselcoll is read (selprocess() samples it into a local).
      * NOTE(review): presumably legacy BSD select-collision bookkeeping - confirm.
      */
 932 int     selwait, nselcoll;
     /*
      * Pass identifiers handed to selprocess()/selscan(): select_internal()
      * starts with SEL_FIRSTPASS, selcontinue() re-enters with SEL_SECONDPASS.
      */
 933 #define SEL_FIRSTPASS 1
 934 #define SEL_SECONDPASS 2
     /* Entry points of the select state machine (defined below). */
 935 extern int selcontinue(int error);
 936 extern int selprocess(int error, int sel_pass);
     /* File-local helpers used by select_internal() and selprocess(). */
 937 static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
 938                         int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
 939 static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
 940 static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
 941 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
 942 static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
 943
 944 /*
 945  * Select system call.
 946  *
 947  * Returns:     0                       Success
 948  *              EINVAL                  Invalid argument
 949  *              EAGAIN                  Nonconformant error if allocation fails
 950  */
 951 int
 952 select(struct proc *p, struct select_args *uap, int32_t *retval)
 953 {
 954         __pthread_testcancel(1);
 955         return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
 956 }
 957
 958 int
 959 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
 960 {
 961         uint64_t timeout = 0;
 962
 963         if (uap->tv) {
 964                 int err;
 965                 struct timeval atv;
 966                 if (IS_64BIT_PROCESS(p)) {
 967                         struct user64_timeval atv64;
 968                         err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
 969                         /* Loses resolution - assume timeout < 68 years */
 970                         atv.tv_sec = atv64.tv_sec;
 971                         atv.tv_usec = atv64.tv_usec;
 972                 } else {
 973                         struct user32_timeval atv32;
 974                         err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
 975                         atv.tv_sec = atv32.tv_sec;
 976                         atv.tv_usec = atv32.tv_usec;
 977                 }
 978                 if (err)
 979                         return err;
 980
 981                 if (itimerfix(&atv)) {
 982                         err = EINVAL;
 983                         return err;
 984                 }
 985
 986                 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
 987         }
 988
 989         return select_internal(p, uap, timeout, retval);
 990 }
 991
 992 int
 993 pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
 994 {
 995         __pthread_testcancel(1);
 996         return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
 997 }
 998
 999 int
1000 pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
1001 {
1002         int err;
1003         struct uthread *ut;
1004         uint64_t timeout = 0;
1005
1006         if (uap->ts) {
1007                 struct timespec ts;
1008
1009                 if (IS_64BIT_PROCESS(p)) {
1010                         struct user64_timespec ts64;
1011                         err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
1012                         ts.tv_sec = ts64.tv_sec;
1013                         ts.tv_nsec = ts64.tv_nsec;
1014                 } else {
1015                         struct user32_timespec ts32;
1016                         err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
1017                         ts.tv_sec = ts32.tv_sec;
1018                         ts.tv_nsec = ts32.tv_nsec;
1019                 }
1020                 if (err) {
1021                         return err;
1022                 }
1023
1024                 if (!timespec_is_valid(&ts)) {
1025                         return EINVAL;
1026                 }
1027                 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
1028         }
1029
1030         ut = get_bsdthread_info(current_thread());
1031
1032         if (uap->mask != USER_ADDR_NULL) {
1033                 /* save current mask, then copyin and set new mask */
1034                 sigset_t newset;
1035                 err = copyin(uap->mask, &newset, sizeof(sigset_t));
1036                 if (err) {
1037                         return err;
1038                 }
1039                 ut->uu_oldmask = ut->uu_sigmask;
1040                 ut->uu_flag |= UT_SAS_OLDMASK;
1041                 ut->uu_sigmask = (newset & ~sigcantmask);
1042         }
1043
1044         err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
1045
1046         if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
1047                 /*
1048                  * Restore old mask (direct return case). NOTE: EINTR can also be returned
1049                  * if the thread is cancelled. In that case, we don't reset the signal
1050                  * mask to its original value (which usually happens in the signal
1051                  * delivery path). This behavior is permitted by POSIX.
1052                  */
1053                 ut->uu_sigmask = ut->uu_oldmask;
1054                 ut->uu_oldmask = 0;
1055                 ut->uu_flag &= ~UT_SAS_OLDMASK;
1056         }
1057
1058         return err;
1059 }
1060
1061 /*
1062  * Generic implementation of {,p}select. Care: we type-pun uap across the two
1063  * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1064  * are identical. The 5th (timeout) argument points to different types, so we
1065  * unpack in the syscall-specific code, but the generic code still does a null
1066  * check on this argument to determine if a timeout was specified.
1067  */
1068 static int
1069 select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
1070 {
1071         int error = 0;
1072         u_int ni, nw;
1073         thread_t th_act;
1074         struct uthread  *uth;
1075         struct _select *sel;
1076         struct _select_data *seldata;
1077         int needzerofill = 1;
1078         int count = 0;
1079         size_t sz = 0;
1080
1081         th_act = current_thread();
1082         uth = get_bsdthread_info(th_act);
1083         sel = &uth->uu_select;
1084         seldata = &uth->uu_kevent.ss_select_data;
1085         *retval = 0;
1086
1087         seldata->args = uap;
1088         seldata->retval = retval;
1089         seldata->wqp = NULL;
1090         seldata->count = 0;
1091
1092         if (uap->nd < 0) {
1093                 return (EINVAL);
1094         }
1095
1096         /* select on thread of process that already called proc_exit() */
1097         if (p->p_fd == NULL) {
1098                 return (EBADF);
1099         }
1100
1101         if (uap->nd > p->p_fd->fd_nfiles)
1102                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
1103
1104         nw = howmany(uap->nd, NFDBITS);
1105         ni = nw * sizeof(fd_mask);
1106
1107         /*
1108          * if the previously allocated space for the bits is smaller than
1109          * what is requested or no space has yet been allocated for this
1110          * thread, allocate enough space now.
1111          *
1112          * Note: If this process fails, select() will return EAGAIN; this
1113          * is the same thing pool() returns in a no-memory situation, but
1114          * it is not a POSIX compliant error code for select().
1115          */
1116         if (sel->nbytes < (3 * ni)) {
1117                 int nbytes = 3 * ni;
1118
1119                 /* Free previous allocation, if any */
1120                 if (sel->ibits != NULL)
1121                         FREE(sel->ibits, M_TEMP);
1122                 if (sel->obits != NULL) {
1123                         FREE(sel->obits, M_TEMP);
1124                         /* NULL out; subsequent ibits allocation may fail */
1125                         sel->obits = NULL;
1126                 }
1127
1128                 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1129                 if (sel->ibits == NULL)
1130                         return (EAGAIN);
1131                 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1132                 if (sel->obits == NULL) {
1133                         FREE(sel->ibits, M_TEMP);
1134                         sel->ibits = NULL;
1135                         return (EAGAIN);
1136                 }
1137                 sel->nbytes = nbytes;
1138                 needzerofill = 0;
1139         }
1140
1141         if (needzerofill) {
1142                 bzero((caddr_t)sel->ibits, sel->nbytes);
1143                 bzero((caddr_t)sel->obits, sel->nbytes);
1144         }
1145
1146         /*
1147          * get the bits from the user address space
1148          */
1149 #define getbits(name, x) \
1150         do { \
1151                 if (uap->name && (error = copyin(uap->name, \
1152                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
1153                         goto continuation; \
1154         } while (0)
1155
1156         getbits(in, 0);
1157         getbits(ou, 1);
1158         getbits(ex, 2);
1159 #undef  getbits
1160
1161         seldata->abstime = timeout;
1162
1163         if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
1164                         goto continuation;
1165         }
1166
1167         /*
1168          * We need an array of waitq pointers. This is due to the new way
1169          * in which waitqs are linked to sets. When a thread selects on a
1170          * file descriptor, a waitq (embedded in a selinfo structure) is
1171          * added to the thread's local waitq set. There is no longer any
1172          * way to directly iterate over all members of a given waitq set.
1173          * The process of linking a waitq into a set may allocate a link
1174          * table object. Because we can't iterate over all the waitqs to
1175          * which our thread waitq set belongs, we need a way of removing
1176          * this link object!
1177          *
1178          * Thus we need a buffer which will hold one waitq pointer
1179          * per FD being selected. During the tear-down phase we can use
1180          * these pointers to dis-associate the underlying selinfo's waitq
1181          * from our thread's waitq set.
1182          *
1183          * Because we also need to allocate a waitq set for this thread,
1184          * we use a bare buffer pointer to hold all the memory. Note that
1185          * this memory is cached in the thread pointer and not reaped until
1186          * the thread exists. This is generally OK because threads that
1187          * call select tend to keep calling select repeatedly.
1188          */
1189         sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
1190         if (sz > uth->uu_wqstate_sz) {
1191                 /* (re)allocate a buffer to hold waitq pointers */
1192                 if (uth->uu_wqset) {
1193                         if (waitq_set_is_valid(uth->uu_wqset))
1194                                 waitq_set_deinit(uth->uu_wqset);
1195                         FREE(uth->uu_wqset, M_SELECT);
1196                 } else if (uth->uu_wqstate_sz && !uth->uu_wqset)
1197                         panic("select: thread structure corrupt! "
1198                               "uu_wqstate_sz:%ld, wqstate_buf == NULL",
1199                               uth->uu_wqstate_sz);
1200                 uth->uu_wqstate_sz = sz;
1201                 MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
1202                 if (!uth->uu_wqset)
1203                         panic("can't allocate %ld bytes for wqstate buffer",
1204                               uth->uu_wqstate_sz);
1205                 waitq_set_init(uth->uu_wqset,
1206                                SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1207         }
1208
1209         if (!waitq_set_is_valid(uth->uu_wqset))
1210                 waitq_set_init(uth->uu_wqset,
1211                                SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1212
1213         /* the last chunk of our buffer is an array of waitq pointers */
1214         seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
1215         bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
1216
1217         seldata->count = count;
1218
1219 continuation:
1220
1221         if (error) {
1222                 /*
1223                  * We have already cleaned up any state we established,
1224                  * either locally or as a result of selcount().  We don't
1225                  * need to wait_subqueue_unlink_all(), since we haven't set
1226                  * anything at this point.
1227                  */
1228                 return (error);
1229         }
1230
1231         return selprocess(0, SEL_FIRSTPASS);
1232 }
1233
1234 int
1235 selcontinue(int error)
1236 {
1237         return selprocess(error, SEL_SECONDPASS);
1238 }
1239
1240
1241 /*
1242  * selprocess
1243  *
      * Core of select: scans the selected descriptors with selscan(), blocks on
      * the thread's waitq set when nothing is ready and the deadline has not
      * passed, and finally copies the result bit vectors out to user space.
      * Entered with SEL_FIRSTPASS from select_internal() and with
      * SEL_SECONDPASS from selcontinue().  All per-call state (arguments,
      * retval pointer, deadline, descriptor count) is read back from the
      * current thread's uthread, where select_internal() stored it.
      *
1244  * Parameters:  error                   The error code from our caller
1245  *              sel_pass                The pass we are on
      *
      * Returns:     0                       Success (EWOULDBLOCK is mapped to 0)
      *              EINTR                   Also returned in place of ERESTART
      *              otherwise the incoming error, or the error produced by
      *              selscan(), tsleep1() or copyout()
1246  */
1247 int
1248 selprocess(int error, int sel_pass)
1249 {
1250         int ncoll;
1251         u_int ni, nw;
1252         thread_t th_act;
1253         struct uthread  *uth;
1254         struct proc *p;
1255         struct select_nocancel_args *uap;
1256         int *retval;
1257         struct _select *sel;
1258         struct _select_data *seldata;
1259         int unwind = 1;
1260         int prepost = 0;
1261         int somewakeup = 0;
1262         int doretry = 0;
1263         wait_result_t wait_result;
1264
             /* recover the per-call state stashed in the uthread */
1265         p = current_proc();
1266         th_act = current_thread();
1267         uth = get_bsdthread_info(th_act);
1268         sel = &uth->uu_select;
1269         seldata = &uth->uu_kevent.ss_select_data;
1270         uap = seldata->args;
1271         retval = seldata->retval;
1272
             /*
              * 'unwind' gates the seldrop()/waitq_set_deinit() teardown at
              * 'done'.  It is skipped when the first pass is entered with an
              * error, or when no descriptors were counted.
              */
1273         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1274                 unwind = 0;
1275         if (seldata->count == 0)
1276                 unwind = 0;
1277 retry:
1278         if (error != 0)
1279                 goto done;
1280
             /* NOTE(review): ncoll is assigned here but never read afterwards */
1281         ncoll = nselcoll;
1282         OSBitOrAtomic(P_SELECT, &p->p_flag);
1283
1284         /* skip scans if the select is just for timeouts */
1285         if (seldata->count) {
1286                 error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
                     /* finished on a scan error or as soon as any descriptor is ready */
1287                 if (error || *retval) {
1288                         goto done;
1289                 }
1290                 if (prepost || somewakeup) {
1291                         /*
1292                          * if the select of log, then we can wakeup and
1293                          * discover some one else already read the data;
1294                          * go to select again if time permits
1295                          */
1296                         prepost = 0;
1297                         somewakeup = 0;
1298                         doretry = 1;
1299                 }
1300         }
1301
             /* a timeout was supplied: stop once the absolute deadline has passed */
1302         if (uap->tv) {
1303                 uint64_t        now;
1304
1305                 clock_get_uptime(&now);
1306                 if (now >= seldata->abstime)
1307                         goto done;
1308         }
1309
1310         if (doretry) {
1311                 /* cleanup obits and try again */
1312                 doretry = 0;
1313                 sel_pass = SEL_FIRSTPASS;
1314                 goto retry;
1315         }
1316
1317         /*
1318          * To effect a poll, the timeout argument should be
1319          * non-nil, pointing to a zero-valued timeval structure.
1320          */
1321         if (uap->tv && seldata->abstime == 0) {
1322                 goto done;
1323         }
1324
1325         /* No spurious wakeups due to colls,no need to check for them */
             /*
              * On a second pass, or if P_SELECT is no longer set (it was set at
              * the top of this iteration), rescan as a first pass instead of
              * blocking.
              */
1326          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1327                 sel_pass = SEL_FIRSTPASS;
1328                 goto retry;
1329         }
1330
1331         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1332
1333         /* if the select is just for timeout skip check */
             /*
              * NOTE(review): sel_pass cannot be SEL_SECONDPASS here, since that
              * case jumped back to 'retry' just above; this is a sanity check.
              */
1334         if (seldata->count && (sel_pass == SEL_SECONDPASS))
1335                 panic("selprocess: 2nd pass assertwaiting");
1336
1337         /* waitq_set has waitqueue as first element */
1338         wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
1339                                                  NO_EVENT64, THREAD_ABORTSAFE,
1340                                                  TIMEOUT_URGENCY_USER_NORMAL,
1341                                                  seldata->abstime,
1342                                                  TIMEOUT_NO_LEEWAY);
1343         if (wait_result != THREAD_AWAKENED) {
1344                 /* there are no preposted events */
                     /*
                      * selcontinue is supplied as the continuation.
                      * NOTE(review): presumably the thread then resumes in
                      * selcontinue() rather than returning here - confirm
                      * against tsleep1().
                      */
1345                 error = tsleep1(NULL, PSOCK | PCATCH,
1346                                 "select", 0, selcontinue);
1347         } else  {
1348                 prepost = 1;
1349                 error = 0;
1350         }
1351
             /* woken without error: rescan as a second pass */
1352         if (error == 0) {
1353                 sel_pass = SEL_SECONDPASS;
1354                 if (!prepost)
1355                         somewakeup = 1;
1356                 goto retry;
1357         }
1358 done:
1359         if (unwind) {
1360                 seldrop(p, sel->ibits, uap->nd);
1361                 waitq_set_deinit(uth->uu_wqset);
1362                 /*
1363                  * zero out the waitq pointer array to avoid use-after free
1364                  * errors in the selcount error path (seldrop_locked) if/when
1365                  * the thread re-calls select().
1366                  */
1367                 bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
1368         }
1369         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1370         /* select is not restarted after signals... */
1371         if (error == ERESTART)
1372                 error = EINTR;
             /* EWOULDBLOCK is reported to the caller as success */
1373         if (error == EWOULDBLOCK)
1374                 error = 0;
             /* nw: words per descriptor set, ni: bytes per descriptor set */
1375         nw = howmany(uap->nd, NFDBITS);
1376         ni = nw * sizeof(fd_mask);
1377
             /*
              * Copy result set 'x' out to the user's set, if one was supplied.
              * The macro uses 'error2', which is declared in the block that
              * expands it; the last failing copyout() determines 'error'.
              */
1378 #define putbits(name, x) \
1379         do { \
1380                 if (uap->name && (error2 = \
1381                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1382                         error = error2; \
1383         } while (0)
1384
1385         if (error == 0) {
1386                 int error2;
1387
1388                 putbits(in, 0);
1389                 putbits(ou, 1);
1390                 putbits(ex, 2);
1391 #undef putbits
1392         }
1393
             /*
              * Only the second-pass (continuation) case is handled here; the
              * direct-return case is restored by pselect_nocancel() itself.
              */
1394         if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
1395                 /* restore signal mask - continuation case */
1396                 uth->uu_sigmask = uth->uu_oldmask;
1397                 uth->uu_oldmask = 0;
1398                 uth->uu_flag &= ~UT_SAS_OLDMASK;
1399         }
1400
1401         return(error);
1402 }
1403
1404
1405 /**
1406  * remove the fileproc's underlying waitq from the supplied waitq set;
1407  * clear FP_INSELECT when appropriate
1408  *
1409  * Parameters:
1410  *              fp      File proc that is potentially currently in select
1411  *              wqset   Waitq set to which the fileproc may belong
1412  *                      (usually this is the thread's private waitq set)
1413  * Conditions:
1414  *              proc_fdlock is held
1415  */
1416 static void selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
1417 {
1418         int valid_set = waitq_set_is_valid(wqset);
1419         int valid_q = !!wqp_id;
1420
1421         /*
1422          * This could be called (from selcount error path) before we setup
1423          * the thread's wqset. Check the wqset passed in, and only unlink if
1424          * the set is valid.
1425          */
1426
1427         /* unlink the underlying waitq from the input set (thread waitq set) */
1428         if (valid_q && valid_set)
1429                 waitq_unlink_by_prepost_id(wqp_id, wqset);
1430
1431         /* allow passing a NULL/invalid fp for seldrop unwind */
1432         if (!fp || !(fp->f_flags & (FP_INSELECT|FP_SELCONFLICT)))
1433                 return;
1434
1435         /*
1436          * We can always remove the conflict queue from our thread's set: this
1437          * will not affect other threads that potentially need to be awoken on
1438          * the conflict queue during a fileproc_drain - those sets will still
1439          * be linked with the global conflict queue, and the last waiter
1440          * on the fp clears the CONFLICT marker.
1441          */
1442         if (valid_set && (fp->f_flags & FP_SELCONFLICT))
1443                 waitq_unlink(&select_conflict_queue, wqset);
1444
1445         /* jca: TODO:
1446          * This isn't quite right - we don't actually know if this
1447          * fileproc is in another select or not! Here we just assume
1448          * that if we were the first thread to select on the FD, then
1449          * we'll be the one to clear this flag...
1450          */
1451         if (valid_set && fp->f_wset == (void *)wqset) {
1452                 fp->f_flags &= ~FP_INSELECT;
1453                 fp->f_wset = NULL;
1454         }
1455 }
1456
1457 /**
1458  * connect a fileproc to the given wqset, potentially bridging to a waitq
1459  * pointed to indirectly by wq_data
1460  *
1461  * Parameters:
1462  *              fp      File proc potentially currently in select
1463  *              wq_data Pointer to a pointer to a waitq (could be NULL)
1464  *              wqset   Waitq set to which the fileproc should now belong
1465  *                      (usually this is the thread's private waitq set)
1466  *
1467  * Conditions:
1468  *              proc_fdlock is held
1469  */
1470 static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
1471 {
1472         struct waitq *f_wq = NULL;
1473
1474         if ((fp->f_flags & FP_INSELECT) != FP_INSELECT) {
1475                 if (wq_data)
1476                         panic("non-null data:%p on fp:%p not in select?!"
1477                               "(wqset:%p)", wq_data, fp, wqset);
1478                 return 0;
1479         }
1480
1481         if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
1482                 /*
1483                  * The conflict queue requires disabling interrupts, so we
1484                  * need to explicitly reserve a link object to avoid a
1485                  * panic/assert in the waitq code. Hopefully this extra step
1486                  * can be avoided if we can split the waitq structure into
1487                  * blocking and linkage sub-structures.
1488                  */
1489                 uint64_t reserved_link = waitq_link_reserve(&select_conflict_queue);
1490                 waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
1491                 waitq_link_release(reserved_link);
1492         }
1493
1494         /*
1495          * The wq_data parameter has potentially been set by selrecord called
1496          * from a subsystems fo_select() function. If the subsystem does not
1497          * call selrecord, then wq_data will be NULL
1498          *
1499          * Use memcpy to get the value into a proper pointer because
1500          * wq_data most likely points to a stack variable that could be
1501          * unaligned on 32-bit systems.
1502          */
1503         if (wq_data) {
1504                 memcpy(&f_wq, wq_data, sizeof(f_wq));
1505                 if (!waitq_is_valid(f_wq))
1506                         f_wq = NULL;
1507         }
1508
1509         /* record the first thread's wqset in the fileproc structure */
1510         if (!fp->f_wset)
1511                 fp->f_wset = (void *)wqset;
1512
1513         /* handles NULL f_wq */
1514         return waitq_get_prepost_id(f_wq);
1515 }
1516
1517
1518 /*
1519  * selscan
1520  *
1521  * Parameters:  p                       Process performing the select
1522  *              sel                     The per-thread select context structure
1523  *              nfd                     The number of file descriptors to scan
1524  *              retval                  The per thread system call return area
1525  *              sel_pass                Which pass this is; allowed values are
1526  *                                              SEL_FIRSTPASS and SEL_SECONDPASS
1527  *              wqset                   The per thread wait queue set
1528  *
1529  * Returns:     0                       Success
1530  *              EIO                     Invalid p->p_fd field XXX Obsolete?
1531  *              EBADF                   One of the files in the bit vector is
1532  *                                              invalid.
1533  */
1534 static int
1535 selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1536         int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1537 {
1538         struct filedesc *fdp = p->p_fd;
1539         int msk, i, j, fd;
1540         u_int32_t bits;
1541         struct fileproc *fp;
1542         int n = 0;              /* count of bits */
1543         int nc = 0;             /* bit vector offset (nc'th bit) */
1544         static int flag[3] = { FREAD, FWRITE, 0 };
1545         u_int32_t *iptr, *optr;
1546         u_int nw;
1547         u_int32_t *ibits, *obits;
1548         uint64_t reserved_link, *rl_ptr = NULL;
1549         int count;
1550         struct vfs_context context = *vfs_context_current();
1551
1552         /*
1553          * Problems when reboot; due to MacOSX signal probs
1554          * in Beaker1C ; verify that the p->p_fd is valid
1555          */
1556         if (fdp == NULL) {
1557                 *retval=0;
1558                 return(EIO);
1559         }
1560         ibits = sel->ibits;
1561         obits = sel->obits;
1562
1563         nw = howmany(nfd, NFDBITS);
1564
1565         count = seldata->count;
1566
1567         nc = 0;
1568         if (!count) {
1569                 *retval = 0;
1570                 return 0;
1571         }
1572
1573         proc_fdlock(p);
1574         for (msk = 0; msk < 3; msk++) {
1575                 iptr = (u_int32_t *)&ibits[msk * nw];
1576                 optr = (u_int32_t *)&obits[msk * nw];
1577
1578                 for (i = 0; i < nfd; i += NFDBITS) {
1579                         bits = iptr[i/NFDBITS];
1580
1581                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1582                                 bits &= ~(1 << j);
1583
1584                                 if (fd < fdp->fd_nfiles)
1585                                         fp = fdp->fd_ofiles[fd];
1586                                 else
1587                                         fp = NULL;
1588
1589                                 if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1590                                         /*
1591                                          * If we abort because of a bad
1592                                          * fd, let the caller unwind...
1593                                          */
1594                                         proc_fdunlock(p);
1595                                         return(EBADF);
1596                                 }
1597                                 if (sel_pass == SEL_SECONDPASS) {
1598                                         reserved_link = 0;
1599                                         rl_ptr = NULL;
1600                                         selunlinkfp(fp, seldata->wqp[nc], wqset);
1601                                 } else {
1602                                         reserved_link = waitq_link_reserve((struct waitq *)wqset);
1603                                         rl_ptr = &reserved_link;
1604                                         if (fp->f_flags & FP_INSELECT)
1605                                                 /* someone is already in select on this fp */
1606                                                 fp->f_flags |= FP_SELCONFLICT;
1607                                         else
1608                                                 fp->f_flags |= FP_INSELECT;
1609                                 }
1610
1611                                 context.vc_ucred = fp->f_cred;
1612
1613                                 /*
1614                                  * stash this value b/c fo_select may replace
1615                                  * reserved_link with a pointer to a waitq object
1616                                  */
1617                                 uint64_t rsvd = reserved_link;
1618
1619                                 /* The select; set the bit, if true */
1620                                 if (fp->f_ops && fp->f_type
1621                                         && fo_select(fp, flag[msk], rl_ptr, &context)) {
1622                                         optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1623                                         n++;
1624                                 }
1625                                 if (sel_pass == SEL_FIRSTPASS) {
1626                                         waitq_link_release(rsvd);
1627                                         /*
1628                                          * If the fp's supporting selinfo structure was linked
1629                                          * to this thread's waitq set, then 'reserved_link'
1630                                          * will have been updated by selrecord to be a pointer
1631                                          * to the selinfo's waitq.
1632                                          */
1633                                         if (reserved_link == rsvd)
1634                                                 rl_ptr = NULL; /* fo_select never called selrecord() */
1635                                         /*
1636                                          * Hook up the thread's waitq set either to
1637                                          * the fileproc structure, or to the global
1638                                          * conflict queue: but only on the first
1639                                          * select pass.
1640                                          */
1641                                         seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1642                                 }
1643                                 nc++;
1644                         }
1645                 }
1646         }
1647         proc_fdunlock(p);
1648
1649         *retval = n;
1650         return (0);
1651 }
1652
1653 int poll_callback(struct kqueue *, struct kevent_internal_s *, void *);
1654
1655 struct poll_continue_args {         /* state handed from poll_nocancel() to poll_callback() through kqueue_scan() */
1656         user_addr_t pca_fds;        /* user-space address of the caller's pollfd array (uap->fds) */
1657         u_int pca_nfds;             /* number of entries in that array */
1658         u_int pca_rfds;             /* running count of entries whose revents is non-zero */
1659 };
1660
/*
 * poll
 *
 * Entry point for poll: calls __pthread_testcancel(1) and then forwards
 * the request, with its argument block reinterpreted as a
 * struct poll_nocancel_args, to poll_nocancel().
 *
 * Returns:     whatever poll_nocancel() returns
 */
int
poll(struct proc *p, struct poll_args *uap, int32_t *retval)
{
        struct poll_nocancel_args *nc_uap = (struct poll_nocancel_args *)uap;

        __pthread_testcancel(1);

        return (poll_nocancel(p, nc_uap, retval));
}
1667
1668
1669 int
1670 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1671 {
1672         struct poll_continue_args *cont;
1673         struct pollfd *fds;
1674         struct kqueue *kq;
1675         struct timeval atv;
1676         int ncoll, error = 0;
1677         u_int nfds = uap->nfds;
1678         u_int rfds = 0;
1679         u_int i;
1680         size_t ni;
1681
1682         /*
1683          * This is kinda bogus.  We have fd limits, but that is not
1684          * really related to the size of the pollfd array.  Make sure
1685          * we let the process use at least FD_SETSIZE entries and at
1686          * least enough for the current limits.  We want to be reasonably
1687          * safe, but not overly restrictive.
1688          */
1689         if (nfds > OPEN_MAX ||
1690             (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1691                 return (EINVAL);
1692
1693         kq = kqueue_alloc(p, 0);
1694         if (kq == NULL)
1695                 return (EAGAIN);
1696
1697         ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1698         MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1699         if (NULL == cont) {
1700                 error = EAGAIN;
1701                 goto out;
1702         }
1703
1704         fds = (struct pollfd *)&cont[1];
1705         error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1706         if (error)
1707                 goto out;
1708
1709         if (uap->timeout != -1) {
1710                 struct timeval rtv;
1711
1712                 atv.tv_sec = uap->timeout / 1000;
1713                 atv.tv_usec = (uap->timeout % 1000) * 1000;
1714                 if (itimerfix(&atv)) {
1715                         error = EINVAL;
1716                         goto out;
1717                 }
1718                 getmicrouptime(&rtv);
1719                 timevaladd(&atv, &rtv);
1720         } else {
1721                 atv.tv_sec = 0;
1722                 atv.tv_usec = 0;
1723         }
1724
1725         /* JMM - all this P_SELECT stuff is bogus */
1726         ncoll = nselcoll;
1727         OSBitOrAtomic(P_SELECT, &p->p_flag);
1728         for (i = 0; i < nfds; i++) {
1729                 short events = fds[i].events;
1730
1731                 /* per spec, ignore fd values below zero */
1732                 if (fds[i].fd < 0) {
1733                         fds[i].revents = 0;
1734                         continue;
1735                 }
1736
1737                 /* convert the poll event into a kqueue kevent */
1738                 struct kevent_internal_s kev = {
1739                         .ident = fds[i].fd,
1740                         .flags = EV_ADD | EV_ONESHOT | EV_POLL,
1741                         .udata = CAST_USER_ADDR_T(&fds[i]) };
1742
1743                 /* Handle input events */
1744                 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1745                         kev.filter = EVFILT_READ;
1746                         if (events & ( POLLPRI | POLLRDBAND ))
1747                                 kev.flags |= EV_OOBAND;
1748                         kevent_register(kq, &kev, p);
1749                 }
1750
1751                 /* Handle output events */
1752                 if ((kev.flags & EV_ERROR) == 0 &&
1753                     (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) {
1754                         kev.filter = EVFILT_WRITE;
1755                         kevent_register(kq, &kev, p);
1756                 }
1757
1758                 /* Handle BSD extension vnode events */
1759                 if ((kev.flags & EV_ERROR) == 0 &&
1760                     (events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE ))) {
1761                         kev.filter = EVFILT_VNODE;
1762                         kev.fflags = 0;
1763                         if (events & POLLEXTEND)
1764                                 kev.fflags |= NOTE_EXTEND;
1765                         if (events & POLLATTRIB)
1766                                 kev.fflags |= NOTE_ATTRIB;
1767                         if (events & POLLNLINK)
1768                                 kev.fflags |= NOTE_LINK;
1769                         if (events & POLLWRITE)
1770                                 kev.fflags |= NOTE_WRITE;
1771                         kevent_register(kq, &kev, p);
1772                 }
1773
1774                 if (kev.flags & EV_ERROR) {
1775                         fds[i].revents = POLLNVAL;
1776                         rfds++;
1777                 } else
1778                         fds[i].revents = 0;
1779         }
1780
1781         /*
1782          * Did we have any trouble registering?
1783          * If user space passed 0 FDs, then respect any timeout value passed.
1784          * This is an extremely inefficient sleep. If user space passed one or
1785          * more FDs, and we had trouble registering _all_ of them, then bail
1786          * out. If a subset of the provided FDs failed to register, then we
1787          * will still call the kqueue_scan function.
1788          */
1789         if (nfds && (rfds == nfds))
1790                 goto done;
1791
1792         /* scan for, and possibly wait for, the kevents to trigger */
1793         cont->pca_fds = uap->fds;
1794         cont->pca_nfds = nfds;
1795         cont->pca_rfds = rfds;
1796         error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p);
1797         rfds = cont->pca_rfds;
1798
1799  done:
1800         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1801         /* poll is not restarted after signals... */
1802         if (error == ERESTART)
1803                 error = EINTR;
1804         if (error == EWOULDBLOCK)
1805                 error = 0;
1806         if (error == 0) {
1807                 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1808                 *retval = rfds;
1809         }
1810
1811  out:
1812         if (NULL != cont)
1813                 FREE(cont, M_TEMP);
1814
1815         kqueue_dealloc(kq);
1816         return (error);
1817 }
1818
1819 int
1820 poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data)
1821 {
1822         struct poll_continue_args *cont = (struct poll_continue_args *)data;
1823         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1824         short prev_revents = fds->revents;
1825         short mask = 0;
1826
1827         /* convert the results back into revents */
1828         if (kevp->flags & EV_EOF)
1829                 fds->revents |= POLLHUP;
1830         if (kevp->flags & EV_ERROR)
1831                 fds->revents |= POLLERR;
1832
1833         switch (kevp->filter) {
1834         case EVFILT_READ:
1835                 if (fds->revents & POLLHUP)
1836                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1837                 else {
1838                         mask = (POLLIN | POLLRDNORM);
1839                         if (kevp->flags & EV_OOBAND)
1840                                 mask |= (POLLPRI | POLLRDBAND);
1841                 }
1842                 fds->revents |= (fds->events & mask);
1843                 break;
1844
1845         case EVFILT_WRITE:
1846                 if (!(fds->revents & POLLHUP))
1847                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1848                 break;
1849
1850         case EVFILT_VNODE:
1851                 if (kevp->fflags & NOTE_EXTEND)
1852                         fds->revents |= (fds->events & POLLEXTEND);
1853                 if (kevp->fflags & NOTE_ATTRIB)
1854                         fds->revents |= (fds->events & POLLATTRIB);
1855                 if (kevp->fflags & NOTE_LINK)
1856                         fds->revents |= (fds->events & POLLNLINK);
1857                 if (kevp->fflags & NOTE_WRITE)
1858                         fds->revents |= (fds->events & POLLWRITE);
1859                 break;
1860         }
1861
1862         if (fds->revents != 0 && prev_revents == 0)
1863                 cont->pca_rfds++;
1864
1865         return 0;
1866 }
1867
1868 int
1869 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1870 {
1871
1872         return (1);
1873 }
1874
1875 /*
1876  * selcount
1877  *
1878  * Count the number of bits set in the input bit vector, and establish an
1879  * outstanding fp->f_iocount for each of the descriptors which will be in
1880  * use in the select operation.
1881  *
1882  * Parameters:  p                       The process doing the select
1883  *              ibits                   The input bit vector
1884  *              nfd                     The number of fd's in the vector
1885  *              countp                  Pointer to where to store the bit count
1886  *
1887  * Returns:     0                       Success
1888  *              EIO                     Bad per process open file table
1889  *              EBADF                   One of the bits in the input bit vector
1890  *                                              references an invalid fd
1891  *
1892  * Implicit:    *countp (modified)      Count of fd's
1893  *
1894  * Notes:       This function is the first pass under the proc_fdlock() that
1895  *              permits us to recognize invalid descriptors in the bit vector;
1896  *              the may, however, not remain valid through the drop and
1897  *              later reacquisition of the proc_fdlock().
1898  */
1899 static int
1900 selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
1901 {
1902         struct filedesc *fdp = p->p_fd;
1903         int msk, i, j, fd;
1904         u_int32_t bits;
1905         struct fileproc *fp;
1906         int n = 0;
1907         u_int32_t *iptr;
1908         u_int nw;
1909         int error=0;
1910         int dropcount;
1911         int need_wakeup = 0;
1912
1913         /*
1914          * Problems when reboot; due to MacOSX signal probs
1915          * in Beaker1C ; verify that the p->p_fd is valid
1916          */
1917         if (fdp == NULL) {
1918                 *countp = 0;
1919                 return(EIO);
1920         }
1921         nw = howmany(nfd, NFDBITS);
1922
1923         proc_fdlock(p);
1924         for (msk = 0; msk < 3; msk++) {
1925                 iptr = (u_int32_t *)&ibits[msk * nw];
1926                 for (i = 0; i < nfd; i += NFDBITS) {
1927                         bits = iptr[i/NFDBITS];
1928                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1929                                 bits &= ~(1 << j);
1930
1931                                 if (fd < fdp->fd_nfiles)
1932                                         fp = fdp->fd_ofiles[fd];
1933                                 else
1934                                         fp = NULL;
1935
1936                                 if (fp == NULL ||
1937                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1938                                                 *countp = 0;
1939                                                 error = EBADF;
1940                                                 goto bad;
1941                                 }
1942                                 fp->f_iocount++;
1943                                 n++;
1944                         }
1945                 }
1946         }
1947         proc_fdunlock(p);
1948
1949         *countp = n;
1950         return (0);
1951
1952 bad:
1953         dropcount = 0;
1954
1955         if (n == 0)
1956                 goto out;
1957         /* Ignore error return; it's already EBADF */
1958         (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
1959
1960 out:
1961         proc_fdunlock(p);
1962         if (need_wakeup) {
1963                 wakeup(&p->p_fpdrainwait);
1964         }
1965         return(error);
1966 }
1967
1968
1969 /*
1970  * seldrop_locked
1971  *
1972  * Drop outstanding wait queue references set up during selscan(); drop the
1973  * outstanding per fileproc f_iocount() picked up during the selcount().
1974  *
1975  * Parameters:  p                       Process performing the select
1976  *              ibits                   Input bit bector of fd's
1977  *              nfd                     Number of fd's
1978  *              lim                     Limit to number of vector entries to
1979  *                                              consider, or -1 for "all"
1980  *              inselect                True if
1981  *              need_wakeup             Pointer to flag to set to do a wakeup
1982  *                                      if f_iocont on any descriptor goes to 0
1983  *
1984  * Returns:     0                       Success
1985  *              EBADF                   One or more fds in the bit vector
1986  *                                              were invalid, but the rest
1987  *                                              were successfully dropped
1988  *
1989  * Notes:       An fd make become bad while the proc_fdlock() is not held,
1990  *              if a multithreaded application closes the fd out from under
1991  *              the in progress select.  In this case, we still have to
1992  *              clean up after the set up on the remaining fds.
1993  */
1994 static int
1995 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
1996 {
1997         struct filedesc *fdp = p->p_fd;
1998         int msk, i, j, nc, fd;
1999         u_int32_t bits;
2000         struct fileproc *fp;
2001         u_int32_t *iptr;
2002         u_int nw;
2003         int error = 0;
2004         int dropcount = 0;
2005         uthread_t uth = get_bsdthread_info(current_thread());
2006         struct _select_data *seldata;
2007
2008         *need_wakeup = 0;
2009
2010         /*
2011          * Problems when reboot; due to MacOSX signal probs
2012          * in Beaker1C ; verify that the p->p_fd is valid
2013          */
2014         if (fdp == NULL) {
2015                 return(EIO);
2016         }
2017
2018         nw = howmany(nfd, NFDBITS);
2019         seldata = &uth->uu_kevent.ss_select_data;
2020
2021         nc = 0;
2022         for (msk = 0; msk < 3; msk++) {
2023                 iptr = (u_int32_t *)&ibits[msk * nw];
2024                 for (i = 0; i < nfd; i += NFDBITS) {
2025                         bits = iptr[i/NFDBITS];
2026                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2027                                 bits &= ~(1 << j);
2028                                 fp = fdp->fd_ofiles[fd];
2029                                 /*
2030                                  * If we've already dropped as many as were
2031                                  * counted/scanned, then we are done.
2032                                  */
2033                                 if ((fromselcount != 0) && (++dropcount > lim))
2034                                         goto done;
2035
2036                                 /*
2037                                  * unlink even potentially NULL fileprocs.
2038                                  * If the FD was closed from under us, we
2039                                  * still need to cleanup the waitq links!
2040                                  */
2041                                 selunlinkfp(fp,
2042                                             seldata->wqp ? seldata->wqp[nc] : 0,
2043                                             uth->uu_wqset);
2044
2045                                 nc++;
2046
2047                                 if (fp == NULL) {
2048                                         /* skip (now) bad fds */
2049                                         error = EBADF;
2050                                         continue;
2051                                 }
2052
2053                                 fp->f_iocount--;
2054                                 if (fp->f_iocount < 0)
2055                                         panic("f_iocount overdecrement!");
2056
2057                                 if (fp->f_iocount == 0) {
2058                                         /*
2059                                          * The last iocount is responsible for clearing
2060                                          * selconfict flag - even if we didn't set it -
2061                                          * and is also responsible for waking up anyone
2062                                          * waiting on iocounts to drain.
2063                                          */
2064                                         if (fp->f_flags & FP_SELCONFLICT)
2065                                                 fp->f_flags &= ~FP_SELCONFLICT;
2066                                         if (p->p_fpdrainwait) {
2067                                                 p->p_fpdrainwait = 0;
2068                                                 *need_wakeup = 1;
2069                                         }
2070                                 }
2071                         }
2072                 }
2073         }
2074 done:
2075         return (error);
2076 }
2077
2078
2079 static int
2080 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
2081 {
2082         int error;
2083         int need_wakeup = 0;
2084
2085         proc_fdlock(p);
2086         error =  seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
2087         proc_fdunlock(p);
2088         if (need_wakeup) {
2089                 wakeup(&p->p_fpdrainwait);
2090         }
2091         return (error);
2092 }
2093
2094 /*
2095  * Record a select request.
2096  */
2097 void
2098 selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2099 {
2100         thread_t        cur_act = current_thread();
2101         struct uthread * ut = get_bsdthread_info(cur_act);
2102         /* on input, s_data points to the 64-bit ID of a reserved link object */
2103         uint64_t *reserved_link = (uint64_t *)s_data;
2104
2105         /* need to look at collisions */
2106
2107         /*do not record if this is second pass of select */
2108         if (!s_data)
2109                 return;
2110
2111         if ((sip->si_flags & SI_INITED) == 0) {
2112                 waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
2113                 sip->si_flags |= SI_INITED;
2114                 sip->si_flags &= ~SI_CLEAR;
2115         }
2116
2117         if (sip->si_flags & SI_RECORDED)
2118                 sip->si_flags |= SI_COLL;
2119         else
2120                 sip->si_flags &= ~SI_COLL;
2121
2122         sip->si_flags |= SI_RECORDED;
2123         /* note: this checks for pre-existing linkage */
2124         waitq_link(&sip->si_waitq, ut->uu_wqset,
2125                    WAITQ_SHOULD_LOCK, reserved_link);
2126
2127         /*
2128          * Always consume the reserved link.
2129          * We can always call waitq_link_release() safely because if
2130          * waitq_link is successful, it consumes the link and resets the
2131          * value to 0, in which case our call to release becomes a no-op.
2132          * If waitq_link fails, then the following release call will actually
2133          * release the reserved link object.
2134          */
2135         waitq_link_release(*reserved_link);
2136         *reserved_link = 0;
2137
2138         /*
2139          * Use the s_data pointer as an output parameter as well
2140          * This avoids changing the prototype for this function which is
2141          * used by many kexts. We need to surface the waitq object
2142          * associated with the selinfo we just added to the thread's select
2143          * set. New waitq sets do not have back-pointers to set members, so
2144          * the only way to clear out set linkage objects is to go from the
2145          * waitq to the set. We use a memcpy because s_data could be
2146          * pointing to an unaligned value on the stack
2147          * (especially on 32-bit systems)
2148          */
2149         void *wqptr = (void *)&sip->si_waitq;
2150         memcpy((void *)s_data, (void *)&wqptr, sizeof(void *));
2151
2152         return;
2153 }
2154
2155 void
2156 selwakeup(struct selinfo *sip)
2157 {
2158
2159         if ((sip->si_flags & SI_INITED) == 0) {
2160                 return;
2161         }
2162
2163         if (sip->si_flags & SI_COLL) {
2164                 nselcoll++;
2165                 sip->si_flags &= ~SI_COLL;
2166 #if 0
2167                 /* will not  support */
2168                 //wakeup((caddr_t)&selwait);
2169 #endif
2170         }
2171
2172         if (sip->si_flags & SI_RECORDED) {
2173                 waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
2174                                    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2175                 sip->si_flags &= ~SI_RECORDED;
2176         }
2177
2178 }
2179
2180 void
2181 selthreadclear(struct selinfo *sip)
2182 {
2183         struct waitq *wq;
2184
2185         if ((sip->si_flags & SI_INITED) == 0) {
2186                 return;
2187         }
2188         if (sip->si_flags & SI_RECORDED) {
2189                         selwakeup(sip);
2190                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
2191         }
2192         sip->si_flags |= SI_CLEAR;
2193         sip->si_flags &= ~SI_INITED;
2194
2195         wq = &sip->si_waitq;
2196
2197         /*
2198          * Higher level logic may have a handle on this waitq's prepost ID,
2199          * but that's OK because the waitq_deinit will remove/invalidate the
2200          * prepost object (as well as mark the waitq invalid). This de-couples
2201          * us from any callers that may have a handle to this waitq via the
2202          * prepost ID.
2203          */
2204         waitq_deinit(wq);
2205 }
2206
2207
2208
2209
2210 #define DBG_POST        0x10    /* sub-code behind DBG_MISC_POST (traced by postpipeevent/postevent) */
2211 #define DBG_WATCH       0x11
2212 #define DBG_WAIT        0x12
2213 #define DBG_MOD         0x13
2214 #define DBG_EWAKEUP     0x14
2215 #define DBG_ENQUEUE     0x15    /* sub-code behind DBG_MISC_ENQUEUE (traced by evprocenque) */
2216 #define DBG_DEQUEUE     0x16
2217
2218 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)    /* trace codes passed to KERNEL_DEBUG: DBG_EVENT combined with each sub-code above */
2219 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
2220 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
2221 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
2222 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
2223 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
2224 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
2225
2226
/*
 * EVPROCDEQUE(p, evq)
 *
 * Under the proc lock of 'p', unlink 'evq' from p->p_evlist if (and only
 * if) it is currently marked EV_QUEUED, and clear that mark.
 *
 * Expands to exactly one statement without a trailing semicolon, so the
 * caller writes 'EVPROCDEQUE(p, evq);' and the macro is safe in an
 * unbraced if/else.  Both arguments are evaluated more than once; do not
 * pass expressions with side effects.
 */
#define EVPROCDEQUE(p, evq)     do {                            \
        proc_lock(p);                                           \
        if ((evq)->ee_flags & EV_QUEUED) {                      \
                TAILQ_REMOVE(&(p)->p_evlist, (evq), ee_plist);  \
                (evq)->ee_flags &= ~EV_QUEUED;                  \
        }                                                       \
        proc_unlock(p);                                         \
} while (0)
2235
2236
2237 /*
2238  * called upon socket close. deque and free all events for
2239  * the socket...  socket must be locked by caller.
2240  */
2241 void
2242 evsofree(struct socket *sp)
2243 {
2244         struct eventqelt *evq, *next;
2245         proc_t  p;
2246
2247         if (sp == NULL)
2248                 return;
2249
2250         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
2251                 next = evq->ee_slist.tqe_next;
2252                 p = evq->ee_proc;
2253
2254                 if (evq->ee_flags & EV_QUEUED) {
2255                         EVPROCDEQUE(p, evq);
2256                 }
2257                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
2258                 FREE(evq, M_TEMP);
2259         }
2260 }
2261
2262
2263 /*
2264  * called upon pipe close. deque and free all events for
2265  * the pipe... pipe must be locked by caller
2266  */
2267 void
2268 evpipefree(struct pipe *cpipe)
2269 {
2270         struct eventqelt *evq, *next;
2271         proc_t  p;
2272
2273         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
2274                 next = evq->ee_slist.tqe_next;
2275                 p = evq->ee_proc;
2276
2277                 EVPROCDEQUE(p, evq);
2278
2279                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
2280                 FREE(evq, M_TEMP);
2281         }
2282 }
2283
2284
2285 /*
2286  * enqueue this event if it's not already queued. wakeup
2287  * the proc if we do queue this event to it...
2288  * entered with proc lock held... we drop it before
2289  * doing the wakeup and return in that state
2290  */
2291 static void
2292 evprocenque(struct eventqelt *evq)
2293 {
2294         proc_t  p;
2295
2296         assert(evq);
2297         p = evq->ee_proc;
2298
2299         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
2300
2301         proc_lock(p);
2302
2303         if (evq->ee_flags & EV_QUEUED) {
2304                 proc_unlock(p);
2305
2306                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2307                 return;
2308         }
2309         evq->ee_flags |= EV_QUEUED;
2310
2311         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
2312
2313         proc_unlock(p);
2314
2315         wakeup(&p->p_evlist);
2316
2317         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2318 }
2319
2320
2321 /*
2322  * pipe lock must be taken by the caller
2323  */
2324 void
2325 postpipeevent(struct pipe *pipep, int event)
2326 {
2327         int     mask;
2328         struct eventqelt *evq;
2329
2330         if (pipep == NULL)
2331                 return;
2332         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
2333
2334         for (evq = pipep->pipe_evlist.tqh_first;
2335              evq != NULL; evq = evq->ee_slist.tqe_next) {
2336
2337                 if (evq->ee_eventmask == 0)
2338                         continue;
2339                 mask = 0;
2340
2341                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
2342
2343                 case EV_RWBYTES:
2344                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
2345                           mask |= EV_RE;
2346                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
2347                   }
2348                   if ((evq->ee_eventmask & EV_WR) &&
2349                       (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
2350
2351                           if (pipep->pipe_state & PIPE_EOF) {
2352                                   mask |= EV_WR|EV_RESET;
2353                                   break;
2354                           }
2355                           mask |= EV_WR;
2356                           evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
2357                   }
2358                   break;
2359
2360                 case EV_WCLOSED:
2361                 case EV_RCLOSED:
2362                   if ((evq->ee_eventmask & EV_RE)) {
2363                           mask |= EV_RE|EV_RCLOSED;
2364                   }
2365                   if ((evq->ee_eventmask & EV_WR)) {
2366                           mask |= EV_WR|EV_WCLOSED;
2367                   }
2368                   break;
2369
2370                 default:
2371                   return;
2372                 }
2373                 if (mask) {
2374                         /*
2375                          * disarm... postevents are nops until this event is 'read' via
2376                          * waitevent and then re-armed via modwatch
2377                          */
2378                         evq->ee_eventmask = 0;
2379
2380                         /*
2381                          * since events are disarmed until after the waitevent
2382                          * the ee_req.er_xxxx fields can't change once we've
2383                          * inserted this event into the proc queue...
2384                          * therefore, the waitevent will see a 'consistent'
2385                          * snapshot of the event, even though it won't hold
2386                          * the pipe lock, and we're updating the event outside
2387                          * of the proc lock, which it will hold
2388                          */
2389                         evq->ee_req.er_eventbits |= mask;
2390
2391                         KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
2392
2393                         evprocenque(evq);
2394                 }
2395         }
2396         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2397 }
2398
2399 #if SOCKETS
2400 /*
2401  * given either a sockbuf or a socket run down the
2402  * event list and queue ready events found...
2403  * the socket must be locked by the caller
2404  */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
        struct eventqelt *ev;
        struct tcpcb *tcb;
        int bits;

        /* when a sockbuf is supplied, its owning socket takes precedence */
        if (sb)
                sp = sb->sb_so;
        if (sp == NULL)
                return;

        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

        for (ev = sp->so_evlist.tqh_first; ev != NULL; ev = ev->ee_slist.tqe_next) {

                /* a zero mask means this watcher is disarmed; nothing to post */
                if (ev->ee_eventmask == 0)
                        continue;
                bits = 0;

                /*
                 * ready for reading:
                 *   - byte cnt >= receive low water mark
                 *   - read-half of conn closed
                 *   - conn pending for listening sock
                 *   - socket error pending
                 *
                 * ready for writing:
                 *   - byte cnt avail >= send low water mark
                 *   - write half of conn closed
                 *   - socket error pending
                 *   - non-blocking conn completed successfully
                 *
                 * exception pending:
                 *   - out of band data
                 *   - sock at out of band mark
                 */
                switch (event & EV_DMASK) {

                case EV_OOB:
                        if ((ev->ee_eventmask & EV_EX) &&
                            (sp->so_oobmark || (sp->so_state & SS_RCVATMARK)))
                                bits |= EV_EX|EV_OOB;
                        break;

                case EV_RWBYTES|EV_OOB:
                        if ((ev->ee_eventmask & EV_EX) &&
                            (sp->so_oobmark || (sp->so_state & SS_RCVATMARK)))
                                bits |= EV_EX|EV_OOB;
                        /* FALLTHROUGH: also evaluate the read/write byte conditions */
                case EV_RWBYTES:
                        if ((ev->ee_eventmask & EV_RE) && soreadable(sp)) {
                                /* for AFP/OT purposes; may go away in future */
                                if ((SOCK_DOM(sp) == PF_INET || SOCK_DOM(sp) == PF_INET6) &&
                                    SOCK_PROTO(sp) == IPPROTO_TCP &&
                                    (sp->so_error == ECONNREFUSED ||
                                     sp->so_error == ECONNRESET) &&
                                    (sp->so_pcb == NULL ||
                                     sotoinpcb(sp)->inp_state == INPCB_STATE_DEAD ||
                                     (tcb = sototcpcb(sp)) == NULL ||
                                     tcb->t_state == TCPS_CLOSED)) {
                                        /* report the reset and skip the write check */
                                        bits |= EV_RE|EV_RESET;
                                        break;
                                }
                                bits |= EV_RE;
                                ev->ee_req.er_rcnt = sp->so_rcv.sb_cc;

                                if (sp->so_state & SS_CANTRCVMORE) {
                                        bits |= EV_FIN;
                                        break;
                                }
                        }
                        if ((ev->ee_eventmask & EV_WR) && sowriteable(sp)) {
                                /* for AFP/OT purposes; may go away in future */
                                if ((SOCK_DOM(sp) == PF_INET || SOCK_DOM(sp) == PF_INET6) &&
                                    SOCK_PROTO(sp) == IPPROTO_TCP &&
                                    (sp->so_error == ECONNREFUSED ||
                                     sp->so_error == ECONNRESET) &&
                                    (sp->so_pcb == NULL ||
                                     sotoinpcb(sp)->inp_state == INPCB_STATE_DEAD ||
                                     (tcb = sototcpcb(sp)) == NULL ||
                                     tcb->t_state == TCPS_CLOSED)) {
                                        bits |= EV_WR|EV_RESET;
                                        break;
                                }
                                bits |= EV_WR;
                                ev->ee_req.er_wcnt = sbspace(&sp->so_snd);
                        }
                        break;

                case EV_RCONN:
                        if (ev->ee_eventmask & EV_RE) {
                                bits |= EV_RE|EV_RCONN;
                                ev->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
                        }
                        break;

                case EV_WCONN:
                        if (ev->ee_eventmask & EV_WR)
                                bits |= EV_WR|EV_WCONN;
                        break;

                case EV_RCLOSED:
                        if (ev->ee_eventmask & EV_RE)
                                bits |= EV_RE|EV_RCLOSED;
                        break;

                case EV_WCLOSED:
                        if (ev->ee_eventmask & EV_WR)
                                bits |= EV_WR|EV_WCLOSED;
                        break;

                case EV_FIN:
                        if (ev->ee_eventmask & EV_RE)
                                bits |= EV_RE|EV_FIN;
                        break;

                case EV_RESET:
                case EV_TIMEOUT:
                        if (ev->ee_eventmask & EV_RE)
                                bits |= EV_RE | event;
                        if (ev->ee_eventmask & EV_WR)
                                bits |= EV_WR | event;
                        break;

                default:
                        /* unrecognized event: stop walking the list altogether */
                        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
                        return;
                } /* switch */

                KERNEL_DEBUG(DBG_MISC_POST, (int)ev, ev->ee_eventmask, ev->ee_req.er_eventbits, bits, 0);

                if (bits == 0)
                        continue;

                /*
                 * disarm... postevents are nops until this event is 'read' via
                 * waitevent and then re-armed via modwatch
                 */
                ev->ee_eventmask = 0;

                /*
                 * since events stay disarmed until after the waitevent,
                 * the ee_req.er_xxxx fields can't change once this event
                 * has been inserted into the proc queue.  waitevent can't
                 * see the event until it is enqueued, so it observes a
                 * consistent snapshot even though it won't hold the socket
                 * lock and this update happens outside of the proc lock.
                 */
                ev->ee_req.er_eventbits |= bits;

                evprocenque(ev);
        }
        KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}
2578 #endif /* SOCKETS */
2579
2580
2581 /*
2582  * watchevent system call. user passes us an event to watch
2583  * for. we malloc an event object, initialize it, and queue
2584  * it to the open socket. when the event occurs, postevent()
2585  * will enque it back to our proc where we can retrieve it
2586  * via waitevent().
2587  *
2588  * should this prevent duplicate events on same socket?
2589  *
2590  * Returns:
2591  *              ENOMEM                  No memory for operation
2592  *      copyin:EFAULT
2593  */
2594 int
2595 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2596 {
2597         struct eventqelt *evq = (struct eventqelt *)0;
2598         struct eventqelt *np = NULL;
2599         struct eventreq64 *erp;
2600         struct fileproc *fp = NULL;
2601         int error;
2602
2603         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2604
2605         // get a qelt and fill with users req
2606         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2607
2608         if (evq == NULL)
2609                 return (ENOMEM);
2610         erp = &evq->ee_req;
2611
2612         // get users request pkt
2613
2614         if (IS_64BIT_PROCESS(p)) {
2615                 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2616         } else {
2617                 struct eventreq32 er32;
2618
2619                 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2620                 if (error == 0) {
2621                        /*
2622                         * the user only passes in the
2623                         * er_type, er_handle and er_data...
2624                         * the other fields are initialized
2625                         * below, so don't bother to copy
2626                         */
2627                         erp->er_type = er32.er_type;
2628                         erp->er_handle = er32.er_handle;
2629                         erp->er_data = (user_addr_t)er32.er_data;
2630                 }
2631         }
2632         if (error) {
2633                 FREE(evq, M_TEMP);
2634                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2635
2636                 return(error);
2637         }
2638         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2639
2640         // validate, freeing qelt if errors
2641         error = 0;
2642         proc_fdlock(p);
2643
2644         if (erp->er_type != EV_FD) {
2645                 error = EINVAL;
2646         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2647                 error = EBADF;
2648 #if SOCKETS
2649         } else if (fp->f_type == DTYPE_SOCKET) {
2650                 socket_lock((struct socket *)fp->f_data, 1);
2651                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2652 #endif /* SOCKETS */
2653         } else if (fp->f_type == DTYPE_PIPE) {
2654                 PIPE_LOCK((struct pipe *)fp->f_data);
2655                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2656         } else {
2657                 fp_drop(p, erp->er_handle, fp, 1);
2658                 error = EINVAL;
2659         }
2660         proc_fdunlock(p);
2661
2662         if (error) {
2663                 FREE(evq, M_TEMP);
2664
2665                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2666                 return(error);
2667         }
2668
2669         /*
2670          * only allow one watch per file per proc
2671          */
2672         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2673                 if (np->ee_proc == p) {
2674 #if SOCKETS
2675                         if (fp->f_type == DTYPE_SOCKET)
2676                                 socket_unlock((struct socket *)fp->f_data, 1);
2677                         else
2678 #endif /* SOCKETS */
2679                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2680                         fp_drop(p, erp->er_handle, fp, 0);
2681                         FREE(evq, M_TEMP);
2682
2683                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2684                         return(EINVAL);
2685                 }
2686         }
2687         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2688         evq->ee_proc = p;
2689         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2690         evq->ee_flags = 0;
2691
2692 #if SOCKETS
2693         if (fp->f_type == DTYPE_SOCKET) {
2694                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2695                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2696
2697                 socket_unlock((struct socket *)fp->f_data, 1);
2698         } else
2699 #endif /* SOCKETS */
2700         {
2701                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2702                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2703
2704                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2705         }
2706         fp_drop_event(p, erp->er_handle, fp);
2707
2708         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2709         return(0);
2710 }
2711
2712
2713
2714 /*
2715  * waitevent system call.
2716  * grabs the next waiting event for this proc and returns
2717  * it. if no events, user can request to sleep with timeout
2718  * or without or poll mode
2719  *    ((tv != NULL && interval == 0) || tv == -1)
2720  */
2721 int
2722 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2723 {
2724         int error = 0;
2725         struct eventqelt *evq;
2726         struct eventreq64 *erp;
2727         uint64_t abstime, interval;
2728         boolean_t fast_poll = FALSE;
2729         union {
2730                 struct eventreq64 er64;
2731                 struct eventreq32 er32;
2732         } uer;
2733
2734         interval = 0;
2735
2736         if (uap->tv) {
2737                 struct timeval atv;
2738                 /*
2739                  * check for fast poll method
2740                  */
2741                 if (IS_64BIT_PROCESS(p)) {
2742                         if (uap->tv == (user_addr_t)-1)
2743                                 fast_poll = TRUE;
2744                 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2745                         fast_poll = TRUE;
2746
2747                 if (fast_poll == TRUE) {
2748                         if (p->p_evlist.tqh_first == NULL) {
2749                                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2750                                 /*
2751                                  * poll failed
2752                                  */
2753                                 *retval = 1;
2754                                 return (0);
2755                         }
2756                         proc_lock(p);
2757                         goto retry;
2758                 }
2759                 if (IS_64BIT_PROCESS(p)) {
2760                         struct user64_timeval atv64;
2761                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
2762                         /* Loses resolution - assume timeout < 68 years */
2763                         atv.tv_sec = atv64.tv_sec;
2764                         atv.tv_usec = atv64.tv_usec;
2765                 } else {
2766                         struct user32_timeval atv32;
2767                         error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
2768                         atv.tv_sec = atv32.tv_sec;
2769                         atv.tv_usec = atv32.tv_usec;
2770                 }
2771
2772                 if (error)
2773                         return(error);
2774                 if (itimerfix(&atv)) {
2775                         error = EINVAL;
2776                         return(error);
2777                 }
2778                 interval = tvtoabstime(&atv);
2779         }
2780         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2781
2782         proc_lock(p);
2783 retry:
2784         if ((evq = p->p_evlist.tqh_first) != NULL) {
2785                 /*
2786                  * found one... make a local copy while it's still on the queue
2787                  * to prevent it from changing while in the midst of copying
2788                  * don't want to hold the proc lock across a copyout because
2789                  * it might block on a page fault at the target in user space
2790                  */
2791                 erp = &evq->ee_req;
2792
2793                 if (IS_64BIT_PROCESS(p))
2794                         bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2795                 else {
2796                         uer.er32.er_type  = erp->er_type;
2797                         uer.er32.er_handle  = erp->er_handle;
2798                         uer.er32.er_data  = (uint32_t)erp->er_data;
2799                         uer.er32.er_ecnt  = erp->er_ecnt;
2800                         uer.er32.er_rcnt  = erp->er_rcnt;
2801                         uer.er32.er_wcnt  = erp->er_wcnt;
2802                         uer.er32.er_eventbits = erp->er_eventbits;
2803                 }
2804                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2805
2806                 evq->ee_flags &= ~EV_QUEUED;
2807
2808                 proc_unlock(p);
2809
2810                 if (IS_64BIT_PROCESS(p))
2811                         error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2812                 else
2813                         error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2814
2815                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2816                              evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2817                 return (error);
2818         }
2819         else {
2820                 if (uap->tv && interval == 0) {
2821                         proc_unlock(p);
2822                         *retval = 1;  // poll failed
2823
2824                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2825                         return (error);
2826                 }
2827                 if (interval != 0)
2828                         clock_absolutetime_interval_to_deadline(interval, &abstime);
2829                 else
2830                         abstime = 0;
2831
2832                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2833
2834                 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2835
2836                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2837
2838                 if (error == 0)
2839                         goto retry;
2840                 if (error == ERESTART)
2841                         error = EINTR;
2842                 if (error == EWOULDBLOCK) {
2843                         *retval = 1;
2844                         error = 0;
2845                 }
2846         }
2847         proc_unlock(p);
2848
2849         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2850         return (error);
2851 }
2852
2853
2854 /*
2855  * modwatch system call. user passes in event to modify.
2856  * if we find it we reset the event bits and que/deque event
2857  * it needed.
2858  */
2859 int
2860 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2861 {
2862         struct eventreq64 er;
2863         struct eventreq64 *erp = &er;
2864         struct eventqelt *evq = NULL;   /* protected by error return */
2865         int error;
2866         struct fileproc *fp;
2867         int flag;
2868
2869         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2870
2871         /*
2872          * get user's request pkt
2873          * just need the er_type and er_handle which sit above the
2874          * problematic er_data (32/64 issue)... so only copy in
2875          * those 2 fields
2876          */
2877         if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2878                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2879                 return(error);
2880         }
2881         proc_fdlock(p);
2882
2883         if (erp->er_type != EV_FD) {
2884                 error = EINVAL;
2885         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2886                 error = EBADF;
2887 #if SOCKETS
2888         } else if (fp->f_type == DTYPE_SOCKET) {
2889                 socket_lock((struct socket *)fp->f_data, 1);
2890                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2891 #endif /* SOCKETS */
2892         } else if (fp->f_type == DTYPE_PIPE) {
2893                 PIPE_LOCK((struct pipe *)fp->f_data);
2894                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2895         } else {
2896                 fp_drop(p, erp->er_handle, fp, 1);
2897                 error = EINVAL;
2898         }
2899
2900         if (error) {
2901                 proc_fdunlock(p);
2902                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2903                 return(error);
2904         }
2905
2906         if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2907                 fp->f_flags &= ~FP_WAITEVENT;
2908         }
2909         proc_fdunlock(p);
2910
2911         // locate event if possible
2912         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2913                 if (evq->ee_proc == p)
2914                         break;
2915         }
2916         if (evq == NULL) {
2917 #if SOCKETS
2918                 if (fp->f_type == DTYPE_SOCKET)
2919                         socket_unlock((struct socket *)fp->f_data, 1);
2920                 else
2921 #endif /* SOCKETS */
2922                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2923                 fp_drop(p, erp->er_handle, fp, 0);
2924                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2925                 return(EINVAL);
2926         }
2927         KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2928
2929         if (uap->u_eventmask == EV_RM) {
2930                 EVPROCDEQUE(p, evq);
2931
2932 #if SOCKETS
2933                 if (fp->f_type == DTYPE_SOCKET) {
2934                         TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2935                         socket_unlock((struct socket *)fp->f_data, 1);
2936                 } else
2937 #endif /* SOCKETS */
2938                 {
2939                         TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2940                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2941                 }
2942                 fp_drop(p, erp->er_handle, fp, 0);
2943                 FREE(evq, M_TEMP);
2944                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2945                 return(0);
2946         }
2947         switch (uap->u_eventmask & EV_MASK) {
2948
2949         case 0:
2950                 flag = 0;
2951                 break;
2952
2953         case EV_RE:
2954         case EV_WR:
2955         case EV_RE|EV_WR:
2956                 flag = EV_RWBYTES;
2957                 break;
2958
2959         case EV_EX:
2960                 flag = EV_OOB;
2961                 break;
2962
2963         case EV_EX|EV_RE:
2964         case EV_EX|EV_WR:
2965         case EV_EX|EV_RE|EV_WR:
2966                 flag = EV_OOB|EV_RWBYTES;
2967                 break;
2968
2969         default:
2970 #if SOCKETS
2971                 if (fp->f_type == DTYPE_SOCKET)
2972                         socket_unlock((struct socket *)fp->f_data, 1);
2973                 else
2974 #endif /* SOCKETS */
2975                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2976                 fp_drop(p, erp->er_handle, fp, 0);
2977                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2978                 return(EINVAL);
2979         }
2980         /*
2981          * since we're holding the socket/pipe lock, the event
2982          * cannot go from the unqueued state to the queued state
2983          * however, it can go from the queued state to the unqueued state
2984          * since that direction is protected by the proc_lock...
2985          * so do a quick check for EV_QUEUED w/o holding the proc lock
2986          * since by far the common case will be NOT EV_QUEUED, this saves
2987          * us taking the proc_lock the majority of the time
2988          */
2989         if (evq->ee_flags & EV_QUEUED) {
2990                 /*
2991                  * EVPROCDEQUE will recheck the state after it grabs the proc_lock
2992                  */
2993                 EVPROCDEQUE(p, evq);
2994         }
2995         /*
2996          * while the event is off the proc queue and
2997          * we're holding the socket/pipe lock
2998          * it's safe to update these fields...
2999          */
3000         evq->ee_req.er_eventbits = 0;
3001         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
3002
3003 #if SOCKETS
3004         if (fp->f_type == DTYPE_SOCKET) {
3005                 postevent((struct socket *)fp->f_data, 0, flag);
3006                 socket_unlock((struct socket *)fp->f_data, 1);
3007         } else
3008 #endif /* SOCKETS */
3009         {
3010                 postpipeevent((struct pipe *)fp->f_data, flag);
3011                 PIPE_UNLOCK((struct pipe *)fp->f_data);
3012         }
3013         fp_drop(p, erp->er_handle, fp, 0);
3014         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
3015         return(0);
3016 }
3017
3018 /* this routine is called from the close of fd with proc_fdlock held */
3019 int
3020 waitevent_close(struct proc *p, struct fileproc *fp)
3021 {
3022         struct eventqelt *evq;
3023
3024
3025         fp->f_flags &= ~FP_WAITEVENT;
3026
3027 #if SOCKETS
3028         if (fp->f_type == DTYPE_SOCKET) {
3029                 socket_lock((struct socket *)fp->f_data, 1);
3030                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
3031         } else
3032 #endif /* SOCKETS */
3033         if (fp->f_type == DTYPE_PIPE) {
3034                 PIPE_LOCK((struct pipe *)fp->f_data);
3035                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
3036         }
3037         else {
3038                 return(EINVAL);
3039         }
3040         proc_fdunlock(p);
3041
3042
3043         // locate event if possible
3044         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
3045                 if (evq->ee_proc == p)
3046                         break;
3047         }
3048         if (evq == NULL) {
3049 #if SOCKETS
3050                 if (fp->f_type == DTYPE_SOCKET)
3051                         socket_unlock((struct socket *)fp->f_data, 1);
3052                 else
3053 #endif /* SOCKETS */
3054                         PIPE_UNLOCK((struct pipe *)fp->f_data);
3055
3056                 proc_fdlock(p);
3057
3058                 return(EINVAL);
3059         }
3060         EVPROCDEQUE(p, evq);
3061
3062 #if SOCKETS
3063         if (fp->f_type == DTYPE_SOCKET) {
3064                 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
3065                 socket_unlock((struct socket *)fp->f_data, 1);
3066         } else
3067 #endif /* SOCKETS */
3068         {
3069                 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
3070                 PIPE_UNLOCK((struct pipe *)fp->f_data);
3071         }
3072         FREE(evq, M_TEMP);
3073
3074         proc_fdlock(p);
3075
3076         return(0);
3077 }
3078
3079
3080 /*
3081  * gethostuuid
3082  *
3083  * Description: Get the host UUID from IOKit and return it to user space.
3084  *
3085  * Parameters:  uuid_buf                Pointer to buffer to receive UUID
3086  *              timeout                 Timespec for timout
3087  *              spi                             SPI, skip sandbox check (temporary)
3088  *
3089  * Returns:     0                       Success
3090  *              EWOULDBLOCK             Timeout is too short
3091  *              copyout:EFAULT          Bad user buffer
3092  *              mac_system_check_info:EPERM             Client not allowed to perform this operation
3093  *
3094  * Notes:       A timeout seems redundant, since if it's tolerable to not
3095  *              have a system UUID in hand, then why ask for one?
3096  */
3097 int
3098 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
3099 {
3100         kern_return_t kret;
3101         int error;
3102         mach_timespec_t mach_ts;        /* for IOKit call */
3103         __darwin_uuid_t uuid_kern;      /* for IOKit call */
3104
3105         if (!uap->spi) {
3106         }
3107
3108         /* Convert the 32/64 bit timespec into a mach_timespec_t */
3109         if ( proc_is64bit(p) ) {
3110                 struct user64_timespec ts;
3111                 error = copyin(uap->timeoutp, &ts, sizeof(ts));
3112                 if (error)
3113                         return (error);
3114                 mach_ts.tv_sec = ts.tv_sec;
3115                 mach_ts.tv_nsec = ts.tv_nsec;
3116         } else {
3117                 struct user32_timespec ts;
3118                 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
3119                 if (error)
3120                         return (error);
3121                 mach_ts.tv_sec = ts.tv_sec;
3122                 mach_ts.tv_nsec = ts.tv_nsec;
3123         }
3124
3125         /* Call IOKit with the stack buffer to get the UUID */
3126         kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
3127
3128         /*
3129          * If we get it, copy out the data to the user buffer; note that a
3130          * uuid_t is an array of characters, so this is size invariant for
3131          * 32 vs. 64 bit.
3132          */
3133         if (kret == KERN_SUCCESS) {
3134                 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
3135         } else {
3136                 error = EWOULDBLOCK;
3137         }
3138
3139         return (error);
3140 }
3141
3142 /*
3143  * ledger
3144  *
3145  * Description: Omnibus system call for ledger operations
3146  */
3147 int
3148 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
3149 {
3150 #if !CONFIG_MACF
3151 #pragma unused(p)
3152 #endif
3153         int rval, pid, len, error;
3154 #ifdef LEDGER_DEBUG
3155         struct ledger_limit_args lla;
3156 #endif
3157         task_t task;
3158         proc_t proc;
3159
3160         /* Finish copying in the necessary args before taking the proc lock */
3161         error = 0;
3162         len = 0;
3163         if (args->cmd == LEDGER_ENTRY_INFO)
3164                 error = copyin(args->arg3, (char *)&len, sizeof (len));
3165         else if (args->cmd == LEDGER_TEMPLATE_INFO)
3166                 error = copyin(args->arg2, (char *)&len, sizeof (len));
3167 #ifdef LEDGER_DEBUG
3168         else if (args->cmd == LEDGER_LIMIT)
3169                 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
3170 #endif
3171         else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD))
3172                 return (EINVAL);
3173
3174         if (error)
3175                 return (error);
3176         if (len < 0)
3177                 return (EINVAL);
3178
3179         rval = 0;
3180         if (args->cmd != LEDGER_TEMPLATE_INFO) {
3181                 pid = args->arg1;
3182                 proc = proc_find(pid);
3183                 if (proc == NULL)
3184                         return (ESRCH);
3185
3186 #if CONFIG_MACF
3187                 error = mac_proc_check_ledger(p, proc, args->cmd);
3188                 if (error) {
3189                         proc_rele(proc);
3190                         return (error);
3191                 }
3192 #endif
3193
3194                 task = proc->task;
3195         }
3196
3197         switch (args->cmd) {
3198 #ifdef LEDGER_DEBUG
3199                 case LEDGER_LIMIT: {
3200                         if (!kauth_cred_issuser(kauth_cred_get()))
3201                                 rval = EPERM;
3202                         rval = ledger_limit(task, &lla);
3203                         proc_rele(proc);
3204                         break;
3205                 }
3206 #endif
3207                 case LEDGER_INFO: {
3208                         struct ledger_info info;
3209
3210                         rval = ledger_info(task, &info);
3211                         proc_rele(proc);
3212                         if (rval == 0)
3213                                 rval = copyout(&info, args->arg2,
3214                                     sizeof (info));
3215                         break;
3216                 }
3217
3218                 case LEDGER_ENTRY_INFO: {
3219                         void *buf;
3220                         int sz;
3221
3222                         rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
3223                         proc_rele(proc);
3224                         if ((rval == 0) && (len >= 0)) {
3225                                 sz = len * sizeof (struct ledger_entry_info);
3226                                 rval = copyout(buf, args->arg2, sz);
3227                                 kfree(buf, sz);
3228                         }
3229                         if (rval == 0)
3230                                 rval = copyout(&len, args->arg3, sizeof (len));
3231                         break;
3232                 }
3233
3234                 case LEDGER_TEMPLATE_INFO: {
3235                         void *buf;
3236                         int sz;
3237
3238                         rval = ledger_template_info(&buf, &len);
3239                         if ((rval == 0) && (len >= 0)) {
3240                                 sz = len * sizeof (struct ledger_template_info);
3241                                 rval = copyout(buf, args->arg1, sz);
3242                                 kfree(buf, sz);
3243                         }
3244                         if (rval == 0)
3245                                 rval = copyout(&len, args->arg2, sizeof (len));
3246                         break;
3247                 }
3248
3249                 default:
3250                         panic("ledger syscall logic error -- command type %d", args->cmd);
3251                         proc_rele(proc);
3252                         rval = EINVAL;
3253         }
3254
3255         return (rval);
3256 }
3257
3258 int
3259 telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
3260 {
3261         int error = 0;
3262
3263         switch (args->cmd) {
3264 #if CONFIG_TELEMETRY
3265         case TELEMETRY_CMD_TIMER_EVENT:
3266                 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
3267                 break;
3268 #endif /* CONFIG_TELEMETRY */
3269         case TELEMETRY_CMD_VOUCHER_NAME:
3270                 if (thread_set_voucher_name((mach_port_name_t)args->deadline))
3271                         error = EINVAL;
3272                 break;
3273
3274         default:
3275                 error = EINVAL;
3276                 break;
3277         }
3278
3279         return (error);
3280 }
3281
3282 #if defined(DEVELOPMENT) || defined(DEBUG)
3283 #if CONFIG_WAITQ_DEBUG
3284 static uint64_t g_wqset_num = 0;
3285 struct g_wqset {
3286         queue_chain_t      link;
3287         struct waitq_set  *wqset;
3288 };
3289
3290 static queue_head_t         g_wqset_list;
3291 static struct waitq_set    *g_waitq_set = NULL;
3292
3293 static inline struct waitq_set *sysctl_get_wqset(int idx)
3294 {
3295         struct g_wqset *gwqs;
3296
3297         if (!g_wqset_num)
3298                 queue_init(&g_wqset_list);
3299
3300         /* don't bother with locks: this is test-only code! */
3301         qe_foreach_element(gwqs, &g_wqset_list, link) {
3302                 if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx)
3303                         return gwqs->wqset;
3304         }
3305
3306         /* allocate a new one */
3307         ++g_wqset_num;
3308         gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs));
3309         assert(gwqs != NULL);
3310
3311         gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL);
3312         enqueue_tail(&g_wqset_list, &gwqs->link);
3313         printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
3314
3315         return gwqs->wqset;
3316 }
3317
3318 #define MAX_GLOBAL_TEST_QUEUES 64
3319 static int g_wq_init = 0;
3320 static struct waitq  g_wq[MAX_GLOBAL_TEST_QUEUES];
3321
3322 static inline struct waitq *global_test_waitq(int idx)
3323 {
3324         if (idx < 0)
3325                 return NULL;
3326
3327         if (!g_wq_init) {
3328                 g_wq_init = 1;
3329                 for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++)
3330                         waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
3331         }
3332
3333         return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
3334 }
3335
3336 static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
3337 {
3338 #pragma unused(oidp, arg1, arg2)
3339         int error;
3340         int index;
3341         struct waitq *waitq;
3342         kern_return_t kr;
3343         int64_t event64 = 0;
3344
3345         error = SYSCTL_IN(req, &event64, sizeof(event64));
3346         if (error)
3347                 return error;
3348
3349         if (!req->newptr)
3350                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3351
3352         if (event64 < 0) {
3353                 index = (int)((-event64) & 0xffffffff);
3354                 waitq = wqset_waitq(sysctl_get_wqset(index));
3355                 index = -index;
3356         } else {
3357                 index = (int)event64;
3358                 waitq = global_test_waitq(index);
3359         }
3360
3361         event64 = 0;
3362
3363         printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
3364                index, event64);
3365         kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
3366                                 WAITQ_ALL_PRIORITIES);
3367         printf("[WQ]: \tkr=%d\n", kr);
3368
3369         return SYSCTL_OUT(req, &kr, sizeof(kr));
3370 }
3371 SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3372             0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
3373
3374
3375 static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
3376 {
3377 #pragma unused(oidp, arg1, arg2)
3378         int error;
3379         int index;
3380         struct waitq *waitq;
3381         kern_return_t kr;
3382         int64_t event64 = 0;
3383
3384         error = SYSCTL_IN(req, &event64, sizeof(event64));
3385         if (error)
3386                 return error;
3387
3388         if (!req->newptr)
3389                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3390
3391         if (event64 < 0) {
3392                 index = (int)((-event64) & 0xffffffff);
3393                 waitq = wqset_waitq(sysctl_get_wqset(index));
3394                 index = -index;
3395         } else {
3396                 index = (int)event64;
3397                 waitq = global_test_waitq(index);
3398         }
3399
3400         event64 = 0;
3401
3402         printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
3403                index, event64);
3404         kr = waitq_wakeup64_all(waitq, (event64_t)event64,
3405                                 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
3406         printf("[WQ]: \tkr=%d\n", kr);
3407
3408         return SYSCTL_OUT(req, &kr, sizeof(kr));
3409 }
3410 SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3411             0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
3412
3413
3414 static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
3415 {
3416 #pragma unused(oidp, arg1, arg2)
3417         int error;
3418         int index;
3419         struct waitq *waitq;
3420         kern_return_t kr;
3421         int64_t event64 = 0;
3422
3423         error = SYSCTL_IN(req, &event64, sizeof(event64));
3424         if (error)
3425                 return error;
3426
3427         if (!req->newptr)
3428                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3429
3430         if (event64 < 0) {
3431                 index = (int)((-event64) & 0xffffffff);
3432                 waitq = wqset_waitq(sysctl_get_wqset(index));
3433                 index = -index;
3434         } else {
3435                 index = (int)event64;
3436                 waitq = global_test_waitq(index);
3437         }
3438
3439         event64 = 0;
3440
3441         printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
3442                index, event64);
3443         kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
3444         if (kr == THREAD_WAITING)
3445                 thread_block(THREAD_CONTINUE_NULL);
3446         printf("[WQ]: \tWoke Up: kr=%d\n", kr);
3447
3448         return SYSCTL_OUT(req, &kr, sizeof(kr));
3449 }
3450 SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3451             0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
3452
3453
3454 static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
3455 {
3456 #pragma unused(oidp, arg1, arg2)
3457         int error;
3458         struct waitq_set *wqset;
3459         uint64_t event64 = 0;
3460
3461         error = SYSCTL_IN(req, &event64, sizeof(event64));
3462         if (error)
3463                 return error;
3464
3465         if (!req->newptr)
3466                 goto out;
3467
3468         wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
3469         g_waitq_set = wqset;
3470
3471         event64 = wqset_id(wqset);
3472         printf("[WQ]: selected wqset 0x%llx\n", event64);
3473
3474 out:
3475         if (g_waitq_set)
3476                 event64 = wqset_id(g_waitq_set);
3477         else
3478                 event64 = (uint64_t)(-1);
3479
3480         return SYSCTL_OUT(req, &event64, sizeof(event64));
3481 }
3482 SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3483             0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
3484
3485
3486 static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
3487 {
3488 #pragma unused(oidp, arg1, arg2)
3489         int error;
3490         int index;
3491         struct waitq *waitq;
3492         struct waitq_set *wqset;
3493         kern_return_t kr;
3494         uint64_t reserved_link = 0;
3495         int64_t event64 = 0;
3496
3497         error = SYSCTL_IN(req, &event64, sizeof(event64));
3498         if (error)
3499                 return error;
3500
3501         if (!req->newptr)
3502                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3503
3504         if (!g_waitq_set)
3505                 g_waitq_set = sysctl_get_wqset(1);
3506         wqset = g_waitq_set;
3507
3508         if (event64 < 0) {
3509                 struct waitq_set *tmp;
3510                 index = (int)((-event64) & 0xffffffff);
3511                 tmp = sysctl_get_wqset(index);
3512                 if (tmp == wqset)
3513                         goto out;
3514                 waitq = wqset_waitq(tmp);
3515                 index = -index;
3516         } else {
3517                 index = (int)event64;
3518                 waitq = global_test_waitq(index);
3519         }
3520
3521         printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
3522                index, wqset_id(wqset));
3523         reserved_link = waitq_link_reserve(waitq);
3524         kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
3525         waitq_link_release(reserved_link);
3526
3527         printf("[WQ]: \tkr=%d\n", kr);
3528
3529 out:
3530         return SYSCTL_OUT(req, &kr, sizeof(kr));
3531 }
3532 SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3533             0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
3534
3535
3536 static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
3537 {
3538 #pragma unused(oidp, arg1, arg2)
3539         int error;
3540         int index;
3541         struct waitq *waitq;
3542         struct waitq_set *wqset;
3543         kern_return_t kr;
3544         uint64_t event64 = 0;
3545
3546         error = SYSCTL_IN(req, &event64, sizeof(event64));
3547         if (error)
3548                 return error;
3549
3550         if (!req->newptr)
3551                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3552
3553         if (!g_waitq_set)
3554                 g_waitq_set = sysctl_get_wqset(1);
3555         wqset = g_waitq_set;
3556
3557         index = (int)event64;
3558         waitq = global_test_waitq(index);
3559
3560         printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
3561                index, wqset_id(wqset));
3562
3563         kr = waitq_unlink(waitq, wqset);
3564         printf("[WQ]: \tkr=%d\n", kr);
3565
3566         return SYSCTL_OUT(req, &kr, sizeof(kr));
3567 }
3568 SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3569             0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
3570
3571
3572 static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
3573 {
3574 #pragma unused(oidp, arg1, arg2)
3575         struct waitq *waitq;
3576         uint64_t event64 = 0;
3577         int error, index;
3578
3579         error = SYSCTL_IN(req, &event64, sizeof(event64));
3580         if (error)
3581                 return error;
3582
3583         if (!req->newptr)
3584                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3585
3586         index = (int)event64;
3587         waitq = global_test_waitq(index);
3588
3589         printf("[WQ]: clearing prepost on waitq [%d]\n", index);
3590         waitq_clear_prepost(waitq);
3591
3592         return SYSCTL_OUT(req, &event64, sizeof(event64));
3593 }
3594 SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3595             0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
3596
3597
3598 static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
3599 {
3600 #pragma unused(oidp, arg1, arg2)
3601         int error;
3602         struct waitq_set *wqset;
3603         kern_return_t kr;
3604         uint64_t event64 = 0;
3605
3606         error = SYSCTL_IN(req, &event64, sizeof(event64));
3607         if (error)
3608                 return error;
3609
3610         if (!req->newptr)
3611                 return SYSCTL_OUT(req, &event64, sizeof(event64));
3612
3613         if (!g_waitq_set)
3614                 g_waitq_set = sysctl_get_wqset(1);
3615         wqset = g_waitq_set;
3616
3617         printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
3618                wqset_id(wqset));
3619
3620         kr = waitq_set_unlink_all(wqset);
3621         printf("[WQ]: \tkr=%d\n", kr);
3622
3623         return SYSCTL_OUT(req, &kr, sizeof(kr));
3624 }
3625 SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3626             0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
3627
3628
3629 static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
3630 {
3631 #pragma unused(oidp, arg1, arg2)
3632         struct waitq_set *wqset = NULL;
3633         uint64_t event64 = 0;
3634         int error, index;
3635
3636         error = SYSCTL_IN(req, &event64, sizeof(event64));
3637         if (error)
3638                 return error;
3639
3640         if (!req->newptr)
3641                 goto out;
3642
3643         index = (int)((event64) & 0xffffffff);
3644         wqset = sysctl_get_wqset(index);
3645         assert(wqset != NULL);
3646
3647         printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
3648         waitq_set_clear_preposts(wqset);
3649
3650 out:
3651         if (wqset)
3652                 event64 = wqset_id(wqset);
3653         else
3654                 event64 = (uint64_t)(-1);
3655
3656         return SYSCTL_OUT(req, &event64, sizeof(event64));
3657 }
3658 SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3659             0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
3660
3661 #endif /* CONFIG_WAITQ_DEBUG */
3662 #endif /* defined(DEVELOPMENT) || defined(DEBUG) */