[apple/xnu.git] / bsd / kern / sys_generic.c (xnu-3789.60.24)
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1989, 1993
31 * The Regents of the University of California. All rights reserved.
32 * (c) UNIX System Laboratories, Inc.
33 * All or some portions of this file are derived from material licensed
34 * to the University of California by American Telephone and Telegraph
35 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
36 * the permission of UNIX System Laboratories, Inc.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in the
45 * documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 * must display the following acknowledgement:
48 * This product includes software developed by the University of
49 * California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 * may be used to endorse or promote products derived from this software
52 * without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
67 */
68 /*
69 * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
70 * support for mandatory and extensible security protections. This notice
71 * is included in support of clause 2.2 (b) of the Apple Public License,
72 * Version 2.0.
73 */
74
75 #include <sys/param.h>
76 #include <sys/systm.h>
77 #include <sys/filedesc.h>
78 #include <sys/ioctl.h>
79 #include <sys/file_internal.h>
80 #include <sys/proc_internal.h>
81 #include <sys/socketvar.h>
82 #include <sys/uio_internal.h>
83 #include <sys/kernel.h>
84 #include <sys/guarded.h>
85 #include <sys/stat.h>
86 #include <sys/malloc.h>
87 #include <sys/sysproto.h>
88
89 #include <sys/mount_internal.h>
90 #include <sys/protosw.h>
91 #include <sys/ev.h>
92 #include <sys/user.h>
93 #include <sys/kdebug.h>
94 #include <sys/poll.h>
95 #include <sys/event.h>
96 #include <sys/eventvar.h>
97 #include <sys/proc.h>
98 #include <sys/kauth.h>
99
100 #include <mach/mach_types.h>
101 #include <kern/kern_types.h>
102 #include <kern/assert.h>
103 #include <kern/kalloc.h>
104 #include <kern/thread.h>
105 #include <kern/clock.h>
106 #include <kern/ledger.h>
107 #include <kern/task.h>
108 #include <kern/telemetry.h>
109 #include <kern/waitq.h>
110 #include <kern/sched_prim.h>
111
112 #include <sys/mbuf.h>
113 #include <sys/domain.h>
114 #include <sys/socket.h>
115 #include <sys/socketvar.h>
116 #include <sys/errno.h>
117 #include <sys/syscall.h>
118 #include <sys/pipe.h>
119
120 #include <security/audit/audit.h>
121
122 #include <net/if.h>
123 #include <net/route.h>
124
125 #include <netinet/in.h>
126 #include <netinet/in_systm.h>
127 #include <netinet/ip.h>
128 #include <netinet/in_pcb.h>
129 #include <netinet/ip_var.h>
130 #include <netinet/ip6.h>
131 #include <netinet/tcp.h>
132 #include <netinet/tcp_fsm.h>
133 #include <netinet/tcp_seq.h>
134 #include <netinet/tcp_timer.h>
135 #include <netinet/tcp_var.h>
136 #include <netinet/tcpip.h>
137 #include <netinet/tcp_debug.h>
138 /* for wait queue based select */
139 #include <kern/waitq.h>
140 #include <kern/kalloc.h>
141 #include <sys/vnode_internal.h>
142
143 /* XXX should be in a header file somewhere */
144 void evsofree(struct socket *);
145 void evpipefree(struct pipe *);
146 void postpipeevent(struct pipe *, int);
147 void postevent(struct socket *, struct sockbuf *, int);
148 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
149
150 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
151 int wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval);
152
153 __private_extern__ int dofileread(vfs_context_t ctx, struct fileproc *fp,
154 user_addr_t bufp, user_size_t nbyte,
155 off_t offset, int flags, user_ssize_t *retval);
156 __private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp,
157 user_addr_t bufp, user_size_t nbyte,
158 off_t offset, int flags, user_ssize_t *retval);
159 __private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
160 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
161
162
163 /* Conflict wait queue for when selects collide (opaque type) */
164 struct waitq select_conflict_queue;
165
166 /*
167 * Init routine called from bsd_init.c
168 */
169 void select_waitq_init(void);
170 void
171 select_waitq_init(void)
172 {
173 waitq_init(&select_conflict_queue, SYNC_POLICY_FIFO);
174 }
175
176 #define f_flag f_fglob->fg_flag
177 #define f_type f_fglob->fg_ops->fo_type
178 #define f_msgcount f_fglob->fg_msgcount
179 #define f_cred f_fglob->fg_cred
180 #define f_ops f_fglob->fg_ops
181 #define f_offset f_fglob->fg_offset
182 #define f_data f_fglob->fg_data
183
184 /*
185 * Read system call.
186 *
187 * Returns: 0 Success
188 * preparefileread:EBADF
189 * preparefileread:ESPIPE
190 * preparefileread:ENXIO
191 * preparefileread:EBADF
192 * dofileread:???
193 */
194 int
195 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
196 {
197 __pthread_testcancel(1);
198 return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
199 }
200
201 int
202 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
203 {
204 struct fileproc *fp;
205 int error;
206 int fd = uap->fd;
207 struct vfs_context context;
208
209 if ( (error = preparefileread(p, &fp, fd, 0)) )
210 return (error);
211
212 context = *(vfs_context_current());
213 context.vc_ucred = fp->f_fglob->fg_cred;
214
215 error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
216 (off_t)-1, 0, retval);
217
218 donefileread(p, fp, fd);
219
220 return (error);
221 }
222
223 /*
224 * Pread system call
225 *
226 * Returns: 0 Success
227 * preparefileread:EBADF
228 * preparefileread:ESPIPE
229 * preparefileread:ENXIO
230 * preparefileread:EBADF
231 * dofileread:???
232 */
233 int
234 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
235 {
236 __pthread_testcancel(1);
237 return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
238 }
239
240 int
241 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
242 {
243 struct fileproc *fp = NULL; /* fp set by preparefileread() */
244 int fd = uap->fd;
245 int error;
246 struct vfs_context context;
247
248 if ( (error = preparefileread(p, &fp, fd, 1)) )
249 goto out;
250
251 context = *(vfs_context_current());
252 context.vc_ucred = fp->f_fglob->fg_cred;
253
254 error = dofileread(&context, fp, uap->buf, uap->nbyte,
255 uap->offset, FOF_OFFSET, retval);
256
257 donefileread(p, fp, fd);
258
259 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
260 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
261
262 out:
263 return (error);
264 }
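/*
 * Illustrative userspace contrast (a sketch, not part of this file; 'fd'
 * is a hypothetical open descriptor):
 *
 *	char buf[64];
 *	ssize_t a = read(fd, buf, sizeof(buf));      // implicit offset, advanced by the read
 *	ssize_t b = pread(fd, buf, sizeof(buf), 0);  // explicit offset 0
 *
 * Both paths land in dofileread() below; pread passes FOF_OFFSET so the
 * underlying fo_read() uses the supplied offset and leaves the file's
 * current offset (f_offset) unchanged.
 */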
265
266 /*
267 * Code common for read and pread
268 */
269
270 void
271 donefileread(struct proc *p, struct fileproc *fp, int fd)
272 {
273 proc_fdlock_spin(p);
274 fp_drop(p, fd, fp, 1);
275 proc_fdunlock(p);
276 }
277
278 /*
279 * Returns: 0 Success
280 * EBADF
281 * ESPIPE
282 * ENXIO
283 * fp_lookup:EBADF
284 * fo_read:???
285 */
286 int
287 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
288 {
289 vnode_t vp;
290 int error;
291 struct fileproc *fp;
292
293 AUDIT_ARG(fd, fd);
294
295 proc_fdlock_spin(p);
296
297 error = fp_lookup(p, fd, &fp, 1);
298
299 if (error) {
300 proc_fdunlock(p);
301 return (error);
302 }
303 if ((fp->f_flag & FREAD) == 0) {
304 error = EBADF;
305 goto out;
306 }
307 if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
308 error = ESPIPE;
309 goto out;
310 }
311 if (fp->f_type == DTYPE_VNODE) {
312 vp = (struct vnode *)fp->f_fglob->fg_data;
313
314 if (check_for_pread && (vnode_isfifo(vp))) {
315 error = ESPIPE;
316 goto out;
317 }
318 if (check_for_pread && (vp->v_flag & VISTTY)) {
319 error = ENXIO;
320 goto out;
321 }
322 }
323
324 *fp_ret = fp;
325
326 proc_fdunlock(p);
327 return (0);
328
329 out:
330 fp_drop(p, fd, fp, 1);
331 proc_fdunlock(p);
332 return (error);
333 }
334
335
336 /*
337 * Returns: 0 Success
338 * EINVAL
339 * fo_read:???
340 */
341 __private_extern__ int
342 dofileread(vfs_context_t ctx, struct fileproc *fp,
343 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
344 user_ssize_t *retval)
345 {
346 uio_t auio;
347 user_ssize_t bytecnt;
348 long error = 0;
349 char uio_buf[ UIO_SIZEOF(1) ];
350
351 if (nbyte > INT_MAX)
352 return (EINVAL);
353
354 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
355 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
356 &uio_buf[0], sizeof(uio_buf));
357 } else {
358 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
359 &uio_buf[0], sizeof(uio_buf));
360 }
361 uio_addiov(auio, bufp, nbyte);
362
363 bytecnt = nbyte;
364
365 if ((error = fo_read(fp, auio, flags, ctx))) {
366 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
367 error == EINTR || error == EWOULDBLOCK))
368 error = 0;
369 }
370 bytecnt -= uio_resid(auio);
371
372 *retval = bytecnt;
373
374 return (error);
375 }
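/*
 * Worked example of the short-transfer convention above (illustrative):
 * if a 1000-byte read is interrupted (EINTR) after 400 bytes were copied,
 * uio_resid(auio) == 600 != bytecnt, so the error is dropped; bytecnt
 * becomes 1000 - 600 = 400 and the caller sees a short read of 400 bytes
 * rather than a failure. Only a signal that arrives before any data has
 * moved surfaces as EINTR.
 */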
376
377 /*
378 * Scatter read system call.
379 *
380 * Returns: 0 Success
381 * EINVAL
382 * ENOMEM
383 * copyin:EFAULT
384 * rd_uio:???
385 */
386 int
387 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
388 {
389 __pthread_testcancel(1);
390 return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
391 }
392
393 int
394 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
395 {
396 uio_t auio = NULL;
397 int error;
398 struct user_iovec *iovp;
399
400 /* Verify range before calling uio_create() */
401 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
402 return (EINVAL);
403
404 /* allocate a uio large enough to hold the number of iovecs passed */
405 auio = uio_create(uap->iovcnt, 0,
406 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
407 UIO_READ);
408
409 /* get location of iovecs within the uio. then copyin the iovecs from
410 * user space.
411 */
412 iovp = uio_iovsaddr(auio);
413 if (iovp == NULL) {
414 error = ENOMEM;
415 goto ExitThisRoutine;
416 }
417 error = copyin_user_iovec_array(uap->iovp,
418 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
419 uap->iovcnt, iovp);
420 if (error) {
421 goto ExitThisRoutine;
422 }
423
424 /* finalize uio_t for use and do the IO
425 */
426 error = uio_calculateresid(auio);
427 if (error) {
428 goto ExitThisRoutine;
429 }
430 error = rd_uio(p, uap->fd, auio, retval);
431
432 ExitThisRoutine:
433 if (auio != NULL) {
434 uio_free(auio);
435 }
436 return (error);
437 }
438
439 /*
440 * Write system call
441 *
442 * Returns: 0 Success
443 * EBADF
444 * fp_lookup:EBADF
445 * dofilewrite:???
446 */
447 int
448 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
449 {
450 __pthread_testcancel(1);
451 return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
452
453 }
454
455 int
456 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
457 {
458 struct fileproc *fp;
459 int error;
460 int fd = uap->fd;
461 bool wrote_some = false;
462
463 AUDIT_ARG(fd, fd);
464
465 error = fp_lookup(p,fd,&fp,0);
466 if (error)
467 return(error);
468 if ((fp->f_flag & FWRITE) == 0) {
469 error = EBADF;
470 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
471 proc_fdlock(p);
472 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
473 proc_fdunlock(p);
474 } else {
475 struct vfs_context context = *(vfs_context_current());
476 context.vc_ucred = fp->f_fglob->fg_cred;
477
478 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
479 (off_t)-1, 0, retval);
480
481 wrote_some = *retval > 0;
482 }
483 if (wrote_some)
484 fp_drop_written(p, fd, fp);
485 else
486 fp_drop(p, fd, fp, 0);
487 return(error);
488 }
489
490 /*
491 * pwrite system call
492 *
493 * Returns: 0 Success
494 * EBADF
495 * ESPIPE
496 * ENXIO
497 * EINVAL
498 * fp_lookup:EBADF
499 * dofilewrite:???
500 */
501 int
502 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
503 {
504 __pthread_testcancel(1);
505 return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
506 }
507
508 int
509 pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
510 {
511 struct fileproc *fp;
512 int error;
513 int fd = uap->fd;
514 vnode_t vp = (vnode_t)0;
515 bool wrote_some = false;
516
517 AUDIT_ARG(fd, fd);
518
519 error = fp_lookup(p,fd,&fp,0);
520 if (error)
521 return(error);
522
523 if ((fp->f_flag & FWRITE) == 0) {
524 error = EBADF;
525 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
526 proc_fdlock(p);
527 error = fp_guard_exception(p, fd, fp, kGUARD_EXC_WRITE);
528 proc_fdunlock(p);
529 } else {
530 struct vfs_context context = *vfs_context_current();
531 context.vc_ucred = fp->f_fglob->fg_cred;
532
533 if (fp->f_type != DTYPE_VNODE) {
534 error = ESPIPE;
535 goto errout;
536 }
537 vp = (vnode_t)fp->f_fglob->fg_data;
538 if (vnode_isfifo(vp)) {
539 error = ESPIPE;
540 goto errout;
541 }
542 if ((vp->v_flag & VISTTY)) {
543 error = ENXIO;
544 goto errout;
545 }
546 if (uap->offset == (off_t)-1) {
547 error = EINVAL;
548 goto errout;
549 }
550
551 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
552 uap->offset, FOF_OFFSET, retval);
553 wrote_some = *retval > 0;
554 }
555 errout:
556 if (wrote_some)
557 fp_drop_written(p, fd, fp);
558 else
559 fp_drop(p, fd, fp, 0);
560
561 KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
562 uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
563
564 return(error);
565 }
566
567 /*
568 * Returns: 0 Success
569 * EINVAL
570 * <fo_write>:EPIPE
571 * <fo_write>:??? [indirect through struct fileops]
572 */
573 __private_extern__ int
574 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
575 user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
576 user_ssize_t *retval)
577 {
578 uio_t auio;
579 long error = 0;
580 user_ssize_t bytecnt;
581 char uio_buf[ UIO_SIZEOF(1) ];
582
583 if (nbyte > INT_MAX) {
584 *retval = 0;
585 return (EINVAL);
586 }
587
588 if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
589 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
590 &uio_buf[0], sizeof(uio_buf));
591 } else {
592 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
593 &uio_buf[0], sizeof(uio_buf));
594 }
595 uio_addiov(auio, bufp, nbyte);
596
597 bytecnt = nbyte;
598 if ((error = fo_write(fp, auio, flags, ctx))) {
599 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
600 error == EINTR || error == EWOULDBLOCK))
601 error = 0;
602 /* The socket layer handles SIGPIPE */
603 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
604 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
605 /* XXX Raise the signal on the thread? */
606 psignal(vfs_context_proc(ctx), SIGPIPE);
607 }
608 }
609 bytecnt -= uio_resid(auio);
610 *retval = bytecnt;
611
612 return (error);
613 }
614
615 /*
616 * Gather write system call
617 */
618 int
619 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
620 {
621 __pthread_testcancel(1);
622 return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
623 }
624
625 int
626 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
627 {
628 uio_t auio = NULL;
629 int error;
630 struct fileproc *fp;
631 struct user_iovec *iovp;
632 bool wrote_some = false;
633
634 AUDIT_ARG(fd, uap->fd);
635
636 /* Verify range before calling uio_create() */
637 if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
638 return (EINVAL);
639
640 /* allocate a uio large enough to hold the number of iovecs passed */
641 auio = uio_create(uap->iovcnt, 0,
642 (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
643 UIO_WRITE);
644
645 /* get location of iovecs within the uio. then copyin the iovecs from
646 * user space.
647 */
648 iovp = uio_iovsaddr(auio);
649 if (iovp == NULL) {
650 error = ENOMEM;
651 goto ExitThisRoutine;
652 }
653 error = copyin_user_iovec_array(uap->iovp,
654 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
655 uap->iovcnt, iovp);
656 if (error) {
657 goto ExitThisRoutine;
658 }
659
660 /* finalize uio_t for use and do the IO
661 */
662 error = uio_calculateresid(auio);
663 if (error) {
664 goto ExitThisRoutine;
665 }
666
667 error = fp_lookup(p, uap->fd, &fp, 0);
668 if (error)
669 goto ExitThisRoutine;
670
671 if ((fp->f_flag & FWRITE) == 0) {
672 error = EBADF;
673 } else if (FP_ISGUARDED(fp, GUARD_WRITE)) {
674 proc_fdlock(p);
675 error = fp_guard_exception(p, uap->fd, fp, kGUARD_EXC_WRITE);
676 proc_fdunlock(p);
677 } else {
678 error = wr_uio(p, fp, auio, retval);
679 wrote_some = *retval > 0;
680 }
681
682 if (wrote_some)
683 fp_drop_written(p, uap->fd, fp);
684 else
685 fp_drop(p, uap->fd, fp, 0);
686
687 ExitThisRoutine:
688 if (auio != NULL) {
689 uio_free(auio);
690 }
691 return (error);
692 }
693
694
695 int
696 wr_uio(struct proc *p, struct fileproc *fp, uio_t uio, user_ssize_t *retval)
697 {
698 int error;
699 user_ssize_t count;
700 struct vfs_context context = *vfs_context_current();
701
702 count = uio_resid(uio);
703
704 context.vc_ucred = fp->f_cred;
705 error = fo_write(fp, uio, 0, &context);
706 if (error) {
707 if (uio_resid(uio) != count && (error == ERESTART ||
708 error == EINTR || error == EWOULDBLOCK))
709 error = 0;
710 /* The socket layer handles SIGPIPE */
711 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
712 (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
713 psignal(p, SIGPIPE);
714 }
715 *retval = count - uio_resid(uio);
716
717 return(error);
718 }
719
720
721 int
722 rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
723 {
724 struct fileproc *fp;
725 int error;
726 user_ssize_t count;
727 struct vfs_context context = *vfs_context_current();
728
729 if ( (error = preparefileread(p, &fp, fdes, 0)) )
730 return (error);
731
732 count = uio_resid(uio);
733
734 context.vc_ucred = fp->f_cred;
735
736 error = fo_read(fp, uio, 0, &context);
737
738 if (error) {
739 if (uio_resid(uio) != count && (error == ERESTART ||
740 error == EINTR || error == EWOULDBLOCK))
741 error = 0;
742 }
743 *retval = count - uio_resid(uio);
744
745 donefileread(p, fp, fdes);
746
747 return (error);
748 }
749
750 /*
751 * Ioctl system call
752 *
753 * Returns: 0 Success
754 * EBADF
755 * ENOTTY
756 * ENOMEM
757 * ESRCH
758 * copyin:EFAULT
759 * copyout:EFAULT
760 * fp_lookup:EBADF Bad file descriptor
761 * fo_ioctl:???
762 */
763 int
764 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
765 {
766 struct fileproc *fp = NULL;
767 int error = 0;
768 u_int size = 0;
769 caddr_t datap = NULL, memp = NULL;
770 boolean_t is64bit = FALSE;
771 int tmp = 0;
772 #define STK_PARAMS 128
773 char stkbuf[STK_PARAMS];
774 int fd = uap->fd;
775 u_long com = uap->com;
776 struct vfs_context context = *vfs_context_current();
777
778 AUDIT_ARG(fd, uap->fd);
779 AUDIT_ARG(addr, uap->data);
780
781 is64bit = proc_is64bit(p);
782 #if CONFIG_AUDIT
783 if (is64bit)
784 AUDIT_ARG(value64, com);
785 else
786 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, com));
787 #endif /* CONFIG_AUDIT */
788
789 /*
790 * Interpret high order word to find amount of data to be
791 * copied to/from the user's address space.
792 */
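/*
 * Example of the encoding decoded here (a sketch; values assume the
 * standard <sys/ioccom.h> macros): FIONBIO is defined as _IOW('f', 126, int),
 * which packs IOC_IN, sizeof(int) and the command number into 'com', so:
 *
 *	IOCPARM_LEN(FIONBIO) == sizeof(int)   // 4 bytes copied in from user space
 *	(FIONBIO & IOC_IN)   != 0             // datap is filled via copyin()
 *
 * An _IOR() command would instead set IOC_OUT and have its buffer copied
 * back out after fo_ioctl() succeeds.
 */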
793 size = IOCPARM_LEN(com);
794 if (size > IOCPARM_MAX)
795 return ENOTTY;
796 if (size > sizeof (stkbuf)) {
797 if ((memp = (caddr_t)kalloc(size)) == 0)
798 return ENOMEM;
799 datap = memp;
800 } else
801 datap = &stkbuf[0];
802 if (com & IOC_IN) {
803 if (size) {
804 error = copyin(uap->data, datap, size);
805 if (error)
806 goto out_nofp;
807 } else {
808 /* XXX - IOC_IN and no size? we should probably return an error here!! */
809 if (is64bit) {
810 *(user_addr_t *)datap = uap->data;
811 }
812 else {
813 *(uint32_t *)datap = (uint32_t)uap->data;
814 }
815 }
816 } else if ((com & IOC_OUT) && size)
817 /*
818 * Zero the buffer so the user always
819 * gets back something deterministic.
820 */
821 bzero(datap, size);
822 else if (com & IOC_VOID) {
823 /* XXX - this is odd since IOC_VOID means no parameters */
824 if (is64bit) {
825 *(user_addr_t *)datap = uap->data;
826 }
827 else {
828 *(uint32_t *)datap = (uint32_t)uap->data;
829 }
830 }
831
832 proc_fdlock(p);
833 error = fp_lookup(p,fd,&fp,1);
834 if (error) {
835 proc_fdunlock(p);
836 goto out_nofp;
837 }
838
839 AUDIT_ARG(file, p, fp);
840
841 if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
842 error = EBADF;
843 goto out;
844 }
845
846 context.vc_ucred = fp->f_fglob->fg_cred;
847
848 #if CONFIG_MACF
849 error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, com);
850 if (error)
851 goto out;
852 #endif
853
854 switch (com) {
855 case FIONCLEX:
856 *fdflags(p, fd) &= ~UF_EXCLOSE;
857 break;
858
859 case FIOCLEX:
860 *fdflags(p, fd) |= UF_EXCLOSE;
861 break;
862
863 case FIONBIO:
864 if ( (tmp = *(int *)datap) )
865 fp->f_flag |= FNONBLOCK;
866 else
867 fp->f_flag &= ~FNONBLOCK;
868 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
869 break;
870
871 case FIOASYNC:
872 if ( (tmp = *(int *)datap) )
873 fp->f_flag |= FASYNC;
874 else
875 fp->f_flag &= ~FASYNC;
876 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
877 break;
878
879 case FIOSETOWN:
880 tmp = *(int *)datap;
881 if (fp->f_type == DTYPE_SOCKET) {
882 ((struct socket *)fp->f_data)->so_pgid = tmp;
883 break;
884 }
885 if (fp->f_type == DTYPE_PIPE) {
886 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
887 break;
888 }
889 if (tmp <= 0) {
890 tmp = -tmp;
891 } else {
892 struct proc *p1 = proc_find(tmp);
893 if (p1 == 0) {
894 error = ESRCH;
895 break;
896 }
897 tmp = p1->p_pgrpid;
898 proc_rele(p1);
899 }
900 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
901 break;
902
903 case FIOGETOWN:
904 if (fp->f_type == DTYPE_SOCKET) {
905 *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
906 break;
907 }
908 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
909 *(int *)datap = -*(int *)datap;
910 break;
911
912 default:
913 error = fo_ioctl(fp, com, datap, &context);
914 /*
915 * Copy any data to user, size was
916 * already set and checked above.
917 */
918 if (error == 0 && (com & IOC_OUT) && size)
919 error = copyout(datap, uap->data, (u_int)size);
920 break;
921 }
922 out:
923 fp_drop(p, fd, fp, 1);
924 proc_fdunlock(p);
925
926 out_nofp:
927 if (memp)
928 kfree(memp, size);
929 return(error);
930 }
931
932 int selwait, nselcoll;
933 #define SEL_FIRSTPASS 1
934 #define SEL_SECONDPASS 2
935 extern int selcontinue(int error);
936 extern int selprocess(int error, int sel_pass);
937 static int selscan(struct proc *p, struct _select * sel, struct _select_data * seldata,
938 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset);
939 static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
940 static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
941 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
942 static int select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval);
943
944 /*
945 * Select system call.
946 *
947 * Returns: 0 Success
948 * EINVAL Invalid argument
949 * EAGAIN Nonconformant error if allocation fails
950 */
951 int
952 select(struct proc *p, struct select_args *uap, int32_t *retval)
953 {
954 __pthread_testcancel(1);
955 return select_nocancel(p, (struct select_nocancel_args *)uap, retval);
956 }
957
958 int
959 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
960 {
961 uint64_t timeout = 0;
962
963 if (uap->tv) {
964 int err;
965 struct timeval atv;
966 if (IS_64BIT_PROCESS(p)) {
967 struct user64_timeval atv64;
968 err = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
969 /* Loses resolution - assume timeout < 68 years */
970 atv.tv_sec = atv64.tv_sec;
971 atv.tv_usec = atv64.tv_usec;
972 } else {
973 struct user32_timeval atv32;
974 err = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
975 atv.tv_sec = atv32.tv_sec;
976 atv.tv_usec = atv32.tv_usec;
977 }
978 if (err)
979 return err;
980
981 if (itimerfix(&atv)) {
982 err = EINVAL;
983 return err;
984 }
985
986 clock_absolutetime_interval_to_deadline(tvtoabstime(&atv), &timeout);
987 }
988
989 return select_internal(p, uap, timeout, retval);
990 }
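/*
 * Illustrative userspace usage (a sketch, not part of this file; 'sock'
 * is a hypothetical descriptor):
 *
 *	fd_set rfds;
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 0 };
 *	FD_ZERO(&rfds);
 *	FD_SET(sock, &rfds);
 *	int n = select(sock + 1, &rfds, NULL, NULL, &tv);
 *
 * The timeval is copied in above, validated with itimerfix(), and converted
 * via tvtoabstime()/clock_absolutetime_interval_to_deadline() into the
 * absolute deadline handed to select_internal().
 */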
991
992 int
993 pselect(struct proc *p, struct pselect_args *uap, int32_t *retval)
994 {
995 __pthread_testcancel(1);
996 return pselect_nocancel(p, (struct pselect_nocancel_args *)uap, retval);
997 }
998
999 int
1000 pselect_nocancel(struct proc *p, struct pselect_nocancel_args *uap, int32_t *retval)
1001 {
1002 int err;
1003 struct uthread *ut;
1004 uint64_t timeout = 0;
1005
1006 if (uap->ts) {
1007 struct timespec ts;
1008
1009 if (IS_64BIT_PROCESS(p)) {
1010 struct user64_timespec ts64;
1011 err = copyin(uap->ts, (caddr_t)&ts64, sizeof(ts64));
1012 ts.tv_sec = ts64.tv_sec;
1013 ts.tv_nsec = ts64.tv_nsec;
1014 } else {
1015 struct user32_timespec ts32;
1016 err = copyin(uap->ts, (caddr_t)&ts32, sizeof(ts32));
1017 ts.tv_sec = ts32.tv_sec;
1018 ts.tv_nsec = ts32.tv_nsec;
1019 }
1020 if (err) {
1021 return err;
1022 }
1023
1024 if (!timespec_is_valid(&ts)) {
1025 return EINVAL;
1026 }
1027 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), &timeout);
1028 }
1029
1030 ut = get_bsdthread_info(current_thread());
1031
1032 if (uap->mask != USER_ADDR_NULL) {
1033 /* save current mask, then copyin and set new mask */
1034 sigset_t newset;
1035 err = copyin(uap->mask, &newset, sizeof(sigset_t));
1036 if (err) {
1037 return err;
1038 }
1039 ut->uu_oldmask = ut->uu_sigmask;
1040 ut->uu_flag |= UT_SAS_OLDMASK;
1041 ut->uu_sigmask = (newset & ~sigcantmask);
1042 }
1043
1044 err = select_internal(p, (struct select_nocancel_args *)uap, timeout, retval);
1045
1046 if (err != EINTR && ut->uu_flag & UT_SAS_OLDMASK) {
1047 /*
1048 * Restore old mask (direct return case). NOTE: EINTR can also be returned
1049 * if the thread is cancelled. In that case, we don't reset the signal
1050 * mask to its original value (which usually happens in the signal
1051 * delivery path). This behavior is permitted by POSIX.
1052 */
1053 ut->uu_sigmask = ut->uu_oldmask;
1054 ut->uu_oldmask = 0;
1055 ut->uu_flag &= ~UT_SAS_OLDMASK;
1056 }
1057
1058 return err;
1059 }
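/*
 * Illustrative userspace usage of the mask argument (a sketch; 'nfds',
 * 'rfds' and 'ts' are hypothetical):
 *
 *	sigset_t block;
 *	sigemptyset(&block);
 *	sigaddset(&block, SIGUSR1);
 *	int n = pselect(nfds, &rfds, NULL, NULL, &ts, &block);
 *
 * The previous mask is stashed in uu_oldmask above and restored either here
 * on direct return, or in the signal-delivery path when the wait is
 * interrupted (EINTR).
 */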
1060
1061 /*
1062 * Generic implementation of {,p}select. Care: we type-pun uap across the two
1063 * syscalls, which differ slightly. The first 4 arguments (nfds and the fd sets)
1064 * are identical. The 5th (timeout) argument points to different types, so we
1065 * unpack in the syscall-specific code, but the generic code still does a null
1066 * check on this argument to determine if a timeout was specified.
1067 */
1068 static int
1069 select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeout, int32_t *retval)
1070 {
1071 int error = 0;
1072 u_int ni, nw;
1073 thread_t th_act;
1074 struct uthread *uth;
1075 struct _select *sel;
1076 struct _select_data *seldata;
1077 int needzerofill = 1;
1078 int count = 0;
1079 size_t sz = 0;
1080
1081 th_act = current_thread();
1082 uth = get_bsdthread_info(th_act);
1083 sel = &uth->uu_select;
1084 seldata = &uth->uu_kevent.ss_select_data;
1085 *retval = 0;
1086
1087 seldata->args = uap;
1088 seldata->retval = retval;
1089 seldata->wqp = NULL;
1090 seldata->count = 0;
1091
1092 if (uap->nd < 0) {
1093 return (EINVAL);
1094 }
1095
1096 /* select on thread of process that already called proc_exit() */
1097 if (p->p_fd == NULL) {
1098 return (EBADF);
1099 }
1100
1101 if (uap->nd > p->p_fd->fd_nfiles)
1102 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
1103
1104 nw = howmany(uap->nd, NFDBITS);
1105 ni = nw * sizeof(fd_mask);
1106
1107 /*
1108 * if the previously allocated space for the bits is smaller than
1109 * what is requested or no space has yet been allocated for this
1110 * thread, allocate enough space now.
1111 *
1112 * Note: If this allocation fails, select() will return EAGAIN; this
1113 * is the same thing poll() returns in a no-memory situation, but
1114 * it is not a POSIX-compliant error code for select().
1115 */
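/*
 * Worked example (assuming a 32-bit fd_mask, i.e. NFDBITS == 32): for
 * uap->nd == 100, nw = howmany(100, 32) = 4 and ni = 4 * 4 = 16 bytes per
 * descriptor set, so nbytes = 3 * 16 = 48 bytes each for ibits and obits
 * (three contiguous sections: in, ou, ex).
 */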
1116 if (sel->nbytes < (3 * ni)) {
1117 int nbytes = 3 * ni;
1118
1119 /* Free previous allocation, if any */
1120 if (sel->ibits != NULL)
1121 FREE(sel->ibits, M_TEMP);
1122 if (sel->obits != NULL) {
1123 FREE(sel->obits, M_TEMP);
1124 /* NULL out; subsequent ibits allocation may fail */
1125 sel->obits = NULL;
1126 }
1127
1128 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1129 if (sel->ibits == NULL)
1130 return (EAGAIN);
1131 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
1132 if (sel->obits == NULL) {
1133 FREE(sel->ibits, M_TEMP);
1134 sel->ibits = NULL;
1135 return (EAGAIN);
1136 }
1137 sel->nbytes = nbytes;
1138 needzerofill = 0;
1139 }
1140
1141 if (needzerofill) {
1142 bzero((caddr_t)sel->ibits, sel->nbytes);
1143 bzero((caddr_t)sel->obits, sel->nbytes);
1144 }
1145
1146 /*
1147 * get the bits from the user address space
1148 */
1149 #define getbits(name, x) \
1150 do { \
1151 if (uap->name && (error = copyin(uap->name, \
1152 (caddr_t)&sel->ibits[(x) * nw], ni))) \
1153 goto continuation; \
1154 } while (0)
1155
1156 getbits(in, 0);
1157 getbits(ou, 1);
1158 getbits(ex, 2);
1159 #undef getbits
1160
1161 seldata->abstime = timeout;
1162
1163 if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
1164 goto continuation;
1165 }
1166
1167 /*
1168 * We need an array of waitq pointers. This is due to the new way
1169 * in which waitqs are linked to sets. When a thread selects on a
1170 * file descriptor, a waitq (embedded in a selinfo structure) is
1171 * added to the thread's local waitq set. There is no longer any
1172 * way to directly iterate over all members of a given waitq set.
1173 * The process of linking a waitq into a set may allocate a link
1174 * table object. Because we can't iterate over all the waitqs to
1175 * which our thread waitq set belongs, we need a way of removing
1176 * this link object!
1177 *
1178 * Thus we need a buffer which will hold one waitq pointer
1179 * per FD being selected. During the tear-down phase we can use
1180 * these pointers to dis-associate the underlying selinfo's waitq
1181 * from our thread's waitq set.
1182 *
1183 * Because we also need to allocate a waitq set for this thread,
1184 * we use a bare buffer pointer to hold all the memory. Note that
1185 * this memory is cached in the thread pointer and not reaped until
1186 * the thread exits. This is generally OK because threads that
1187 * call select tend to keep calling select repeatedly.
1188 */
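/*
 * Resulting buffer layout (a sketch; ALIGN() padding assumed):
 *
 *	uth->uu_wqset -> +-------------------------------------+
 *	                 | struct waitq_set                    |  ALIGN(sizeof(struct waitq_set)) bytes
 *	                 +-------------------------------------+
 *	seldata->wqp  -> | wqp[0] ... wqp[count-1]  (uint64_t)  |  count * sizeof(uint64_t) bytes
 *	                 +-------------------------------------+
 */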
1189 sz = ALIGN(sizeof(struct waitq_set)) + (count * sizeof(uint64_t));
1190 if (sz > uth->uu_wqstate_sz) {
1191 /* (re)allocate a buffer to hold waitq pointers */
1192 if (uth->uu_wqset) {
1193 if (waitq_set_is_valid(uth->uu_wqset))
1194 waitq_set_deinit(uth->uu_wqset);
1195 FREE(uth->uu_wqset, M_SELECT);
1196 } else if (uth->uu_wqstate_sz && !uth->uu_wqset)
1197 panic("select: thread structure corrupt! "
1198 "uu_wqstate_sz:%ld, wqstate_buf == NULL",
1199 uth->uu_wqstate_sz);
1200 uth->uu_wqstate_sz = sz;
1201 MALLOC(uth->uu_wqset, struct waitq_set *, sz, M_SELECT, M_WAITOK);
1202 if (!uth->uu_wqset)
1203 panic("can't allocate %ld bytes for wqstate buffer",
1204 uth->uu_wqstate_sz);
1205 waitq_set_init(uth->uu_wqset,
1206 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1207 }
1208
1209 if (!waitq_set_is_valid(uth->uu_wqset))
1210 waitq_set_init(uth->uu_wqset,
1211 SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL, NULL);
1212
1213 /* the last chunk of our buffer is an array of waitq pointers */
1214 seldata->wqp = (uint64_t *)((char *)(uth->uu_wqset) + ALIGN(sizeof(struct waitq_set)));
1215 bzero(seldata->wqp, sz - ALIGN(sizeof(struct waitq_set)));
1216
1217 seldata->count = count;
1218
1219 continuation:
1220
1221 if (error) {
1222 /*
1223 * We have already cleaned up any state we established,
1224 * either locally or as a result of selcount(). We don't
1225 * need to wait_subqueue_unlink_all(), since we haven't set
1226 * anything at this point.
1227 */
1228 return (error);
1229 }
1230
1231 return selprocess(0, SEL_FIRSTPASS);
1232 }
1233
1234 int
1235 selcontinue(int error)
1236 {
1237 return selprocess(error, SEL_SECONDPASS);
1238 }
1239
1240
1241 /*
1242 * selprocess
1243 *
1244 * Parameters: error The error code from our caller
1245 * sel_pass The pass we are on
1246 */
1247 int
1248 selprocess(int error, int sel_pass)
1249 {
1250 int ncoll;
1251 u_int ni, nw;
1252 thread_t th_act;
1253 struct uthread *uth;
1254 struct proc *p;
1255 struct select_nocancel_args *uap;
1256 int *retval;
1257 struct _select *sel;
1258 struct _select_data *seldata;
1259 int unwind = 1;
1260 int prepost = 0;
1261 int somewakeup = 0;
1262 int doretry = 0;
1263 wait_result_t wait_result;
1264
1265 p = current_proc();
1266 th_act = current_thread();
1267 uth = get_bsdthread_info(th_act);
1268 sel = &uth->uu_select;
1269 seldata = &uth->uu_kevent.ss_select_data;
1270 uap = seldata->args;
1271 retval = seldata->retval;
1272
1273 if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1274 unwind = 0;
1275 if (seldata->count == 0)
1276 unwind = 0;
1277 retry:
1278 if (error != 0)
1279 goto done;
1280
1281 ncoll = nselcoll;
1282 OSBitOrAtomic(P_SELECT, &p->p_flag);
1283
1284 /* skip scans if the select is just for timeouts */
1285 if (seldata->count) {
1286 error = selscan(p, sel, seldata, uap->nd, retval, sel_pass, uth->uu_wqset);
1287 if (error || *retval) {
1288 goto done;
1289 }
1290 if (prepost || somewakeup) {
1291 /*
1292 * if the select of log, then we can wakeup and
1293 * discover some one else already read the data;
1294 * go to select again if time permits
1295 */
1296 prepost = 0;
1297 somewakeup = 0;
1298 doretry = 1;
1299 }
1300 }
1301
1302 if (uap->tv) {
1303 uint64_t now;
1304
1305 clock_get_uptime(&now);
1306 if (now >= seldata->abstime)
1307 goto done;
1308 }
1309
1310 if (doretry) {
1311 /* cleanup obits and try again */
1312 doretry = 0;
1313 sel_pass = SEL_FIRSTPASS;
1314 goto retry;
1315 }
1316
1317 /*
1318 * To effect a poll, the timeout argument should be
1319 * non-nil, pointing to a zero-valued timeval structure.
1320 */
1321 if (uap->tv && seldata->abstime == 0) {
1322 goto done;
1323 }
1324
1325 /* No spurious wakeups due to collisions, no need to check for them */
1326 if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1327 sel_pass = SEL_FIRSTPASS;
1328 goto retry;
1329 }
1330
1331 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1332
1333 /* if the select is just for timeout skip check */
1334 if (seldata->count && (sel_pass == SEL_SECONDPASS))
1335 panic("selprocess: 2nd pass assertwaiting");
1336
1337 /* waitq_set has waitqueue as first element */
1338 wait_result = waitq_assert_wait64_leeway((struct waitq *)uth->uu_wqset,
1339 NO_EVENT64, THREAD_ABORTSAFE,
1340 TIMEOUT_URGENCY_USER_NORMAL,
1341 seldata->abstime,
1342 TIMEOUT_NO_LEEWAY);
1343 if (wait_result != THREAD_AWAKENED) {
1344 /* there are no preposted events */
1345 error = tsleep1(NULL, PSOCK | PCATCH,
1346 "select", 0, selcontinue);
1347 } else {
1348 prepost = 1;
1349 error = 0;
1350 }
1351
1352 if (error == 0) {
1353 sel_pass = SEL_SECONDPASS;
1354 if (!prepost)
1355 somewakeup = 1;
1356 goto retry;
1357 }
1358 done:
1359 if (unwind) {
1360 seldrop(p, sel->ibits, uap->nd);
1361 waitq_set_deinit(uth->uu_wqset);
1362 /*
1363 * zero out the waitq pointer array to avoid use-after free
1364 * errors in the selcount error path (seldrop_locked) if/when
1365 * the thread re-calls select().
1366 */
1367 bzero((void *)uth->uu_wqset, uth->uu_wqstate_sz);
1368 }
1369 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1370 /* select is not restarted after signals... */
1371 if (error == ERESTART)
1372 error = EINTR;
1373 if (error == EWOULDBLOCK)
1374 error = 0;
1375 nw = howmany(uap->nd, NFDBITS);
1376 ni = nw * sizeof(fd_mask);
1377
1378 #define putbits(name, x) \
1379 do { \
1380 if (uap->name && (error2 = \
1381 copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1382 error = error2; \
1383 } while (0)
1384
1385 if (error == 0) {
1386 int error2;
1387
1388 putbits(in, 0);
1389 putbits(ou, 1);
1390 putbits(ex, 2);
1391 #undef putbits
1392 }
1393
1394 if (error != EINTR && sel_pass == SEL_SECONDPASS && uth->uu_flag & UT_SAS_OLDMASK) {
1395 /* restore signal mask - continuation case */
1396 uth->uu_sigmask = uth->uu_oldmask;
1397 uth->uu_oldmask = 0;
1398 uth->uu_flag &= ~UT_SAS_OLDMASK;
1399 }
1400
1401 return(error);
1402 }
1403
1404
1405 /**
1406 * remove the fileproc's underlying waitq from the supplied waitq set;
1407 * clear FP_INSELECT when appropriate
1408 *
1409 * Parameters:
1410 * fp File proc that is potentially currently in select
1411 * wqset Waitq set to which the fileproc may belong
1412 * (usually this is the thread's private waitq set)
1413 * Conditions:
1414 * proc_fdlock is held
1415 */
1416 static void selunlinkfp(struct fileproc *fp, uint64_t wqp_id, struct waitq_set *wqset)
1417 {
1418 int valid_set = waitq_set_is_valid(wqset);
1419 int valid_q = !!wqp_id;
1420
1421 /*
1422 * This could be called (from selcount error path) before we setup
1423 * the thread's wqset. Check the wqset passed in, and only unlink if
1424 * the set is valid.
1425 */
1426
1427 /* unlink the underlying waitq from the input set (thread waitq set) */
1428 if (valid_q && valid_set)
1429 waitq_unlink_by_prepost_id(wqp_id, wqset);
1430
1431 /* allow passing a NULL/invalid fp for seldrop unwind */
1432 if (!fp || !(fp->f_flags & (FP_INSELECT|FP_SELCONFLICT)))
1433 return;
1434
1435 /*
1436 * We can always remove the conflict queue from our thread's set: this
1437 * will not affect other threads that potentially need to be awoken on
1438 * the conflict queue during a fileproc_drain - those sets will still
1439 * be linked with the global conflict queue, and the last waiter
1440 * on the fp clears the CONFLICT marker.
1441 */
1442 if (valid_set && (fp->f_flags & FP_SELCONFLICT))
1443 waitq_unlink(&select_conflict_queue, wqset);
1444
1445 /* jca: TODO:
1446 * This isn't quite right - we don't actually know if this
1447 * fileproc is in another select or not! Here we just assume
1448 * that if we were the first thread to select on the FD, then
1449 * we'll be the one to clear this flag...
1450 */
1451 if (valid_set && fp->f_wset == (void *)wqset) {
1452 fp->f_flags &= ~FP_INSELECT;
1453 fp->f_wset = NULL;
1454 }
1455 }
1456
1457 /**
1458 * connect a fileproc to the given wqset, potentially bridging to a waitq
1459 * pointed to indirectly by wq_data
1460 *
1461 * Parameters:
1462 * fp File proc potentially currently in select
1463 * wq_data Pointer to a pointer to a waitq (could be NULL)
1464 * wqset Waitq set to which the fileproc should now belong
1465 * (usually this is the thread's private waitq set)
1466 *
1467 * Conditions:
1468 * proc_fdlock is held
1469 */
1470 static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set *wqset)
1471 {
1472 struct waitq *f_wq = NULL;
1473
1474 if ((fp->f_flags & FP_INSELECT) != FP_INSELECT) {
1475 if (wq_data)
1476 panic("non-null data:%p on fp:%p not in select?!"
1477 "(wqset:%p)", wq_data, fp, wqset);
1478 return 0;
1479 }
1480
1481 if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
1482 /*
1483 * The conflict queue requires disabling interrupts, so we
1484 * need to explicitly reserve a link object to avoid a
1485 * panic/assert in the waitq code. Hopefully this extra step
1486 * can be avoided if we can split the waitq structure into
1487 * blocking and linkage sub-structures.
1488 */
1489 uint64_t reserved_link = waitq_link_reserve(&select_conflict_queue);
1490 waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
1491 waitq_link_release(reserved_link);
1492 }
1493
1494 /*
1495 * The wq_data parameter has potentially been set by selrecord called
1496 * from a subsystem's fo_select() function. If the subsystem does not
1497 * call selrecord, then wq_data will be NULL
1498 *
1499 * Use memcpy to get the value into a proper pointer because
1500 * wq_data most likely points to a stack variable that could be
1501 * unaligned on 32-bit systems.
1502 */
1503 if (wq_data) {
1504 memcpy(&f_wq, wq_data, sizeof(f_wq));
1505 if (!waitq_is_valid(f_wq))
1506 f_wq = NULL;
1507 }
1508
1509 /* record the first thread's wqset in the fileproc structure */
1510 if (!fp->f_wset)
1511 fp->f_wset = (void *)wqset;
1512
1513 /* handles NULL f_wq */
1514 return waitq_get_prepost_id(f_wq);
1515 }
1516
1517
1518 /*
1519 * selscan
1520 *
1521 * Parameters: p Process performing the select
1522 * sel The per-thread select context structure
1523 * nfd The number of file descriptors to scan
1524 * retval The per thread system call return area
1525 * sel_pass Which pass this is; allowed values are
1526 * SEL_FIRSTPASS and SEL_SECONDPASS
1527 * wqset The per thread wait queue set
1528 *
1529 * Returns: 0 Success
1530 * EIO Invalid p->p_fd field XXX Obsolete?
1531 * EBADF One of the files in the bit vector is
1532 * invalid.
1533 */
1534 static int
1535 selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
1536 int nfd, int32_t *retval, int sel_pass, struct waitq_set *wqset)
1537 {
1538 struct filedesc *fdp = p->p_fd;
1539 int msk, i, j, fd;
1540 u_int32_t bits;
1541 struct fileproc *fp;
1542 int n = 0; /* count of bits */
1543 int nc = 0; /* bit vector offset (nc'th bit) */
1544 static int flag[3] = { FREAD, FWRITE, 0 };
1545 u_int32_t *iptr, *optr;
1546 u_int nw;
1547 u_int32_t *ibits, *obits;
1548 uint64_t reserved_link, *rl_ptr = NULL;
1549 int count;
1550 struct vfs_context context = *vfs_context_current();
1551
1552 /*
1553 * Problems seen at reboot due to MacOSX signal problems
1554 * in Beaker1C; verify that the p->p_fd is valid
1555 */
1556 if (fdp == NULL) {
1557 *retval=0;
1558 return(EIO);
1559 }
1560 ibits = sel->ibits;
1561 obits = sel->obits;
1562
1563 nw = howmany(nfd, NFDBITS);
1564
1565 count = seldata->count;
1566
1567 nc = 0;
1568 if (!count) {
1569 *retval = 0;
1570 return 0;
1571 }
1572
1573 proc_fdlock(p);
1574 for (msk = 0; msk < 3; msk++) {
1575 iptr = (u_int32_t *)&ibits[msk * nw];
1576 optr = (u_int32_t *)&obits[msk * nw];
1577
1578 for (i = 0; i < nfd; i += NFDBITS) {
1579 bits = iptr[i/NFDBITS];
1580
1581 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1582 bits &= ~(1 << j);
1583
1584 if (fd < fdp->fd_nfiles)
1585 fp = fdp->fd_ofiles[fd];
1586 else
1587 fp = NULL;
1588
1589 if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1590 /*
1591 * If we abort because of a bad
1592 * fd, let the caller unwind...
1593 */
1594 proc_fdunlock(p);
1595 return(EBADF);
1596 }
1597 if (sel_pass == SEL_SECONDPASS) {
1598 reserved_link = 0;
1599 rl_ptr = NULL;
1600 selunlinkfp(fp, seldata->wqp[nc], wqset);
1601 } else {
1602 reserved_link = waitq_link_reserve((struct waitq *)wqset);
1603 rl_ptr = &reserved_link;
1604 if (fp->f_flags & FP_INSELECT)
1605 /* someone is already in select on this fp */
1606 fp->f_flags |= FP_SELCONFLICT;
1607 else
1608 fp->f_flags |= FP_INSELECT;
1609 }
1610
1611 context.vc_ucred = fp->f_cred;
1612
1613 /*
1614 * stash this value b/c fo_select may replace
1615 * reserved_link with a pointer to a waitq object
1616 */
1617 uint64_t rsvd = reserved_link;
1618
1619 /* The select; set the bit, if true */
1620 if (fp->f_ops && fp->f_type
1621 && fo_select(fp, flag[msk], rl_ptr, &context)) {
1622 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1623 n++;
1624 }
1625 if (sel_pass == SEL_FIRSTPASS) {
1626 waitq_link_release(rsvd);
1627 /*
1628 * If the fp's supporting selinfo structure was linked
1629 * to this thread's waitq set, then 'reserved_link'
1630 * will have been updated by selrecord to be a pointer
1631 * to the selinfo's waitq.
1632 */
1633 if (reserved_link == rsvd)
1634 rl_ptr = NULL; /* fo_select never called selrecord() */
1635 /*
1636 * Hook up the thread's waitq set either to
1637 * the fileproc structure, or to the global
1638 * conflict queue: but only on the first
1639 * select pass.
1640 */
1641 seldata->wqp[nc] = sellinkfp(fp, (void **)rl_ptr, wqset);
1642 }
1643 nc++;
1644 }
1645 }
1646 }
1647 proc_fdunlock(p);
1648
1649 *retval = n;
1650 return (0);
1651 }
1652
1653 int poll_callback(struct kqueue *, struct kevent_internal_s *, void *);
1654
1655 struct poll_continue_args {
1656 user_addr_t pca_fds;
1657 u_int pca_nfds;
1658 u_int pca_rfds;
1659 };
1660
1661 int
1662 poll(struct proc *p, struct poll_args *uap, int32_t *retval)
1663 {
1664 __pthread_testcancel(1);
1665 return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
1666 }
1667
1668
1669 int
1670 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
1671 {
1672 struct poll_continue_args *cont;
1673 struct pollfd *fds;
1674 struct kqueue *kq;
1675 struct timeval atv;
1676 int ncoll, error = 0;
1677 u_int nfds = uap->nfds;
1678 u_int rfds = 0;
1679 u_int i;
1680 size_t ni;
1681
1682 /*
1683 * This is kinda bogus. We have fd limits, but that is not
1684 * really related to the size of the pollfd array. Make sure
1685 * we let the process use at least FD_SETSIZE entries and at
1686 * least enough for the current limits. We want to be reasonably
1687 * safe, but not overly restrictive.
1688 */
1689 if (nfds > OPEN_MAX ||
1690 (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
1691 return (EINVAL);
1692
1693 kq = kqueue_alloc(p, 0);
1694 if (kq == NULL)
1695 return (EAGAIN);
1696
1697 ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1698 MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1699 if (NULL == cont) {
1700 error = EAGAIN;
1701 goto out;
1702 }
1703
1704 fds = (struct pollfd *)&cont[1];
1705 error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1706 if (error)
1707 goto out;
1708
1709 if (uap->timeout != -1) {
1710 struct timeval rtv;
1711
1712 atv.tv_sec = uap->timeout / 1000;
1713 atv.tv_usec = (uap->timeout % 1000) * 1000;
1714 if (itimerfix(&atv)) {
1715 error = EINVAL;
1716 goto out;
1717 }
1718 getmicrouptime(&rtv);
1719 timevaladd(&atv, &rtv);
1720 } else {
1721 atv.tv_sec = 0;
1722 atv.tv_usec = 0;
1723 }
1724
1725 /* JMM - all this P_SELECT stuff is bogus */
1726 ncoll = nselcoll;
1727 OSBitOrAtomic(P_SELECT, &p->p_flag);
1728 for (i = 0; i < nfds; i++) {
1729 short events = fds[i].events;
1730
1731 /* per spec, ignore fd values below zero */
1732 if (fds[i].fd < 0) {
1733 fds[i].revents = 0;
1734 continue;
1735 }
1736
1737 /* convert the poll event into a kqueue kevent */
1738 struct kevent_internal_s kev = {
1739 .ident = fds[i].fd,
1740 .flags = EV_ADD | EV_ONESHOT | EV_POLL,
1741 .udata = CAST_USER_ADDR_T(&fds[i]) };
1742
1743 /* Handle input events */
1744 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
1745 kev.filter = EVFILT_READ;
1746 if (events & ( POLLPRI | POLLRDBAND ))
1747 kev.flags |= EV_OOBAND;
1748 kevent_register(kq, &kev, p);
1749 }
1750
1751 /* Handle output events */
1752 if ((kev.flags & EV_ERROR) == 0 &&
1753 (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) {
1754 kev.filter = EVFILT_WRITE;
1755 kevent_register(kq, &kev, p);
1756 }
1757
1758 /* Handle BSD extension vnode events */
1759 if ((kev.flags & EV_ERROR) == 0 &&
1760 (events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE ))) {
1761 kev.filter = EVFILT_VNODE;
1762 kev.fflags = 0;
1763 if (events & POLLEXTEND)
1764 kev.fflags |= NOTE_EXTEND;
1765 if (events & POLLATTRIB)
1766 kev.fflags |= NOTE_ATTRIB;
1767 if (events & POLLNLINK)
1768 kev.fflags |= NOTE_LINK;
1769 if (events & POLLWRITE)
1770 kev.fflags |= NOTE_WRITE;
1771 kevent_register(kq, &kev, p);
1772 }
1773
1774 if (kev.flags & EV_ERROR) {
1775 fds[i].revents = POLLNVAL;
1776 rfds++;
1777 } else
1778 fds[i].revents = 0;
1779 }
1780
1781 /*
1782 * Did we have any trouble registering?
1783 * If user space passed 0 FDs, then respect any timeout value passed.
1784 * This is an extremely inefficient sleep. If user space passed one or
1785 * more FDs, and we had trouble registering _all_ of them, then bail
1786 * out. If a subset of the provided FDs failed to register, then we
1787 * will still call the kqueue_scan function.
1788 */
1789 if (nfds && (rfds == nfds))
1790 goto done;
1791
1792 /*
1793 * If any events have trouble registering, an event has fired and we
1794 * shouldn't wait for events in kqueue_scan -- use the current time as
1795 * the deadline.
1796 */
1797 if (rfds)
1798 getmicrouptime(&atv);
1799
1800 /* scan for, and possibly wait for, the kevents to trigger */
1801 cont->pca_fds = uap->fds;
1802 cont->pca_nfds = nfds;
1803 cont->pca_rfds = rfds;
1804 error = kqueue_scan(kq, poll_callback, NULL, cont, NULL, &atv, p);
1805 rfds = cont->pca_rfds;
1806
1807 done:
1808 OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
1809 /* poll is not restarted after signals... */
1810 if (error == ERESTART)
1811 error = EINTR;
1812 if (error == EWOULDBLOCK)
1813 error = 0;
1814 if (error == 0) {
1815 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1816 *retval = rfds;
1817 }
1818
1819 out:
1820 if (NULL != cont)
1821 FREE(cont, M_TEMP);
1822
1823 kqueue_dealloc(kq);
1824 return (error);
1825 }
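/*
 * Illustrative userspace usage (a sketch, not part of this file; 'sock'
 * is a hypothetical descriptor):
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLIN | POLLPRI };
 *	int n = poll(&pfd, 1, 1000);	// 1000 ms timeout
 *
 * In the code above, POLLIN|POLLPRI registers an EVFILT_READ kevent with
 * EV_OOBAND set, and the millisecond timeout becomes the absolute 'atv'
 * deadline passed to kqueue_scan(); results come back through
 * poll_callback(), which translates kevents into revents.
 */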
1826
1827 int
1828 poll_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp, void *data)
1829 {
1830 struct poll_continue_args *cont = (struct poll_continue_args *)data;
1831 struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1832 short prev_revents = fds->revents;
1833 short mask = 0;
1834
1835 /* convert the results back into revents */
1836 if (kevp->flags & EV_EOF)
1837 fds->revents |= POLLHUP;
1838 if (kevp->flags & EV_ERROR)
1839 fds->revents |= POLLERR;
1840
1841 switch (kevp->filter) {
1842 case EVFILT_READ:
1843 if (fds->revents & POLLHUP)
1844 mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1845 else {
1846 mask = (POLLIN | POLLRDNORM);
1847 if (kevp->flags & EV_OOBAND)
1848 mask |= (POLLPRI | POLLRDBAND);
1849 }
1850 fds->revents |= (fds->events & mask);
1851 break;
1852
1853 case EVFILT_WRITE:
1854 if (!(fds->revents & POLLHUP))
1855 fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1856 break;
1857
1858 case EVFILT_VNODE:
1859 if (kevp->fflags & NOTE_EXTEND)
1860 fds->revents |= (fds->events & POLLEXTEND);
1861 if (kevp->fflags & NOTE_ATTRIB)
1862 fds->revents |= (fds->events & POLLATTRIB);
1863 if (kevp->fflags & NOTE_LINK)
1864 fds->revents |= (fds->events & POLLNLINK);
1865 if (kevp->fflags & NOTE_WRITE)
1866 fds->revents |= (fds->events & POLLWRITE);
1867 break;
1868 }
1869
1870 if (fds->revents != 0 && prev_revents == 0)
1871 cont->pca_rfds++;
1872
1873 return 0;
1874 }
1875
1876 int
1877 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1878 {
1879
1880 return (1);
1881 }
1882
1883 /*
1884 * selcount
1885 *
1886 * Count the number of bits set in the input bit vector, and establish an
1887 * outstanding fp->f_iocount for each of the descriptors which will be in
1888 * use in the select operation.
1889 *
1890 * Parameters: p The process doing the select
1891 * ibits The input bit vector
1892 * nfd The number of fd's in the vector
1893 * countp Pointer to where to store the bit count
1894 *
1895 * Returns: 0 Success
1896 * EIO Bad per process open file table
1897 * EBADF One of the bits in the input bit vector
1898 * references an invalid fd
1899 *
1900 * Implicit: *countp (modified) Count of fd's
1901 *
1902 * Notes: This function is the first pass under the proc_fdlock() that
1903 * permits us to recognize invalid descriptors in the bit vector;
1904 * they may, however, not remain valid through the drop and
1905 * later reacquisition of the proc_fdlock().
1906 */
1907 static int
1908 selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
1909 {
1910 struct filedesc *fdp = p->p_fd;
1911 int msk, i, j, fd;
1912 u_int32_t bits;
1913 struct fileproc *fp;
1914 int n = 0;
1915 u_int32_t *iptr;
1916 u_int nw;
1917 int error=0;
1918 int dropcount;
1919 int need_wakeup = 0;
1920
1921 /*
1922 * Problems seen at reboot due to MacOSX signal problems
1923 * in Beaker1C; verify that the p->p_fd is valid
1924 */
1925 if (fdp == NULL) {
1926 *countp = 0;
1927 return(EIO);
1928 }
1929 nw = howmany(nfd, NFDBITS);
1930
1931 proc_fdlock(p);
1932 for (msk = 0; msk < 3; msk++) {
1933 iptr = (u_int32_t *)&ibits[msk * nw];
1934 for (i = 0; i < nfd; i += NFDBITS) {
1935 bits = iptr[i/NFDBITS];
1936 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1937 bits &= ~(1 << j);
1938
1939 if (fd < fdp->fd_nfiles)
1940 fp = fdp->fd_ofiles[fd];
1941 else
1942 fp = NULL;
1943
1944 if (fp == NULL ||
1945 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1946 *countp = 0;
1947 error = EBADF;
1948 goto bad;
1949 }
1950 fp->f_iocount++;
1951 n++;
1952 }
1953 }
1954 }
1955 proc_fdunlock(p);
1956
1957 *countp = n;
1958 return (0);
1959
1960 bad:
1961 dropcount = 0;
1962
1963 if (n == 0)
1964 goto out;
1965 /* Ignore error return; it's already EBADF */
1966 (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
1967
1968 out:
1969 proc_fdunlock(p);
1970 if (need_wakeup) {
1971 wakeup(&p->p_fpdrainwait);
1972 }
1973 return(error);
1974 }
1975
1976
1977 /*
1978 * seldrop_locked
1979 *
1980 * Drop outstanding wait queue references set up during selscan(); drop the
1981 * outstanding per fileproc f_iocount() picked up during the selcount().
1982 *
1983 * Parameters: p Process performing the select
1984 * ibits Input bit vector of fd's
1985 * nfd Number of fd's
1986 * lim Limit to number of vector entries to
1987 * consider, or -1 for "all"
1988 * fromselcount True if called from selcount()
1989 * need_wakeup Pointer to flag to set to do a wakeup
1990 * if f_iocount on any descriptor goes to 0
1991 *
1992 * Returns: 0 Success
1993 * EBADF One or more fds in the bit vector
1994 * were invalid, but the rest
1995 * were successfully dropped
1996 *
1997 * Notes: An fd may become bad while the proc_fdlock() is not held,
1998 * if a multithreaded application closes the fd out from under
1999 * the in progress select. In this case, we still have to
2000 * clean up after the set up on the remaining fds.
2001 */
2002 static int
2003 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
2004 {
2005 struct filedesc *fdp = p->p_fd;
2006 int msk, i, j, nc, fd;
2007 u_int32_t bits;
2008 struct fileproc *fp;
2009 u_int32_t *iptr;
2010 u_int nw;
2011 int error = 0;
2012 int dropcount = 0;
2013 uthread_t uth = get_bsdthread_info(current_thread());
2014 struct _select_data *seldata;
2015
2016 *need_wakeup = 0;
2017
2018 /*
2019 * Problems seen during reboot due to MacOSX signal handling issues
2020 * in Beaker1C; verify that p->p_fd is valid
2021 */
2022 if (fdp == NULL) {
2023 return(EIO);
2024 }
2025
2026 nw = howmany(nfd, NFDBITS);
2027 seldata = &uth->uu_kevent.ss_select_data;
2028
2029 nc = 0;
2030 for (msk = 0; msk < 3; msk++) {
2031 iptr = (u_int32_t *)&ibits[msk * nw];
2032 for (i = 0; i < nfd; i += NFDBITS) {
2033 bits = iptr[i/NFDBITS];
2034 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
2035 bits &= ~(1 << j);
2036 fp = fdp->fd_ofiles[fd];
2037 /*
2038 * If we've already dropped as many as were
2039 * counted/scanned, then we are done.
2040 */
2041 if ((fromselcount != 0) && (++dropcount > lim))
2042 goto done;
2043
2044 /*
2045 * unlink even potentially NULL fileprocs.
2046 * If the FD was closed from under us, we
2047 * still need to cleanup the waitq links!
2048 */
2049 selunlinkfp(fp,
2050 seldata->wqp ? seldata->wqp[nc] : 0,
2051 uth->uu_wqset);
2052
2053 nc++;
2054
2055 if (fp == NULL) {
2056 /* skip (now) bad fds */
2057 error = EBADF;
2058 continue;
2059 }
2060
2061 fp->f_iocount--;
2062 if (fp->f_iocount < 0)
2063 panic("f_iocount overdecrement!");
2064
2065 if (fp->f_iocount == 0) {
2066 /*
2067 * The last iocount is responsible for clearing
2068 * the selconflict flag - even if we didn't set it -
2069 * and is also responsible for waking up anyone
2070 * waiting on iocounts to drain.
2071 */
2072 if (fp->f_flags & FP_SELCONFLICT)
2073 fp->f_flags &= ~FP_SELCONFLICT;
2074 if (p->p_fpdrainwait) {
2075 p->p_fpdrainwait = 0;
2076 *need_wakeup = 1;
2077 }
2078 }
2079 }
2080 }
2081 }
2082 done:
2083 return (error);
2084 }
2085
2086
2087 static int
2088 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
2089 {
2090 int error;
2091 int need_wakeup = 0;
2092
2093 proc_fdlock(p);
2094 error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
2095 proc_fdunlock(p);
2096 if (need_wakeup) {
2097 wakeup(&p->p_fpdrainwait);
2098 }
2099 return (error);
2100 }
2101
2102 /*
2103 * Record a select request.
2104 */
2105 void
2106 selrecord(__unused struct proc *selector, struct selinfo *sip, void *s_data)
2107 {
2108 thread_t cur_act = current_thread();
2109 struct uthread * ut = get_bsdthread_info(cur_act);
2110 /* on input, s_data points to the 64-bit ID of a reserved link object */
2111 uint64_t *reserved_link = (uint64_t *)s_data;
2112
2113 /* need to look at collisions */
2114
2115 /* do not record if this is the second pass of select */
2116 if (!s_data)
2117 return;
2118
2119 if ((sip->si_flags & SI_INITED) == 0) {
2120 waitq_init(&sip->si_waitq, SYNC_POLICY_FIFO);
2121 sip->si_flags |= SI_INITED;
2122 sip->si_flags &= ~SI_CLEAR;
2123 }
2124
2125 if (sip->si_flags & SI_RECORDED)
2126 sip->si_flags |= SI_COLL;
2127 else
2128 sip->si_flags &= ~SI_COLL;
2129
2130 sip->si_flags |= SI_RECORDED;
2131 /* note: this checks for pre-existing linkage */
2132 waitq_link(&sip->si_waitq, ut->uu_wqset,
2133 WAITQ_SHOULD_LOCK, reserved_link);
2134
2135 /*
2136 * Always consume the reserved link.
2137 * We can always call waitq_link_release() safely because if
2138 * waitq_link is successful, it consumes the link and resets the
2139 * value to 0, in which case our call to release becomes a no-op.
2140 * If waitq_link fails, then the following release call will actually
2141 * release the reserved link object.
2142 */
2143 waitq_link_release(*reserved_link);
2144 *reserved_link = 0;
2145
2146 /*
2147 * Use the s_data pointer as an output parameter as well
2148 * This avoids changing the prototype for this function which is
2149 * used by many kexts. We need to surface the waitq object
2150 * associated with the selinfo we just added to the thread's select
2151 * set. New waitq sets do not have back-pointers to set members, so
2152 * the only way to clear out set linkage objects is to go from the
2153 * waitq to the set. We use a memcpy because s_data could be
2154 * pointing to an unaligned value on the stack
2155 * (especially on 32-bit systems)
2156 */
2157 void *wqptr = (void *)&sip->si_waitq;
2158 memcpy((void *)s_data, (void *)&wqptr, sizeof(void *));
2159
2160 return;
2161 }
2162
2163 void
2164 selwakeup(struct selinfo *sip)
2165 {
2166
2167 if ((sip->si_flags & SI_INITED) == 0) {
2168 return;
2169 }
2170
2171 if (sip->si_flags & SI_COLL) {
2172 nselcoll++;
2173 sip->si_flags &= ~SI_COLL;
2174 #if 0
2175 /* will not support */
2176 //wakeup((caddr_t)&selwait);
2177 #endif
2178 }
2179
2180 if (sip->si_flags & SI_RECORDED) {
2181 waitq_wakeup64_all(&sip->si_waitq, NO_EVENT64,
2182 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
2183 sip->si_flags &= ~SI_RECORDED;
2184 }
2185
2186 }
2187
2188 void
2189 selthreadclear(struct selinfo *sip)
2190 {
2191 struct waitq *wq;
2192
2193 if ((sip->si_flags & SI_INITED) == 0) {
2194 return;
2195 }
2196 if (sip->si_flags & SI_RECORDED) {
2197 selwakeup(sip);
2198 sip->si_flags &= ~(SI_RECORDED | SI_COLL);
2199 }
2200 sip->si_flags |= SI_CLEAR;
2201 sip->si_flags &= ~SI_INITED;
2202
2203 wq = &sip->si_waitq;
2204
2205 /*
2206 * Higher level logic may have a handle on this waitq's prepost ID,
2207 * but that's OK because the waitq_deinit will remove/invalidate the
2208 * prepost object (as well as mark the waitq invalid). This de-couples
2209 * us from any callers that may have a handle to this waitq via the
2210 * prepost ID.
2211 */
2212 waitq_deinit(wq);
2213 }
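/*
 * Illustrative sketch (not part of the original source) of the driver-side
 * pattern the selrecord()/selwakeup()/selthreadclear() routines above
 * support.  The softc layout and the names mydev_select, sc_si and sc_ready
 * are hypothetical; the sel* calls and the wql cookie handed down from
 * selscan() are the real interface.
 *
 *	static int
 *	mydev_select(dev_t dev, int which, void *wql, struct proc *p)
 *	{
 *		struct mydev_softc *sc = mydev_lookup(dev);
 *
 *		if (which == FREAD && sc->sc_ready)
 *			return (1);			// ready right now
 *		selrecord(p, &sc->sc_si, wql);		// park caller on our selinfo
 *		return (0);
 *	}
 *
 *	// data-arrival (e.g. interrupt) path:	selwakeup(&sc->sc_si);
 *	// teardown path, before freeing sc:	selthreadclear(&sc->sc_si);
 */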
2214
2215
2216
2217
2218 #define DBG_POST 0x10
2219 #define DBG_WATCH 0x11
2220 #define DBG_WAIT 0x12
2221 #define DBG_MOD 0x13
2222 #define DBG_EWAKEUP 0x14
2223 #define DBG_ENQUEUE 0x15
2224 #define DBG_DEQUEUE 0x16
2225
2226 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
2227 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
2228 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
2229 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
2230 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
2231 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
2232 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
2233
2234
2235 #define EVPROCDEQUE(p, evq) do { \
2236 proc_lock(p); \
2237 if (evq->ee_flags & EV_QUEUED) { \
2238 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
2239 evq->ee_flags &= ~EV_QUEUED; \
2240 } \
2241 proc_unlock(p); \
2242 } while (0);
2243
2244
2245 /*
2246 * called upon socket close. dequeue and free all events for
2247 * the socket... socket must be locked by caller.
2248 */
2249 void
2250 evsofree(struct socket *sp)
2251 {
2252 struct eventqelt *evq, *next;
2253 proc_t p;
2254
2255 if (sp == NULL)
2256 return;
2257
2258 for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
2259 next = evq->ee_slist.tqe_next;
2260 p = evq->ee_proc;
2261
2262 if (evq->ee_flags & EV_QUEUED) {
2263 EVPROCDEQUE(p, evq);
2264 }
2265 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
2266 FREE(evq, M_TEMP);
2267 }
2268 }
2269
2270
2271 /*
2272 * called upon pipe close. dequeue and free all events for
2273 * the pipe... pipe must be locked by caller
2274 */
2275 void
2276 evpipefree(struct pipe *cpipe)
2277 {
2278 struct eventqelt *evq, *next;
2279 proc_t p;
2280
2281 for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
2282 next = evq->ee_slist.tqe_next;
2283 p = evq->ee_proc;
2284
2285 EVPROCDEQUE(p, evq);
2286
2287 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
2288 FREE(evq, M_TEMP);
2289 }
2290 }
2291
2292
2293 /*
2294 * enqueue this event if it's not already queued. wakeup
2295 * the proc if we do queue this event to it...
2296 * called without the proc lock held; we take it here, drop it
2297 * before doing the wakeup, and return with it dropped
2298 */
2299 static void
2300 evprocenque(struct eventqelt *evq)
2301 {
2302 proc_t p;
2303
2304 assert(evq);
2305 p = evq->ee_proc;
2306
2307 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
2308
2309 proc_lock(p);
2310
2311 if (evq->ee_flags & EV_QUEUED) {
2312 proc_unlock(p);
2313
2314 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2315 return;
2316 }
2317 evq->ee_flags |= EV_QUEUED;
2318
2319 TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
2320
2321 proc_unlock(p);
2322
2323 wakeup(&p->p_evlist);
2324
2325 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
2326 }
2327
2328
2329 /*
2330 * pipe lock must be taken by the caller
2331 */
2332 void
2333 postpipeevent(struct pipe *pipep, int event)
2334 {
2335 int mask;
2336 struct eventqelt *evq;
2337
2338 if (pipep == NULL)
2339 return;
2340 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
2341
2342 for (evq = pipep->pipe_evlist.tqh_first;
2343 evq != NULL; evq = evq->ee_slist.tqe_next) {
2344
2345 if (evq->ee_eventmask == 0)
2346 continue;
2347 mask = 0;
2348
2349 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
2350
2351 case EV_RWBYTES:
2352 if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
2353 mask |= EV_RE;
2354 evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
2355 }
2356 if ((evq->ee_eventmask & EV_WR) &&
2357 (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
2358
2359 if (pipep->pipe_state & PIPE_EOF) {
2360 mask |= EV_WR|EV_RESET;
2361 break;
2362 }
2363 mask |= EV_WR;
2364 evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
2365 }
2366 break;
2367
2368 case EV_WCLOSED:
2369 case EV_RCLOSED:
2370 if ((evq->ee_eventmask & EV_RE)) {
2371 mask |= EV_RE|EV_RCLOSED;
2372 }
2373 if ((evq->ee_eventmask & EV_WR)) {
2374 mask |= EV_WR|EV_WCLOSED;
2375 }
2376 break;
2377
2378 default:
2379 return;
2380 }
2381 if (mask) {
2382 /*
2383 * disarm... postevents are nops until this event is 'read' via
2384 * waitevent and then re-armed via modwatch
2385 */
2386 evq->ee_eventmask = 0;
2387
2388 /*
2389 * since events are disarmed until after the waitevent
2390 * the ee_req.er_xxxx fields can't change once we've
2391 * inserted this event into the proc queue...
2392 * therefore, the waitevent will see a 'consistent'
2393 * snapshot of the event, even though it won't hold
2394 * the pipe lock, and we're updating the event outside
2395 * of the proc lock, which it will hold
2396 */
2397 evq->ee_req.er_eventbits |= mask;
2398
2399 KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
2400
2401 evprocenque(evq);
2402 }
2403 }
2404 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
2405 }
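/*
 * Hypothetical call-site sketch: the pipe read/write paths post events
 * while already holding the pipe lock, per the locking note above.
 *
 *	PIPE_LOCK(cpipe);
 *	... move data into cpipe->pipe_buffer ...
 *	postpipeevent(cpipe, EV_RWBYTES);
 *	PIPE_UNLOCK(cpipe);
 */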
2406
2407 #if SOCKETS
2408 /*
2409 * given either a sockbuf or a socket run down the
2410 * event list and queue ready events found...
2411 * the socket must be locked by the caller
2412 */
2413 void
2414 postevent(struct socket *sp, struct sockbuf *sb, int event)
2415 {
2416 int mask;
2417 struct eventqelt *evq;
2418 struct tcpcb *tp;
2419
2420 if (sb)
2421 sp = sb->sb_so;
2422 if (sp == NULL)
2423 return;
2424
2425 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);
2426
2427 for (evq = sp->so_evlist.tqh_first;
2428 evq != NULL; evq = evq->ee_slist.tqe_next) {
2429
2430 if (evq->ee_eventmask == 0)
2431 continue;
2432 mask = 0;
2433
2434 /* ready for reading:
2435 - byte cnt >= receive low water mark
2436 - read-half of conn closed
2437 - conn pending for listening sock
2438 - socket error pending
2439
2440 ready for writing
2441 - byte cnt avail >= send low water mark
2442 - write half of conn closed
2443 - socket error pending
2444 - non-blocking conn completed successfully
2445
2446 exception pending
2447 - out of band data
2448 - sock at out of band mark
2449 */
2450
2451 switch (event & EV_DMASK) {
2452
2453 case EV_OOB:
2454 if ((evq->ee_eventmask & EV_EX)) {
2455 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2456 mask |= EV_EX|EV_OOB;
2457 }
2458 break;
2459
2460 case EV_RWBYTES|EV_OOB:
2461 if ((evq->ee_eventmask & EV_EX)) {
2462 if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
2463 mask |= EV_EX|EV_OOB;
2464 }
2465 /*
2466 * fall into the next case
2467 */
2468 case EV_RWBYTES:
2469 if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
2470 /* for AFP/OT purposes; may go away in future */
2471 if ((SOCK_DOM(sp) == PF_INET ||
2472 SOCK_DOM(sp) == PF_INET6) &&
2473 SOCK_PROTO(sp) == IPPROTO_TCP &&
2474 (sp->so_error == ECONNREFUSED ||
2475 sp->so_error == ECONNRESET)) {
2476 if (sp->so_pcb == NULL ||
2477 sotoinpcb(sp)->inp_state ==
2478 INPCB_STATE_DEAD ||
2479 (tp = sototcpcb(sp)) == NULL ||
2480 tp->t_state == TCPS_CLOSED) {
2481 mask |= EV_RE|EV_RESET;
2482 break;
2483 }
2484 }
2485 mask |= EV_RE;
2486 evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
2487
2488 if (sp->so_state & SS_CANTRCVMORE) {
2489 mask |= EV_FIN;
2490 break;
2491 }
2492 }
2493 if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
2494 /* for AFP/OT purposes; may go away in future */
2495 if ((SOCK_DOM(sp) == PF_INET ||
2496 SOCK_DOM(sp) == PF_INET6) &&
2497 SOCK_PROTO(sp) == IPPROTO_TCP &&
2498 (sp->so_error == ECONNREFUSED ||
2499 sp->so_error == ECONNRESET)) {
2500 if (sp->so_pcb == NULL ||
2501 sotoinpcb(sp)->inp_state ==
2502 INPCB_STATE_DEAD ||
2503 (tp = sototcpcb(sp)) == NULL ||
2504 tp->t_state == TCPS_CLOSED) {
2505 mask |= EV_WR|EV_RESET;
2506 break;
2507 }
2508 }
2509 mask |= EV_WR;
2510 evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
2511 }
2512 break;
2513
2514 case EV_RCONN:
2515 if ((evq->ee_eventmask & EV_RE)) {
2516 mask |= EV_RE|EV_RCONN;
2517 evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
2518 }
2519 break;
2520
2521 case EV_WCONN:
2522 if ((evq->ee_eventmask & EV_WR)) {
2523 mask |= EV_WR|EV_WCONN;
2524 }
2525 break;
2526
2527 case EV_RCLOSED:
2528 if ((evq->ee_eventmask & EV_RE)) {
2529 mask |= EV_RE|EV_RCLOSED;
2530 }
2531 break;
2532
2533 case EV_WCLOSED:
2534 if ((evq->ee_eventmask & EV_WR)) {
2535 mask |= EV_WR|EV_WCLOSED;
2536 }
2537 break;
2538
2539 case EV_FIN:
2540 if (evq->ee_eventmask & EV_RE) {
2541 mask |= EV_RE|EV_FIN;
2542 }
2543 break;
2544
2545 case EV_RESET:
2546 case EV_TIMEOUT:
2547 if (evq->ee_eventmask & EV_RE) {
2548 mask |= EV_RE | event;
2549 }
2550 if (evq->ee_eventmask & EV_WR) {
2551 mask |= EV_WR | event;
2552 }
2553 break;
2554
2555 default:
2556 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
2557 return;
2558 } /* switch */
2559
2560 KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
2561
2562 if (mask) {
2563 /*
2564 * disarm... postevents are nops until this event is 'read' via
2565 * waitevent and then re-armed via modwatch
2566 */
2567 evq->ee_eventmask = 0;
2568
2569 /*
2570 * since events are disarmed until after the waitevent
2571 * the ee_req.er_xxxx fields can't change once we've
2572 * inserted this event into the proc queue...
2573 * since waitevent can't see this event until we
2574 * enqueue it, waitevent will see a 'consistent'
2575 * snapshot of the event, even though it won't hold
2576 * the socket lock, and we're updating the event outside
2577 * of the proc lock, which it will hold
2578 */
2579 evq->ee_req.er_eventbits |= mask;
2580
2581 evprocenque(evq);
2582 }
2583 }
2584 KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
2585 }
2586 #endif /* SOCKETS */
2587
2588
2589 /*
2590 * watchevent system call. user passes us an event to watch
2591 * for. we malloc an event object, initialize it, and queue
2592 * it to the open socket. when the event occurs, postevent()
2593 * will enqueue it back to our proc where we can retrieve it
2594 * via waitevent().
2595 *
2596 * should this prevent duplicate events on same socket?
2597 *
2598 * Returns:
2599 * ENOMEM No memory for operation
2600 * copyin:EFAULT
2601 */
2602 int
2603 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2604 {
2605 struct eventqelt *evq = (struct eventqelt *)0;
2606 struct eventqelt *np = NULL;
2607 struct eventreq64 *erp;
2608 struct fileproc *fp = NULL;
2609 int error;
2610
2611 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2612
2613 // get a qelt and fill it with the user's request
2614 MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2615
2616 if (evq == NULL)
2617 return (ENOMEM);
2618 erp = &evq->ee_req;
2619
2620 // get the user's request pkt
2621
2622 if (IS_64BIT_PROCESS(p)) {
2623 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
2624 } else {
2625 struct eventreq32 er32;
2626
2627 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
2628 if (error == 0) {
2629 /*
2630 * the user only passes in the
2631 * er_type, er_handle and er_data...
2632 * the other fields are initialized
2633 * below, so don't bother to copy
2634 */
2635 erp->er_type = er32.er_type;
2636 erp->er_handle = er32.er_handle;
2637 erp->er_data = (user_addr_t)er32.er_data;
2638 }
2639 }
2640 if (error) {
2641 FREE(evq, M_TEMP);
2642 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2643
2644 return(error);
2645 }
2646 KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2647
2648 // validate, freeing qelt if errors
2649 error = 0;
2650 proc_fdlock(p);
2651
2652 if (erp->er_type != EV_FD) {
2653 error = EINVAL;
2654 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2655 error = EBADF;
2656 #if SOCKETS
2657 } else if (fp->f_type == DTYPE_SOCKET) {
2658 socket_lock((struct socket *)fp->f_data, 1);
2659 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2660 #endif /* SOCKETS */
2661 } else if (fp->f_type == DTYPE_PIPE) {
2662 PIPE_LOCK((struct pipe *)fp->f_data);
2663 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2664 } else {
2665 fp_drop(p, erp->er_handle, fp, 1);
2666 error = EINVAL;
2667 }
2668 proc_fdunlock(p);
2669
2670 if (error) {
2671 FREE(evq, M_TEMP);
2672
2673 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2674 return(error);
2675 }
2676
2677 /*
2678 * only allow one watch per file per proc
2679 */
2680 for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2681 if (np->ee_proc == p) {
2682 #if SOCKETS
2683 if (fp->f_type == DTYPE_SOCKET)
2684 socket_unlock((struct socket *)fp->f_data, 1);
2685 else
2686 #endif /* SOCKETS */
2687 PIPE_UNLOCK((struct pipe *)fp->f_data);
2688 fp_drop(p, erp->er_handle, fp, 0);
2689 FREE(evq, M_TEMP);
2690
2691 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2692 return(EINVAL);
2693 }
2694 }
2695 erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2696 evq->ee_proc = p;
2697 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2698 evq->ee_flags = 0;
2699
2700 #if SOCKETS
2701 if (fp->f_type == DTYPE_SOCKET) {
2702 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2703 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2704
2705 socket_unlock((struct socket *)fp->f_data, 1);
2706 } else
2707 #endif /* SOCKETS */
2708 {
2709 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2710 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2711
2712 PIPE_UNLOCK((struct pipe *)fp->f_data);
2713 }
2714 fp_drop_event(p, erp->er_handle, fp);
2715
2716 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2717 return(0);
2718 }
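/*
 * Hedged user-space sketch of registering a watch.  watchevent() is a
 * legacy private syscall; the prototype and struct eventreq layout are
 * assumed from the historical <sys/ev.h> interface and may not exist in
 * current SDKs.
 *
 *	struct eventreq er;
 *	bzero(&er, sizeof(er));
 *	er.er_type   = EV_FD;			// only EV_FD is accepted above
 *	er.er_handle = sock_fd;			// socket or pipe descriptor
 *	er.er_data   = (void *)my_cookie;	// returned with each event
 *	if (watchevent(&er, EV_RE | EV_WR) != 0)
 *		err(1, "watchevent");
 */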
2719
2720
2721
2722 /*
2723 * waitevent system call.
2724 * grabs the next waiting event for this proc and returns
2725 * it. if there are no events, the user can request to sleep with
2726 * or without a timeout, or use poll mode
2727 * ((tv != NULL && interval == 0) || tv == -1)
2728 */
2729 int
2730 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2731 {
2732 int error = 0;
2733 struct eventqelt *evq;
2734 struct eventreq64 *erp;
2735 uint64_t abstime, interval;
2736 boolean_t fast_poll = FALSE;
2737 union {
2738 struct eventreq64 er64;
2739 struct eventreq32 er32;
2740 } uer;
2741
2742 interval = 0;
2743
2744 if (uap->tv) {
2745 struct timeval atv;
2746 /*
2747 * check for fast poll method
2748 */
2749 if (IS_64BIT_PROCESS(p)) {
2750 if (uap->tv == (user_addr_t)-1)
2751 fast_poll = TRUE;
2752 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
2753 fast_poll = TRUE;
2754
2755 if (fast_poll == TRUE) {
2756 if (p->p_evlist.tqh_first == NULL) {
2757 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
2758 /*
2759 * poll failed
2760 */
2761 *retval = 1;
2762 return (0);
2763 }
2764 proc_lock(p);
2765 goto retry;
2766 }
2767 if (IS_64BIT_PROCESS(p)) {
2768 struct user64_timeval atv64;
2769 error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
2770 /* Loses resolution - assume timeout < 68 years */
2771 atv.tv_sec = atv64.tv_sec;
2772 atv.tv_usec = atv64.tv_usec;
2773 } else {
2774 struct user32_timeval atv32;
2775 error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
2776 atv.tv_sec = atv32.tv_sec;
2777 atv.tv_usec = atv32.tv_usec;
2778 }
2779
2780 if (error)
2781 return(error);
2782 if (itimerfix(&atv)) {
2783 error = EINVAL;
2784 return(error);
2785 }
2786 interval = tvtoabstime(&atv);
2787 }
2788 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2789
2790 proc_lock(p);
2791 retry:
2792 if ((evq = p->p_evlist.tqh_first) != NULL) {
2793 /*
2794 * found one... make a local copy while it's still on the queue
2795 * to prevent it from changing while in the midst of copying
2796 * don't want to hold the proc lock across a copyout because
2797 * it might block on a page fault at the target in user space
2798 */
2799 erp = &evq->ee_req;
2800
2801 if (IS_64BIT_PROCESS(p))
2802 bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
2803 else {
2804 uer.er32.er_type = erp->er_type;
2805 uer.er32.er_handle = erp->er_handle;
2806 uer.er32.er_data = (uint32_t)erp->er_data;
2807 uer.er32.er_ecnt = erp->er_ecnt;
2808 uer.er32.er_rcnt = erp->er_rcnt;
2809 uer.er32.er_wcnt = erp->er_wcnt;
2810 uer.er32.er_eventbits = erp->er_eventbits;
2811 }
2812 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2813
2814 evq->ee_flags &= ~EV_QUEUED;
2815
2816 proc_unlock(p);
2817
2818 if (IS_64BIT_PROCESS(p))
2819 error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
2820 else
2821 error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
2822
2823 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2824 evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
2825 return (error);
2826 }
2827 else {
2828 if (uap->tv && interval == 0) {
2829 proc_unlock(p);
2830 *retval = 1; // poll failed
2831
2832 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2833 return (error);
2834 }
2835 if (interval != 0)
2836 clock_absolutetime_interval_to_deadline(interval, &abstime);
2837 else
2838 abstime = 0;
2839
2840 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
2841
2842 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2843
2844 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
2845
2846 if (error == 0)
2847 goto retry;
2848 if (error == ERESTART)
2849 error = EINTR;
2850 if (error == EWOULDBLOCK) {
2851 *retval = 1;
2852 error = 0;
2853 }
2854 }
2855 proc_unlock(p);
2856
2857 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2858 return (error);
2859 }
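/*
 * Hedged user-space sketch of collecting events.  Per the comment above,
 * tv == (struct timeval *)-1 selects the fast-poll path, a zero timeval
 * polls once, and a NULL tv blocks indefinitely; the prototype is assumed
 * from the legacy <sys/ev.h> interface.
 *
 *	struct eventreq er;
 *	struct timeval tv = { 0, 0 };		// poll, do not sleep
 *	int rv = waitevent(&er, &tv);
 *	if (rv == 0)				// an event was copied out
 *		handle_fd_event(er.er_handle, er.er_eventbits);
 *	else if (rv == 1)			// nothing pending
 *		;
 */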
2860
2861
2862 /*
2863 * modwatch system call. user passes in event to modify.
2864 * if we find it, we reset the event bits and queue/dequeue the
2865 * event as needed.
2866 */
2867 int
2868 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2869 {
2870 struct eventreq64 er;
2871 struct eventreq64 *erp = &er;
2872 struct eventqelt *evq = NULL; /* protected by error return */
2873 int error;
2874 struct fileproc *fp;
2875 int flag;
2876
2877 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2878
2879 /*
2880 * get user's request pkt
2881 * just need the er_type and er_handle which sit above the
2882 * problematic er_data (32/64 issue)... so only copy in
2883 * those 2 fields
2884 */
2885 if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
2886 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2887 return(error);
2888 }
2889 proc_fdlock(p);
2890
2891 if (erp->er_type != EV_FD) {
2892 error = EINVAL;
2893 } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2894 error = EBADF;
2895 #if SOCKETS
2896 } else if (fp->f_type == DTYPE_SOCKET) {
2897 socket_lock((struct socket *)fp->f_data, 1);
2898 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2899 #endif /* SOCKETS */
2900 } else if (fp->f_type == DTYPE_PIPE) {
2901 PIPE_LOCK((struct pipe *)fp->f_data);
2902 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2903 } else {
2904 fp_drop(p, erp->er_handle, fp, 1);
2905 error = EINVAL;
2906 }
2907
2908 if (error) {
2909 proc_fdunlock(p);
2910 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2911 return(error);
2912 }
2913
2914 if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2915 fp->f_flags &= ~FP_WAITEVENT;
2916 }
2917 proc_fdunlock(p);
2918
2919 // locate event if possible
2920 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2921 if (evq->ee_proc == p)
2922 break;
2923 }
2924 if (evq == NULL) {
2925 #if SOCKETS
2926 if (fp->f_type == DTYPE_SOCKET)
2927 socket_unlock((struct socket *)fp->f_data, 1);
2928 else
2929 #endif /* SOCKETS */
2930 PIPE_UNLOCK((struct pipe *)fp->f_data);
2931 fp_drop(p, erp->er_handle, fp, 0);
2932 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2933 return(EINVAL);
2934 }
2935 KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
2936
2937 if (uap->u_eventmask == EV_RM) {
2938 EVPROCDEQUE(p, evq);
2939
2940 #if SOCKETS
2941 if (fp->f_type == DTYPE_SOCKET) {
2942 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2943 socket_unlock((struct socket *)fp->f_data, 1);
2944 } else
2945 #endif /* SOCKETS */
2946 {
2947 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2948 PIPE_UNLOCK((struct pipe *)fp->f_data);
2949 }
2950 fp_drop(p, erp->er_handle, fp, 0);
2951 FREE(evq, M_TEMP);
2952 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2953 return(0);
2954 }
2955 switch (uap->u_eventmask & EV_MASK) {
2956
2957 case 0:
2958 flag = 0;
2959 break;
2960
2961 case EV_RE:
2962 case EV_WR:
2963 case EV_RE|EV_WR:
2964 flag = EV_RWBYTES;
2965 break;
2966
2967 case EV_EX:
2968 flag = EV_OOB;
2969 break;
2970
2971 case EV_EX|EV_RE:
2972 case EV_EX|EV_WR:
2973 case EV_EX|EV_RE|EV_WR:
2974 flag = EV_OOB|EV_RWBYTES;
2975 break;
2976
2977 default:
2978 #if SOCKETS
2979 if (fp->f_type == DTYPE_SOCKET)
2980 socket_unlock((struct socket *)fp->f_data, 1);
2981 else
2982 #endif /* SOCKETS */
2983 PIPE_UNLOCK((struct pipe *)fp->f_data);
2984 fp_drop(p, erp->er_handle, fp, 0);
2985 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2986 return(EINVAL);
2987 }
2988 /*
2989 * since we're holding the socket/pipe lock, the event
2990 * cannot go from the unqueued state to the queued state
2991 * however, it can go from the queued state to the unqueued state
2992 * since that direction is protected by the proc_lock...
2993 * so do a quick check for EV_QUEUED w/o holding the proc lock
2994 * since by far the common case will be NOT EV_QUEUED, this saves
2995 * us taking the proc_lock the majority of the time
2996 */
2997 if (evq->ee_flags & EV_QUEUED) {
2998 /*
2999 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
3000 */
3001 EVPROCDEQUE(p, evq);
3002 }
3003 /*
3004 * while the event is off the proc queue and
3005 * we're holding the socket/pipe lock
3006 * it's safe to update these fields...
3007 */
3008 evq->ee_req.er_eventbits = 0;
3009 evq->ee_eventmask = uap->u_eventmask & EV_MASK;
3010
3011 #if SOCKETS
3012 if (fp->f_type == DTYPE_SOCKET) {
3013 postevent((struct socket *)fp->f_data, 0, flag);
3014 socket_unlock((struct socket *)fp->f_data, 1);
3015 } else
3016 #endif /* SOCKETS */
3017 {
3018 postpipeevent((struct pipe *)fp->f_data, flag);
3019 PIPE_UNLOCK((struct pipe *)fp->f_data);
3020 }
3021 fp_drop(p, erp->er_handle, fp, 0);
3022 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
3023 return(0);
3024 }
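/*
 * Hedged user-space sketch of re-arming and removing a watch via
 * modwatch() (legacy private syscall; prototype assumed from <sys/ev.h>).
 * Events are disarmed once delivered, so callers re-arm after waitevent();
 * EV_RM tears the watch down entirely.
 *
 *	er.er_type   = EV_FD;
 *	er.er_handle = sock_fd;
 *	modwatch(&er, EV_RE | EV_WR);		// re-arm after consuming an event
 *	...
 *	modwatch(&er, EV_RM);			// done: remove the watch
 */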
3025
3026 /* this routine is called from the close of fd with proc_fdlock held */
3027 int
3028 waitevent_close(struct proc *p, struct fileproc *fp)
3029 {
3030 struct eventqelt *evq;
3031
3032
3033 fp->f_flags &= ~FP_WAITEVENT;
3034
3035 #if SOCKETS
3036 if (fp->f_type == DTYPE_SOCKET) {
3037 socket_lock((struct socket *)fp->f_data, 1);
3038 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
3039 } else
3040 #endif /* SOCKETS */
3041 if (fp->f_type == DTYPE_PIPE) {
3042 PIPE_LOCK((struct pipe *)fp->f_data);
3043 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
3044 }
3045 else {
3046 return(EINVAL);
3047 }
3048 proc_fdunlock(p);
3049
3050
3051 // locate event if possible
3052 for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
3053 if (evq->ee_proc == p)
3054 break;
3055 }
3056 if (evq == NULL) {
3057 #if SOCKETS
3058 if (fp->f_type == DTYPE_SOCKET)
3059 socket_unlock((struct socket *)fp->f_data, 1);
3060 else
3061 #endif /* SOCKETS */
3062 PIPE_UNLOCK((struct pipe *)fp->f_data);
3063
3064 proc_fdlock(p);
3065
3066 return(EINVAL);
3067 }
3068 EVPROCDEQUE(p, evq);
3069
3070 #if SOCKETS
3071 if (fp->f_type == DTYPE_SOCKET) {
3072 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
3073 socket_unlock((struct socket *)fp->f_data, 1);
3074 } else
3075 #endif /* SOCKETS */
3076 {
3077 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
3078 PIPE_UNLOCK((struct pipe *)fp->f_data);
3079 }
3080 FREE(evq, M_TEMP);
3081
3082 proc_fdlock(p);
3083
3084 return(0);
3085 }
3086
3087
3088 /*
3089 * gethostuuid
3090 *
3091 * Description: Get the host UUID from IOKit and return it to user space.
3092 *
3093 * Parameters: uuid_buf Pointer to buffer to receive UUID
3094 * timeout Timespec for the timeout
3095 * spi SPI, skip sandbox check (temporary)
3096 *
3097 * Returns: 0 Success
3098 * EWOULDBLOCK Timeout is too short
3099 * copyout:EFAULT Bad user buffer
3100 * mac_system_check_info:EPERM Client not allowed to perform this operation
3101 *
3102 * Notes: A timeout seems redundant, since if it's tolerable to not
3103 * have a system UUID in hand, then why ask for one?
3104 */
3105 int
3106 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
3107 {
3108 kern_return_t kret;
3109 int error;
3110 mach_timespec_t mach_ts; /* for IOKit call */
3111 __darwin_uuid_t uuid_kern; /* for IOKit call */
3112
3113 if (!uap->spi) {
3114 }
3115
3116 /* Convert the 32/64 bit timespec into a mach_timespec_t */
3117 if ( proc_is64bit(p) ) {
3118 struct user64_timespec ts;
3119 error = copyin(uap->timeoutp, &ts, sizeof(ts));
3120 if (error)
3121 return (error);
3122 mach_ts.tv_sec = ts.tv_sec;
3123 mach_ts.tv_nsec = ts.tv_nsec;
3124 } else {
3125 struct user32_timespec ts;
3126 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
3127 if (error)
3128 return (error);
3129 mach_ts.tv_sec = ts.tv_sec;
3130 mach_ts.tv_nsec = ts.tv_nsec;
3131 }
3132
3133 /* Call IOKit with the stack buffer to get the UUID */
3134 kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
3135
3136 /*
3137 * If we get it, copy out the data to the user buffer; note that a
3138 * uuid_t is an array of characters, so this is size invariant for
3139 * 32 vs. 64 bit.
3140 */
3141 if (kret == KERN_SUCCESS) {
3142 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
3143 } else {
3144 error = EWOULDBLOCK;
3145 }
3146
3147 return (error);
3148 }
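/*
 * User-space usage sketch; assumes the gethostuuid() wrapper exported to
 * user space and the uuid(3) helpers, with error handling elided.
 *
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <uuid/uuid.h>
 *
 *	uuid_t uuid;
 *	struct timespec timeout = { 5, 0 };	// wait at most 5 seconds
 *	uuid_string_t str;
 *
 *	if (gethostuuid(uuid, &timeout) == 0) {
 *		uuid_unparse(uuid, str);
 *		printf("host UUID: %s\n", str);
 *	}
 */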
3149
3150 /*
3151 * ledger
3152 *
3153 * Description: Omnibus system call for ledger operations
3154 */
3155 int
3156 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
3157 {
3158 #if !CONFIG_MACF
3159 #pragma unused(p)
3160 #endif
3161 int rval, pid, len, error;
3162 #ifdef LEDGER_DEBUG
3163 struct ledger_limit_args lla;
3164 #endif
3165 task_t task;
3166 proc_t proc;
3167
3168 /* Finish copying in the necessary args before taking the proc lock */
3169 error = 0;
3170 len = 0;
3171 if (args->cmd == LEDGER_ENTRY_INFO)
3172 error = copyin(args->arg3, (char *)&len, sizeof (len));
3173 else if (args->cmd == LEDGER_TEMPLATE_INFO)
3174 error = copyin(args->arg2, (char *)&len, sizeof (len));
3175 else if (args->cmd == LEDGER_LIMIT)
3176 #ifdef LEDGER_DEBUG
3177 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
3178 #else
3179 return (EINVAL);
3180 #endif
3181 else if ((args->cmd < 0) || (args->cmd > LEDGER_MAX_CMD))
3182 return (EINVAL);
3183
3184 if (error)
3185 return (error);
3186 if (len < 0)
3187 return (EINVAL);
3188
3189 rval = 0;
3190 if (args->cmd != LEDGER_TEMPLATE_INFO) {
3191 pid = args->arg1;
3192 proc = proc_find(pid);
3193 if (proc == NULL)
3194 return (ESRCH);
3195
3196 #if CONFIG_MACF
3197 error = mac_proc_check_ledger(p, proc, args->cmd);
3198 if (error) {
3199 proc_rele(proc);
3200 return (error);
3201 }
3202 #endif
3203
3204 task = proc->task;
3205 }
3206
3207 switch (args->cmd) {
3208 #ifdef LEDGER_DEBUG
3209 case LEDGER_LIMIT: {
3210 if (!kauth_cred_issuser(kauth_cred_get()))
3211 rval = EPERM;
3212 else rval = ledger_limit(task, &lla);
3213 proc_rele(proc);
3214 break;
3215 }
3216 #endif
3217 case LEDGER_INFO: {
3218 struct ledger_info info;
3219
3220 rval = ledger_info(task, &info);
3221 proc_rele(proc);
3222 if (rval == 0)
3223 rval = copyout(&info, args->arg2,
3224 sizeof (info));
3225 break;
3226 }
3227
3228 case LEDGER_ENTRY_INFO: {
3229 void *buf;
3230 int sz;
3231
3232 rval = ledger_get_task_entry_info_multiple(task, &buf, &len);
3233 proc_rele(proc);
3234 if ((rval == 0) && (len >= 0)) {
3235 sz = len * sizeof (struct ledger_entry_info);
3236 rval = copyout(buf, args->arg2, sz);
3237 kfree(buf, sz);
3238 }
3239 if (rval == 0)
3240 rval = copyout(&len, args->arg3, sizeof (len));
3241 break;
3242 }
3243
3244 case LEDGER_TEMPLATE_INFO: {
3245 void *buf;
3246 int sz;
3247
3248 rval = ledger_template_info(&buf, &len);
3249 if ((rval == 0) && (len >= 0)) {
3250 sz = len * sizeof (struct ledger_template_info);
3251 rval = copyout(buf, args->arg1, sz);
3252 kfree(buf, sz);
3253 }
3254 if (rval == 0)
3255 rval = copyout(&len, args->arg2, sizeof (len));
3256 break;
3257 }
3258
3259 default:
3260 panic("ledger syscall logic error -- command type %d", args->cmd);
3261 proc_rele(proc);
3262 rval = EINVAL;
3263 }
3264
3265 return (rval);
3266 }
3267
3268 int
3269 telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t *retval)
3270 {
3271 int error = 0;
3272
3273 switch (args->cmd) {
3274 #if CONFIG_TELEMETRY
3275 case TELEMETRY_CMD_TIMER_EVENT:
3276 error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
3277 break;
3278 #endif /* CONFIG_TELEMETRY */
3279 case TELEMETRY_CMD_VOUCHER_NAME:
3280 if (thread_set_voucher_name((mach_port_name_t)args->deadline))
3281 error = EINVAL;
3282 break;
3283
3284 default:
3285 error = EINVAL;
3286 break;
3287 }
3288
3289 return (error);
3290 }
3291
3292 #if defined(DEVELOPMENT) || defined(DEBUG)
3293 #if CONFIG_WAITQ_DEBUG
3294 static uint64_t g_wqset_num = 0;
3295 struct g_wqset {
3296 queue_chain_t link;
3297 struct waitq_set *wqset;
3298 };
3299
3300 static queue_head_t g_wqset_list;
3301 static struct waitq_set *g_waitq_set = NULL;
3302
3303 static inline struct waitq_set *sysctl_get_wqset(int idx)
3304 {
3305 struct g_wqset *gwqs;
3306
3307 if (!g_wqset_num)
3308 queue_init(&g_wqset_list);
3309
3310 /* don't bother with locks: this is test-only code! */
3311 qe_foreach_element(gwqs, &g_wqset_list, link) {
3312 if ((int)(wqset_id(gwqs->wqset) & 0xffffffff) == idx)
3313 return gwqs->wqset;
3314 }
3315
3316 /* allocate a new one */
3317 ++g_wqset_num;
3318 gwqs = (struct g_wqset *)kalloc(sizeof(*gwqs));
3319 assert(gwqs != NULL);
3320
3321 gwqs->wqset = waitq_set_alloc(SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST, NULL);
3322 enqueue_tail(&g_wqset_list, &gwqs->link);
3323 printf("[WQ]: created new waitq set 0x%llx\n", wqset_id(gwqs->wqset));
3324
3325 return gwqs->wqset;
3326 }
3327
3328 #define MAX_GLOBAL_TEST_QUEUES 64
3329 static int g_wq_init = 0;
3330 static struct waitq g_wq[MAX_GLOBAL_TEST_QUEUES];
3331
3332 static inline struct waitq *global_test_waitq(int idx)
3333 {
3334 if (idx < 0)
3335 return NULL;
3336
3337 if (!g_wq_init) {
3338 g_wq_init = 1;
3339 for (int i = 0; i < MAX_GLOBAL_TEST_QUEUES; i++)
3340 waitq_init(&g_wq[i], SYNC_POLICY_FIFO);
3341 }
3342
3343 return &g_wq[idx % MAX_GLOBAL_TEST_QUEUES];
3344 }
3345
3346 static int sysctl_waitq_wakeup_one SYSCTL_HANDLER_ARGS
3347 {
3348 #pragma unused(oidp, arg1, arg2)
3349 int error;
3350 int index;
3351 struct waitq *waitq;
3352 kern_return_t kr;
3353 int64_t event64 = 0;
3354
3355 error = SYSCTL_IN(req, &event64, sizeof(event64));
3356 if (error)
3357 return error;
3358
3359 if (!req->newptr)
3360 return SYSCTL_OUT(req, &event64, sizeof(event64));
3361
3362 if (event64 < 0) {
3363 index = (int)((-event64) & 0xffffffff);
3364 waitq = wqset_waitq(sysctl_get_wqset(index));
3365 index = -index;
3366 } else {
3367 index = (int)event64;
3368 waitq = global_test_waitq(index);
3369 }
3370
3371 event64 = 0;
3372
3373 printf("[WQ]: Waking one thread on waitq [%d] event:0x%llx\n",
3374 index, event64);
3375 kr = waitq_wakeup64_one(waitq, (event64_t)event64, THREAD_AWAKENED,
3376 WAITQ_ALL_PRIORITIES);
3377 printf("[WQ]: \tkr=%d\n", kr);
3378
3379 return SYSCTL_OUT(req, &kr, sizeof(kr));
3380 }
3381 SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_one, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3382 0, 0, sysctl_waitq_wakeup_one, "Q", "wakeup one thread waiting on given event");
3383
3384
3385 static int sysctl_waitq_wakeup_all SYSCTL_HANDLER_ARGS
3386 {
3387 #pragma unused(oidp, arg1, arg2)
3388 int error;
3389 int index;
3390 struct waitq *waitq;
3391 kern_return_t kr;
3392 int64_t event64 = 0;
3393
3394 error = SYSCTL_IN(req, &event64, sizeof(event64));
3395 if (error)
3396 return error;
3397
3398 if (!req->newptr)
3399 return SYSCTL_OUT(req, &event64, sizeof(event64));
3400
3401 if (event64 < 0) {
3402 index = (int)((-event64) & 0xffffffff);
3403 waitq = wqset_waitq(sysctl_get_wqset(index));
3404 index = -index;
3405 } else {
3406 index = (int)event64;
3407 waitq = global_test_waitq(index);
3408 }
3409
3410 event64 = 0;
3411
3412 printf("[WQ]: Waking all threads on waitq [%d] event:0x%llx\n",
3413 index, event64);
3414 kr = waitq_wakeup64_all(waitq, (event64_t)event64,
3415 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
3416 printf("[WQ]: \tkr=%d\n", kr);
3417
3418 return SYSCTL_OUT(req, &kr, sizeof(kr));
3419 }
3420 SYSCTL_PROC(_kern, OID_AUTO, waitq_wakeup_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3421 0, 0, sysctl_waitq_wakeup_all, "Q", "wakeup all threads waiting on given event");
3422
3423
3424 static int sysctl_waitq_wait SYSCTL_HANDLER_ARGS
3425 {
3426 #pragma unused(oidp, arg1, arg2)
3427 int error;
3428 int index;
3429 struct waitq *waitq;
3430 kern_return_t kr;
3431 int64_t event64 = 0;
3432
3433 error = SYSCTL_IN(req, &event64, sizeof(event64));
3434 if (error)
3435 return error;
3436
3437 if (!req->newptr)
3438 return SYSCTL_OUT(req, &event64, sizeof(event64));
3439
3440 if (event64 < 0) {
3441 index = (int)((-event64) & 0xffffffff);
3442 waitq = wqset_waitq(sysctl_get_wqset(index));
3443 index = -index;
3444 } else {
3445 index = (int)event64;
3446 waitq = global_test_waitq(index);
3447 }
3448
3449 event64 = 0;
3450
3451 printf("[WQ]: Current thread waiting on waitq [%d] event:0x%llx\n",
3452 index, event64);
3453 kr = waitq_assert_wait64(waitq, (event64_t)event64, THREAD_INTERRUPTIBLE, 0);
3454 if (kr == THREAD_WAITING)
3455 thread_block(THREAD_CONTINUE_NULL);
3456 printf("[WQ]: \tWoke Up: kr=%d\n", kr);
3457
3458 return SYSCTL_OUT(req, &kr, sizeof(kr));
3459 }
3460 SYSCTL_PROC(_kern, OID_AUTO, waitq_wait, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3461 0, 0, sysctl_waitq_wait, "Q", "start waiting on given event");
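/*
 * Hypothetical user-space sketch for exercising the debug-only wait/wakeup
 * sysctls above (DEVELOPMENT/DEBUG kernels with CONFIG_WAITQ_DEBUG only).
 * A non-negative 64-bit value written to the sysctl selects a global test
 * waitq by index; the handler's kern_return_t is surfaced via the old buffer.
 *
 *	#include <sys/sysctl.h>
 *
 *	int64_t idx = 3;			// global_test_waitq(3)
 *	int kr;
 *	size_t len = sizeof(kr);
 *
 *	// thread A: block on test waitq [3]
 *	sysctlbyname("kern.waitq_wait", &kr, &len, &idx, sizeof(idx));
 *	// thread B: wake one waiter on test waitq [3]
 *	sysctlbyname("kern.waitq_wakeup_one", &kr, &len, &idx, sizeof(idx));
 */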
3462
3463
3464 static int sysctl_wqset_select SYSCTL_HANDLER_ARGS
3465 {
3466 #pragma unused(oidp, arg1, arg2)
3467 int error;
3468 struct waitq_set *wqset;
3469 uint64_t event64 = 0;
3470
3471 error = SYSCTL_IN(req, &event64, sizeof(event64));
3472 if (error)
3473 return error;
3474
3475 if (!req->newptr)
3476 goto out;
3477
3478 wqset = sysctl_get_wqset((int)(event64 & 0xffffffff));
3479 g_waitq_set = wqset;
3480
3481 event64 = wqset_id(wqset);
3482 printf("[WQ]: selected wqset 0x%llx\n", event64);
3483
3484 out:
3485 if (g_waitq_set)
3486 event64 = wqset_id(g_waitq_set);
3487 else
3488 event64 = (uint64_t)(-1);
3489
3490 return SYSCTL_OUT(req, &event64, sizeof(event64));
3491 }
3492 SYSCTL_PROC(_kern, OID_AUTO, wqset_select, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3493 0, 0, sysctl_wqset_select, "Q", "select/create a global waitq set");
3494
3495
3496 static int sysctl_waitq_link SYSCTL_HANDLER_ARGS
3497 {
3498 #pragma unused(oidp, arg1, arg2)
3499 int error;
3500 int index;
3501 struct waitq *waitq;
3502 struct waitq_set *wqset;
3503 kern_return_t kr = KERN_SUCCESS;
3504 uint64_t reserved_link = 0;
3505 int64_t event64 = 0;
3506
3507 error = SYSCTL_IN(req, &event64, sizeof(event64));
3508 if (error)
3509 return error;
3510
3511 if (!req->newptr)
3512 return SYSCTL_OUT(req, &event64, sizeof(event64));
3513
3514 if (!g_waitq_set)
3515 g_waitq_set = sysctl_get_wqset(1);
3516 wqset = g_waitq_set;
3517
3518 if (event64 < 0) {
3519 struct waitq_set *tmp;
3520 index = (int)((-event64) & 0xffffffff);
3521 tmp = sysctl_get_wqset(index);
3522 if (tmp == wqset)
3523 goto out;
3524 waitq = wqset_waitq(tmp);
3525 index = -index;
3526 } else {
3527 index = (int)event64;
3528 waitq = global_test_waitq(index);
3529 }
3530
3531 printf("[WQ]: linking waitq [%d] to global wqset (0x%llx)\n",
3532 index, wqset_id(wqset));
3533 reserved_link = waitq_link_reserve(waitq);
3534 kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
3535 waitq_link_release(reserved_link);
3536
3537 printf("[WQ]: \tkr=%d\n", kr);
3538
3539 out:
3540 return SYSCTL_OUT(req, &kr, sizeof(kr));
3541 }
3542 SYSCTL_PROC(_kern, OID_AUTO, waitq_link, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3543 0, 0, sysctl_waitq_link, "Q", "link global waitq to test waitq set");
3544
3545
3546 static int sysctl_waitq_unlink SYSCTL_HANDLER_ARGS
3547 {
3548 #pragma unused(oidp, arg1, arg2)
3549 int error;
3550 int index;
3551 struct waitq *waitq;
3552 struct waitq_set *wqset;
3553 kern_return_t kr;
3554 uint64_t event64 = 0;
3555
3556 error = SYSCTL_IN(req, &event64, sizeof(event64));
3557 if (error)
3558 return error;
3559
3560 if (!req->newptr)
3561 return SYSCTL_OUT(req, &event64, sizeof(event64));
3562
3563 if (!g_waitq_set)
3564 g_waitq_set = sysctl_get_wqset(1);
3565 wqset = g_waitq_set;
3566
3567 index = (int)event64;
3568 waitq = global_test_waitq(index);
3569
3570 printf("[WQ]: unlinking waitq [%d] from global wqset (0x%llx)\n",
3571 index, wqset_id(wqset));
3572
3573 kr = waitq_unlink(waitq, wqset);
3574 printf("[WQ]: \tkr=%d\n", kr);
3575
3576 return SYSCTL_OUT(req, &kr, sizeof(kr));
3577 }
3578 SYSCTL_PROC(_kern, OID_AUTO, waitq_unlink, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3579 0, 0, sysctl_waitq_unlink, "Q", "unlink global waitq from test waitq set");
3580
3581
3582 static int sysctl_waitq_clear_prepost SYSCTL_HANDLER_ARGS
3583 {
3584 #pragma unused(oidp, arg1, arg2)
3585 struct waitq *waitq;
3586 uint64_t event64 = 0;
3587 int error, index;
3588
3589 error = SYSCTL_IN(req, &event64, sizeof(event64));
3590 if (error)
3591 return error;
3592
3593 if (!req->newptr)
3594 return SYSCTL_OUT(req, &event64, sizeof(event64));
3595
3596 index = (int)event64;
3597 waitq = global_test_waitq(index);
3598
3599 printf("[WQ]: clearing prepost on waitq [%d]\n", index);
3600 waitq_clear_prepost(waitq);
3601
3602 return SYSCTL_OUT(req, &event64, sizeof(event64));
3603 }
3604 SYSCTL_PROC(_kern, OID_AUTO, waitq_clear_prepost, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3605 0, 0, sysctl_waitq_clear_prepost, "Q", "clear prepost on given waitq");
3606
3607
3608 static int sysctl_wqset_unlink_all SYSCTL_HANDLER_ARGS
3609 {
3610 #pragma unused(oidp, arg1, arg2)
3611 int error;
3612 struct waitq_set *wqset;
3613 kern_return_t kr;
3614 uint64_t event64 = 0;
3615
3616 error = SYSCTL_IN(req, &event64, sizeof(event64));
3617 if (error)
3618 return error;
3619
3620 if (!req->newptr)
3621 return SYSCTL_OUT(req, &event64, sizeof(event64));
3622
3623 if (!g_waitq_set)
3624 g_waitq_set = sysctl_get_wqset(1);
3625 wqset = g_waitq_set;
3626
3627 printf("[WQ]: unlinking all queues from global wqset (0x%llx)\n",
3628 wqset_id(wqset));
3629
3630 kr = waitq_set_unlink_all(wqset);
3631 printf("[WQ]: \tkr=%d\n", kr);
3632
3633 return SYSCTL_OUT(req, &kr, sizeof(kr));
3634 }
3635 SYSCTL_PROC(_kern, OID_AUTO, wqset_unlink_all, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3636 0, 0, sysctl_wqset_unlink_all, "Q", "unlink all queues from test waitq set");
3637
3638
3639 static int sysctl_wqset_clear_preposts SYSCTL_HANDLER_ARGS
3640 {
3641 #pragma unused(oidp, arg1, arg2)
3642 struct waitq_set *wqset = NULL;
3643 uint64_t event64 = 0;
3644 int error, index;
3645
3646 error = SYSCTL_IN(req, &event64, sizeof(event64));
3647 if (error)
3648 return error;
3649
3650 if (!req->newptr)
3651 goto out;
3652
3653 index = (int)((event64) & 0xffffffff);
3654 wqset = sysctl_get_wqset(index);
3655 assert(wqset != NULL);
3656
3657 printf("[WQ]: clearing preposts on wqset 0x%llx\n", wqset_id(wqset));
3658 waitq_set_clear_preposts(wqset);
3659
3660 out:
3661 if (wqset)
3662 event64 = wqset_id(wqset);
3663 else
3664 event64 = (uint64_t)(-1);
3665
3666 return SYSCTL_OUT(req, &event64, sizeof(event64));
3667 }
3668 SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
3669 0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
3670
3671 #endif /* CONFIG_WAITQ_DEBUG */
3672 #endif /* defined(DEVELOPMENT) || defined(DEBUG) */