bsd/kern/sys_generic.c

   1 /*
   2  * Copyright (c) 2006 Apple Computer, Inc. All Rights Reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  31 /*
  32  * Copyright (c) 1982, 1986, 1989, 1993
  33  *      The Regents of the University of California.  All rights reserved.
  34  * (c) UNIX System Laboratories, Inc.
  35  * All or some portions of this file are derived from material licensed
  36  * to the University of California by American Telephone and Telegraph
  37  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  38  * the permission of UNIX System Laboratories, Inc.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. All advertising materials mentioning features or use of this software
  49  *    must display the following acknowledgement:
  50  *      This product includes software developed by the University of
  51  *      California, Berkeley and its contributors.
  52  * 4. Neither the name of the University nor the names of its contributors
  53  *    may be used to endorse or promote products derived from this software
  54  *    without specific prior written permission.
  55  *
  56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  66  * SUCH DAMAGE.
  67  *
  68  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
  69  */
  70
  71 #include <sys/param.h>
  72 #include <sys/systm.h>
  73 #include <sys/filedesc.h>
  74 #include <sys/ioctl.h>
  75 #include <sys/file_internal.h>
  76 #include <sys/proc_internal.h>
  77 #include <sys/socketvar.h>
  78 #if KTRACE
  79 #include <sys/uio_internal.h>
  80 #else
  81 #include <sys/uio.h>
  82 #endif
  83 #include <sys/kernel.h>
  84 #include <sys/stat.h>
  85 #include <sys/malloc.h>
  86 #include <sys/sysproto.h>
  87
  88 #include <sys/mount_internal.h>
  89 #include <sys/protosw.h>
  90 #include <sys/ev.h>
  91 #include <sys/user.h>
  92 #include <sys/kdebug.h>
  93 #include <sys/poll.h>
  94 #include <sys/event.h>
  95 #include <sys/eventvar.h>
  96
  97 #include <mach/mach_types.h>
  98 #include <kern/kern_types.h>
  99 #include <kern/assert.h>
 100 #include <kern/kalloc.h>
 101 #include <kern/thread.h>
 102 #include <kern/clock.h>
 103
 104 #include <sys/mbuf.h>
 105 #include <sys/socket.h>
 106 #include <sys/socketvar.h>
 107 #include <sys/errno.h>
 108 #include <sys/syscall.h>
 109 #include <sys/pipe.h>
 110
 111 #include <bsm/audit_kernel.h>
 112
 113 #include <net/if.h>
 114 #include <net/route.h>
 115
 116 #include <netinet/in.h>
 117 #include <netinet/in_systm.h>
 118 #include <netinet/ip.h>
 119 #include <netinet/in_pcb.h>
 120 #include <netinet/ip_var.h>
 121 #include <netinet/ip6.h>
 122 #include <netinet/tcp.h>
 123 #include <netinet/tcp_fsm.h>
 124 #include <netinet/tcp_seq.h>
 125 #include <netinet/tcp_timer.h>
 126 #include <netinet/tcp_var.h>
 127 #include <netinet/tcpip.h>
 128 #include <netinet/tcp_debug.h>
 129 /* for wait queue based select */
 130 #include <kern/wait_queue.h>
 131 #include <kern/kalloc.h>
 132 #if KTRACE
 133 #include <sys/ktrace.h>
 134 #endif
 135 #include <sys/vnode_internal.h>
 136
 137 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 138 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 139 extern void     *get_bsduthreadarg(thread_t);
 140 extern int      *get_bsduthreadrval(thread_t);
 141
 142 __private_extern__ int  dofileread(struct proc *p, struct fileproc *fp, int fd,
 143                                                                    user_addr_t bufp, user_size_t nbyte,
 144                                                                    off_t offset, int flags, user_ssize_t *retval);
 145 __private_extern__ int  dofilewrite(struct proc *p, struct fileproc *fp, int fd,
 146                                                                         user_addr_t bufp, user_size_t nbyte,
 147                                                                         off_t offset, int flags, user_ssize_t *retval);
 148 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
 149 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
 150
 151 #if NETAT
 152 extern int appletalk_inited;
 153 #endif /* NETAT */
 154
 155 #define f_flag f_fglob->fg_flag
 156 #define f_type f_fglob->fg_type
 157 #define f_msgcount f_fglob->fg_msgcount
 158 #define f_cred f_fglob->fg_cred
 159 #define f_ops f_fglob->fg_ops
 160 #define f_offset f_fglob->fg_offset
 161 #define f_data f_fglob->fg_data
 162 /*
 163  * Read system call.
 164  */
 165 int
 166 read(p, uap, retval)
 167         struct proc *p;
 168         register struct read_args *uap;
 169         user_ssize_t *retval;
 170 {
 171         struct fileproc *fp;
 172         int error;
 173         int fd = uap->fd;
 174
 175         if ( (error = preparefileread(p, &fp, fd, 0)) )
 176                 return (error);
 177
 178         error = dofileread(p, fp, uap->fd, uap->cbuf, uap->nbyte,
 179                            (off_t)-1, 0, retval);
 180
 181         donefileread(p, fp, fd);
 182
 183         return (error);
 184 }
 185
 186 /*
 187  * Pread system call
 188  */
 189 int
 190 pread(p, uap, retval)
 191         struct proc *p;
 192         register struct pread_args *uap;
 193         user_ssize_t *retval;
 194 {
 195         struct fileproc *fp;
 196         int fd = uap->fd;
 197         int error;
 198
 199         if ( (error = preparefileread(p, &fp, fd, 1)) )
 200                 return (error);
 201
 202         error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
 203                         uap->offset, FOF_OFFSET, retval);
 204
 205         donefileread(p, fp, fd);
 206
 207         if (!error)
 208             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
 209               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 210
 211         return (error);
 212 }
 213
 214 /*
 215  * Code common for read and pread
 216  */
 217
 218 void
 219 donefileread(struct proc *p, struct fileproc *fp, int fd)
 220 {
 221         proc_fdlock(p);
 222
 223         fp->f_flags &= ~FP_INCHRREAD;
 224
 225         fp_drop(p, fd, fp, 1);
 226         proc_fdunlock(p);
 227 }
 228
 229 int
 230 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
 231 {
 232         vnode_t vp;
 233         int     error;
 234         struct fileproc *fp;
 235
 236         proc_fdlock(p);
 237
 238         error = fp_lookup(p, fd, &fp, 1);
 239
 240         if (error) {
 241                 proc_fdunlock(p);
 242                 return (error);
 243         }
 244         if ((fp->f_flag & FREAD) == 0) {
 245                 error = EBADF;
 246                 goto out;
 247         }
 248         if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
 249                 error = ESPIPE;
 250                 goto out;
 251         }
 252         if (fp->f_type == DTYPE_VNODE) {
 253                 vp = (struct vnode *)fp->f_fglob->fg_data;
 254
 255                 if (vp->v_type == VCHR)
 256                         fp->f_flags |= FP_INCHRREAD;
 257         }
 258
 259         *fp_ret = fp;
 260
 261         proc_fdunlock(p);
 262         return (0);
 263
 264 out:
 265         fp_drop(p, fd, fp, 1);
 266         proc_fdunlock(p);
 267         return (error);
 268 }
 269
 270
 271 __private_extern__ int
 272 dofileread(p, fp, fd, bufp, nbyte, offset, flags, retval)
 273         struct proc *p;
 274         struct fileproc *fp;
 275         int fd, flags;
 276         user_addr_t bufp;
 277         user_size_t nbyte;
 278         off_t offset;
 279         user_ssize_t *retval;
 280 {
 281         uio_t auio;
 282         user_ssize_t bytecnt;
 283         long error = 0;
 284         char uio_buf[ UIO_SIZEOF(1) ];
 285 #if KTRACE
 286         uio_t ktruio = NULL;
 287         char ktr_uio_buf[ UIO_SIZEOF(1) ];
 288         int didktr = 0;
 289 #endif
 290
 291         // LP64todo - do we want to raise this?
 292         if (nbyte > INT_MAX)
 293                 return (EINVAL);
 294
 295         if (IS_64BIT_PROCESS(p)) {
 296                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 297                                                                           &uio_buf[0], sizeof(uio_buf));
 298         } else {
 299                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 300                                                                           &uio_buf[0], sizeof(uio_buf));
 301         }
 302         uio_addiov(auio, bufp, nbyte);
 303
 304 #if KTRACE
 305         /*
 306         * if tracing, save a copy of iovec
 307         */
 308         if (KTRPOINT(p, KTR_GENIO)) {
 309                 didktr = 1;
 310
 311                 if (IS_64BIT_PROCESS(p)) {
 312                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 313                                                                           &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 314                 } else {
 315                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 316                                                                           &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 317                 }
 318                 uio_addiov(ktruio, bufp, nbyte);
 319         }
 320 #endif
 321         bytecnt = nbyte;
 322
 323         if ((error = fo_read(fp, auio, fp->f_cred, flags, p))) {
 324                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 325                         error == EINTR || error == EWOULDBLOCK))
 326                         error = 0;
 327         }
 328         bytecnt -= uio_resid(auio);
 329 #if KTRACE
 330         if (didktr && error == 0) {
 331                 uio_setresid(ktruio, bytecnt);
 332                 ktrgenio(p->p_tracep, fd, UIO_READ, ktruio, error);
 333         }
 334 #endif
 335
 336         *retval = bytecnt;
 337
 338         return (error);
 339 }
 340
 341 /*
 342  * Scatter read system call.
 343  */
 344 int
 345 readv(p, uap, retval)
 346         struct proc *p;
 347         register struct readv_args *uap;
 348         user_ssize_t *retval;
 349 {
 350         uio_t auio = NULL;
 351         int error;
 352         int size_of_iovec;
 353         struct user_iovec *iovp;
 354
 355         /* Verify range bedfore calling uio_create() */
 356         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 357                 return (EINVAL);
 358
 359         /* allocate a uio large enough to hold the number of iovecs passed */
 360         auio = uio_create(uap->iovcnt, 0,
 361                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 362                                   UIO_READ);
 363
 364         /* get location of iovecs within the uio.  then copyin the iovecs from
 365          * user space.
 366          */
 367         iovp = uio_iovsaddr(auio);
 368         if (iovp == NULL) {
 369                 error = ENOMEM;
 370                 goto ExitThisRoutine;
 371         }
 372         size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
 373         error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
 374         if (error) {
 375                 goto ExitThisRoutine;
 376         }
 377
 378         /* finalize uio_t for use and do the IO
 379          */
 380         uio_calculateresid(auio);
 381         error = rd_uio(p, uap->fd, auio, retval);
 382
 383 ExitThisRoutine:
 384         if (auio != NULL) {
 385                 uio_free(auio);
 386         }
 387         return (error);
 388 }
 389
 390 /*
 391  * Write system call
 392  */
 393 int
 394 write(p, uap, retval)
 395         struct proc *p;
 396         register struct write_args *uap;
 397         user_ssize_t *retval;
 398 {
 399         struct fileproc *fp;
 400         int error;
 401         int fd = uap->fd;
 402
 403         error = fp_lookup(p,fd,&fp,0);
 404         if (error)
 405                 return(error);
 406         if ((fp->f_flag & FWRITE) == 0) {
 407                 error = EBADF;
 408         } else {
 409                 error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte,
 410                         (off_t)-1, 0, retval);
 411         }
 412         if (error == 0)
 413                 fp_drop_written(p, fd, fp);
 414         else
 415                 fp_drop(p, fd, fp, 0);
 416         return(error);
 417 }
 418
 419 /*
 420  * pwrite system call
 421  */
 422 int
 423 pwrite(p, uap, retval)
 424         struct proc *p;
 425         register struct pwrite_args *uap;
 426         user_ssize_t *retval;
 427 {
 428         struct fileproc *fp;
 429         int error;
 430         int fd = uap->fd;
 431
 432         error = fp_lookup(p,fd,&fp,0);
 433         if (error)
 434                 return(error);
 435
 436         if ((fp->f_flag & FWRITE) == 0) {
 437                 error = EBADF;
 438         } else {
 439                 if (fp->f_type != DTYPE_VNODE) {
 440                         error = ESPIPE;
 441                 } else {
 442                     error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
 443                         uap->offset, FOF_OFFSET, retval);
 444                 }
 445         }
 446         if (error == 0)
 447                 fp_drop_written(p, fd, fp);
 448         else
 449                 fp_drop(p, fd, fp, 0);
 450
 451         if (!error)
 452             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
 453               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 454
 455         return(error);
 456 }
 457
 458 __private_extern__ int
 459 dofilewrite(p, fp, fd, bufp, nbyte, offset, flags, retval)
 460         struct proc *p;
 461         struct fileproc *fp;
 462         int fd, flags;
 463         user_addr_t bufp;
 464         user_size_t nbyte;
 465         off_t offset;
 466         user_ssize_t *retval;
 467 {
 468         uio_t auio;
 469         long error = 0;
 470         user_ssize_t bytecnt;
 471         char uio_buf[ UIO_SIZEOF(1) ];
 472 #if KTRACE
 473         uio_t ktruio;
 474         int didktr = 0;
 475         char ktr_uio_buf[ UIO_SIZEOF(1) ];
 476 #endif
 477
 478         // LP64todo - do we want to raise this?
 479         if (nbyte > INT_MAX)
 480                 return (EINVAL);
 481
 482         if (IS_64BIT_PROCESS(p)) {
 483                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 484                                                                           &uio_buf[0], sizeof(uio_buf));
 485         } else {
 486                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 487                                                                           &uio_buf[0], sizeof(uio_buf));
 488         }
 489         uio_addiov(auio, bufp, nbyte);
 490
 491 #if KTRACE
 492         /*
 493         * if tracing, save a copy of iovec and uio
 494         */
 495         if (KTRPOINT(p, KTR_GENIO)) {
 496                 didktr = 1;
 497
 498                 if (IS_64BIT_PROCESS(p)) {
 499                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 500                                                                                   &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 501                 } else {
 502                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 503                                                                                   &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 504                 }
 505                 uio_addiov(ktruio, bufp, nbyte);
 506         }
 507 #endif
 508         bytecnt = nbyte;
 509         if ((error = fo_write(fp, auio, fp->f_cred, flags, p))) {
 510                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 511                         error == EINTR || error == EWOULDBLOCK))
 512                         error = 0;
 513                 /* The socket layer handles SIGPIPE */
 514                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
 515                         psignal(p, SIGPIPE);
 516         }
 517         bytecnt -= uio_resid(auio);
 518 #if KTRACE
 519         if (didktr && error == 0) {
 520                 uio_setresid(ktruio, bytecnt);
 521                 ktrgenio(p->p_tracep, fd, UIO_WRITE, ktruio, error);
 522         }
 523 #endif
 524         *retval = bytecnt;
 525
 526         return (error);
 527 }
 528
 529 /*
 530  * Gather write system call
 531  */
 532 int
 533 writev(p, uap, retval)
 534         struct proc *p;
 535         register struct writev_args *uap;
 536         user_ssize_t *retval;
 537 {
 538         uio_t auio = NULL;
 539         int error;
 540         int size_of_iovec;
 541         struct user_iovec *iovp;
 542
 543         /* Verify range bedfore calling uio_create() */
 544         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 545                 return (EINVAL);
 546
 547         /* allocate a uio large enough to hold the number of iovecs passed */
 548         auio = uio_create(uap->iovcnt, 0,
 549                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 550                                   UIO_WRITE);
 551
 552         /* get location of iovecs within the uio.  then copyin the iovecs from
 553          * user space.
 554          */
 555         iovp = uio_iovsaddr(auio);
 556         if (iovp == NULL) {
 557                 error = ENOMEM;
 558                 goto ExitThisRoutine;
 559         }
 560         size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
 561         error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
 562         if (error) {
 563                 goto ExitThisRoutine;
 564         }
 565
 566         /* finalize uio_t for use and do the IO
 567          */
 568         uio_calculateresid(auio);
 569         error = wr_uio(p, uap->fd, auio, retval);
 570
 571 ExitThisRoutine:
 572         if (auio != NULL) {
 573                 uio_free(auio);
 574         }
 575         return (error);
 576 }
 577
 578
 579 int
 580 wr_uio(p, fdes, uio, retval)
 581         struct proc *p;
 582         int fdes;
 583         register uio_t uio;
 584         user_ssize_t *retval;
 585 {
 586         struct fileproc *fp;
 587         int error;
 588         user_ssize_t count;
 589 #if KTRACE
 590         struct iovec_64 *ktriov = NULL;
 591         struct uio ktruio;
 592         int didktr = 0;
 593         u_int iovlen;
 594 #endif
 595
 596         error = fp_lookup(p,fdes,&fp,0);
 597         if (error)
 598                 return(error);
 599
 600         if ((fp->f_flag & FWRITE) == 0) {
 601                 error = EBADF;
 602                 goto out;
 603         }
 604         count = uio_resid(uio);
 605 #if KTRACE
 606         /*
 607          * if tracing, save a copy of iovec
 608          */
 609         if (KTRPOINT(p, KTR_GENIO)) {
 610                 iovlen = uio->uio_iovcnt *
 611                         (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
 612                 MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
 613                 if (ktriov != NULL) {
 614                         bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
 615                         ktruio = *uio;
 616                         didktr = 1;
 617                 }
 618         }
 619 #endif
 620         error = fo_write(fp, uio, fp->f_cred, 0, p);
 621         if (error) {
 622                 if (uio_resid(uio) != count && (error == ERESTART ||
 623                                                 error == EINTR || error == EWOULDBLOCK))
 624                         error = 0;
 625                 /* The socket layer handles SIGPIPE */
 626                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
 627                         psignal(p, SIGPIPE);
 628         }
 629         *retval = count - uio_resid(uio);
 630
 631 #if KTRACE
 632         if (didktr) {
 633                 if (error == 0) {
 634                         ktruio.uio_iovs.iov64p = ktriov;
 635                         uio_setresid(&ktruio, *retval);
 636                         ktrgenio(p->p_tracep, fdes, UIO_WRITE, &ktruio, error);
 637                 }
 638                 FREE(ktriov, M_TEMP);
 639         }
 640 #endif
 641
 642 out:
 643         if ( (error == 0) )
 644                 fp_drop_written(p, fdes, fp);
 645         else
 646                 fp_drop(p, fdes, fp, 0);
 647         return(error);
 648 }
 649
 650
 651 int
 652 rd_uio(p, fdes, uio, retval)
 653         struct proc *p;
 654         int fdes;
 655         register uio_t uio;
 656         user_ssize_t *retval;
 657 {
 658         struct fileproc *fp;
 659         int error;
 660         user_ssize_t count;
 661 #if KTRACE
 662         struct iovec_64 *ktriov = NULL;
 663         struct uio ktruio;
 664         int didktr = 0;
 665         u_int iovlen;
 666 #endif
 667
 668         if ( (error = preparefileread(p, &fp, fdes, 0)) )
 669                 return (error);
 670
 671         count = uio_resid(uio);
 672 #if KTRACE
 673         /*
 674          * if tracing, save a copy of iovec
 675          */
 676         if (KTRPOINT(p, KTR_GENIO)) {
 677                 iovlen = uio->uio_iovcnt *
 678                         (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
 679                 MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
 680                 if (ktriov != NULL) {
 681                         bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
 682                         ktruio = *uio;
 683                         didktr = 1;
 684                 }
 685         }
 686 #endif
 687         error = fo_read(fp, uio, fp->f_cred, 0, p);
 688
 689         if (error) {
 690                 if (uio_resid(uio) != count && (error == ERESTART ||
 691                                                 error == EINTR || error == EWOULDBLOCK))
 692                         error = 0;
 693         }
 694         *retval = count - uio_resid(uio);
 695
 696 #if KTRACE
 697         if (didktr) {
 698                 if (error == 0) {
 699                         ktruio.uio_iovs.iov64p = ktriov;
 700                         uio_setresid(&ktruio, *retval);
 701                         ktrgenio(p->p_tracep, fdes, UIO_READ, &ktruio, error);
 702                 }
 703                 FREE(ktriov, M_TEMP);
 704         }
 705 #endif
 706         donefileread(p, fp, fdes);
 707
 708         return (error);
 709 }
 710
 711 /*
 712  * Ioctl system call
 713  *
 714  */
 715 int
 716 ioctl(struct proc *p, register struct ioctl_args *uap, __unused register_t *retval)
 717 {
 718         struct fileproc *fp;
 719         register u_long com;
 720         int error = 0;
 721         register u_int size;
 722         caddr_t datap, memp;
 723         boolean_t is64bit;
 724         int tmp;
 725 #define STK_PARAMS      128
 726         char stkbuf[STK_PARAMS];
 727         int fd = uap->fd;
 728
 729         AUDIT_ARG(fd, uap->fd);
 730         AUDIT_ARG(cmd, CAST_DOWN(int, uap->com)); /* LP64todo: uap->com is a user-land long */
 731         AUDIT_ARG(addr, uap->data);
 732
 733         is64bit = proc_is64bit(p);
 734
 735         proc_fdlock(p);
 736         error = fp_lookup(p,fd,&fp,1);
 737         if (error)  {
 738                 proc_fdunlock(p);
 739                 return(error);
 740         }
 741
 742         AUDIT_ARG(file, p, fp);
 743
 744         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 745                         error = EBADF;
 746                         goto out;
 747         }
 748
 749 #if NETAT
 750         /*
 751          * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
 752          * while implementing an ATioctl system call
 753          */
 754         {
 755                 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
 756                         u_long  fixed_command;
 757 #ifdef APPLETALK_DEBUG
 758                         kprintf("ioctl: special AppleTalk \n");
 759 #endif
 760                         datap = &stkbuf[0];
 761                         *(user_addr_t *)datap = uap->data;
 762                         fixed_command = _IOW(0, 0xff99, uap->data);
 763                         error = fo_ioctl(fp, fixed_command, datap, p);
 764                         goto out;
 765                 }
 766         }
 767
 768 #endif /* NETAT */
 769
 770
 771         switch (com = uap->com) {
 772         case FIONCLEX:
 773                 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
 774                 error =0;
 775                 goto out;
 776         case FIOCLEX:
 777                 *fdflags(p, uap->fd) |= UF_EXCLOSE;
 778                 error =0;
 779                 goto out;
 780         }
 781
 782         /*
 783          * Interpret high order word to find amount of data to be
 784          * copied to/from the user's address space.
 785          */
 786         size = IOCPARM_LEN(com);
 787         if (size > IOCPARM_MAX) {
 788                         error = ENOTTY;
 789                         goto out;
 790         }
 791         memp = NULL;
 792         if (size > sizeof (stkbuf)) {
 793                 proc_fdunlock(p);
 794                 if ((memp = (caddr_t)kalloc(size)) == 0) {
 795                         proc_fdlock(p);
 796                         error = ENOMEM;
 797                         goto out;
 798                 }
 799                 proc_fdlock(p);
 800                 datap = memp;
 801         } else
 802                 datap = &stkbuf[0];
 803         if (com&IOC_IN) {
 804                 if (size) {
 805                         proc_fdunlock(p);
 806                         error = copyin(uap->data, datap, size);
 807                         if (error) {
 808                                 if (memp)
 809                                         kfree(memp, size);
 810                                 proc_fdlock(p);
 811                                 goto out;
 812                         }
 813                         proc_fdlock(p);
 814                 } else {
 815                         /* XXX - IOC_IN and no size?  we should proably return an error here!! */
 816                         if (is64bit) {
 817                                 *(user_addr_t *)datap = uap->data;
 818                         }
 819                         else {
 820                                 *(uint32_t *)datap = (uint32_t)uap->data;
 821                         }
 822                 }
 823         } else if ((com&IOC_OUT) && size)
 824                 /*
 825                  * Zero the buffer so the user always
 826                  * gets back something deterministic.
 827                  */
 828                 bzero(datap, size);
 829         else if (com&IOC_VOID) {
 830                 /* XXX - this is odd since IOC_VOID means no parameters */
 831                 if (is64bit) {
 832                         *(user_addr_t *)datap = uap->data;
 833                 }
 834                 else {
 835                         *(uint32_t *)datap = (uint32_t)uap->data;
 836                 }
 837         }
 838
 839         switch (com) {
 840
 841         case FIONBIO:
 842                 if ( (tmp = *(int *)datap) )
 843                         fp->f_flag |= FNONBLOCK;
 844                 else
 845                         fp->f_flag &= ~FNONBLOCK;
 846                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
 847                 break;
 848
 849         case FIOASYNC:
 850                 if ( (tmp = *(int *)datap) )
 851                         fp->f_flag |= FASYNC;
 852                 else
 853                         fp->f_flag &= ~FASYNC;
 854                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
 855                 break;
 856
 857         case FIOSETOWN:
 858                 tmp = *(int *)datap;
 859                 if (fp->f_type == DTYPE_SOCKET) {
 860                         ((struct socket *)fp->f_data)->so_pgid = tmp;
 861                         error = 0;
 862                         break;
 863                 }
 864                 if (fp->f_type == DTYPE_PIPE) {
 865                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
 866                         break;
 867                 }
 868                 if (tmp <= 0) {
 869                         tmp = -tmp;
 870                 } else {
 871                         struct proc *p1 = pfind(tmp);
 872                         if (p1 == 0) {
 873                                 error = ESRCH;
 874                                 break;
 875                         }
 876                         tmp = p1->p_pgrp->pg_id;
 877                 }
 878                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
 879                 break;
 880
 881         case FIOGETOWN:
 882                 if (fp->f_type == DTYPE_SOCKET) {
 883                         error = 0;
 884                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
 885                         break;
 886                 }
 887                 error = fo_ioctl(fp, TIOCGPGRP, datap, p);
 888                 *(int *)datap = -*(int *)datap;
 889                 break;
 890
 891         default:
 892                 error = fo_ioctl(fp, com, datap, p);
 893                 /*
 894                  * Copy any data to user, size was
 895                  * already set and checked above.
 896                  */
 897                 if (error == 0 && (com&IOC_OUT) && size)
 898                         error = copyout(datap, uap->data, (u_int)size);
 899                 break;
 900         }
 901         proc_fdunlock(p);
 902         if (memp)
 903                 kfree(memp, size);
 904         proc_fdlock(p);
 905 out:
 906         fp_drop(p, fd, fp, 1);
 907         proc_fdunlock(p);
 908         return(error);
 909 }
 910
 911 int     selwait, nselcoll;
 912 #define SEL_FIRSTPASS 1
 913 #define SEL_SECONDPASS 2
 914 extern int selcontinue(int error);
 915 extern int selprocess(int error, int sel_pass);
 916 static int selscan(struct proc *p, struct _select * sel,
 917                         int nfd, register_t *retval, int sel_pass, wait_queue_sub_t wqsub);
 918 static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
 919                         int nfd, int * count);
 920 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
 921 extern uint64_t tvtoabstime(struct timeval      *tvp);
 922
 923 /*
 924  * Select system call.
 925  */
 926 int
 927 select(struct proc *p, struct select_args *uap, register_t *retval)
 928 {
 929         int error = 0;
 930         u_int ni, nw, size;
 931         thread_t th_act;
 932         struct uthread  *uth;
 933         struct _select *sel;
 934         int needzerofill = 1;
 935         int count = 0;
 936
 937         th_act = current_thread();
 938         uth = get_bsdthread_info(th_act);
 939         sel = &uth->uu_select;
 940         retval = (int *)get_bsduthreadrval(th_act);
 941         *retval = 0;
 942
 943         if (uap->nd < 0) {
 944                 return (EINVAL);
 945         }
 946
 947         if (uap->nd > p->p_fd->fd_nfiles)
 948                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
 949
 950         nw = howmany(uap->nd, NFDBITS);
 951         ni = nw * sizeof(fd_mask);
 952
 953         /*
 954          * if this is the first select by the thread
 955          * allocate the space for bits.
 956          */
 957         if (sel->nbytes == 0) {
 958                 sel->nbytes = 3 * ni;
 959                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 960                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 961                 if ((sel->ibits == NULL) || (sel->obits == NULL))
 962                         panic("select out of memory");
 963                 needzerofill = 0;
 964         }
 965
 966         /*
 967          * if the previously allocated space for the bits
 968          * is smaller than what is requested. Reallocate.
 969          */
 970         if (sel->nbytes < (3 * ni)) {
 971                 sel->nbytes = (3 * ni);
 972                 FREE(sel->ibits, M_TEMP);
 973                 FREE(sel->obits, M_TEMP);
 974                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 975                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 976                 if ((sel->ibits == NULL) || (sel->obits == NULL))
 977                         panic("select out of memory");
 978                 needzerofill = 0;
 979         }
 980
 981         if (needzerofill) {
 982                 bzero((caddr_t)sel->ibits, sel->nbytes);
 983                 bzero((caddr_t)sel->obits, sel->nbytes);
 984         }
 985
 986         /*
 987          * get the bits from the user address space
 988          */
 989 #define getbits(name, x) \
 990         do { \
 991                 if (uap->name && (error = copyin(uap->name, \
 992                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
 993                         goto continuation; \
 994         } while (0)
 995
 996         getbits(in, 0);
 997         getbits(ou, 1);
 998         getbits(ex, 2);
 999 #undef  getbits
1000
1001         if (uap->tv) {
1002                 struct timeval atv;
1003                 if (IS_64BIT_PROCESS(p)) {
1004                         struct user_timeval atv64;
1005                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1006                         /* Loses resolution - assume timeout < 68 years */
1007                         atv.tv_sec = atv64.tv_sec;
1008                         atv.tv_usec = atv64.tv_usec;
1009                 } else {
1010                         error = copyin(uap->tv, (caddr_t)&atv, sizeof(atv));
1011                 }
1012                 if (error)
1013                         goto continuation;
1014                 if (itimerfix(&atv)) {
1015                         error = EINVAL;
1016                         goto continuation;
1017                 }
1018
1019                 clock_absolutetime_interval_to_deadline(
1020                                                                                 tvtoabstime(&atv), &sel->abstime);
1021         }
1022         else
1023                 sel->abstime = 0;
1024
1025         if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count)) ) {
1026                         goto continuation;
1027         }
1028
1029         sel->count = count;
1030         size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
1031         if (sel->allocsize) {
1032                 if (sel->wqset == 0)
1033                         panic("select: wql memory smashed");
1034                 /* needed for the select now */
1035                 if (size > sel->allocsize) {
1036                         kfree(sel->wqset,  sel->allocsize);
1037                         sel->allocsize = size;
1038                         sel->wqset = (wait_queue_set_t)kalloc(size);
1039                         if (sel->wqset == (wait_queue_set_t)NULL)
1040                                 panic("failed to allocate memory for waitqueue\n");
1041                 }
1042         } else {
1043                 sel->count = count;
1044                 sel->allocsize = size;
1045                 sel->wqset = (wait_queue_set_t)kalloc(sel->allocsize);
1046                 if (sel->wqset == (wait_queue_set_t)NULL)
1047                         panic("failed to allocate memory for waitqueue\n");
1048         }
1049         bzero(sel->wqset, size);
1050         sel->wql = (char *)sel->wqset + SIZEOF_WAITQUEUE_SET;
1051         wait_queue_set_init(sel->wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
1052
1053 continuation:
1054         return selprocess(error, SEL_FIRSTPASS);
1055 }
1056
1057 int
1058 selcontinue(int error)
1059 {
1060         return selprocess(error, SEL_SECONDPASS);
1061 }
1062
1063 int
1064 selprocess(int error, int sel_pass)
1065 {
1066         int ncoll;
1067         u_int ni, nw;
1068         thread_t th_act;
1069         struct uthread  *uth;
1070         struct proc *p;
1071         struct select_args *uap;
1072         int *retval;
1073         struct _select *sel;
1074         int unwind = 1;
1075         int prepost = 0;
1076         int somewakeup = 0;
1077         int doretry = 0;
1078         wait_result_t wait_result;
1079
1080         p = current_proc();
1081         th_act = current_thread();
1082         uap = (struct select_args *)get_bsduthreadarg(th_act);
1083         retval = (int *)get_bsduthreadrval(th_act);
1084         uth = get_bsdthread_info(th_act);
1085         sel = &uth->uu_select;
1086
1087         /* if it is first pass wait queue is not setup yet */
1088         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
1089                         unwind = 0;
1090         if (sel->count == 0)
1091                         unwind = 0;
1092 retry:
1093         if (error != 0) {
1094           goto done;
1095         }
1096
1097         ncoll = nselcoll;
1098         p->p_flag |= P_SELECT;
1099         /* skip scans if the select is just for timeouts */
1100         if (sel->count) {
1101                 if (sel_pass == SEL_FIRSTPASS)
1102                         wait_queue_sub_clearrefs(sel->wqset);
1103
1104                 error = selscan(p, sel, uap->nd, retval, sel_pass, sel->wqset);
1105                 if (error || *retval) {
1106                         goto done;
1107                 }
1108                 if (prepost) {
1109                         /* if the select of log, then we canwakeup and discover some one
1110                         * else already read the data; go toselct again if time permits
1111                         */
1112                         prepost = 0;
1113                         doretry = 1;
1114                 }
1115                 if (somewakeup) {
1116                         somewakeup = 0;
1117                         doretry = 1;
1118                 }
1119         }
1120
1121         if (uap->tv) {
1122                 uint64_t        now;
1123
1124                 clock_get_uptime(&now);
1125                 if (now >= sel->abstime)
1126                         goto done;
1127         }
1128
1129         if (doretry) {
1130                 /* cleanup obits and try again */
1131                 doretry = 0;
1132                 sel_pass = SEL_FIRSTPASS;
1133                 goto retry;
1134         }
1135
1136         /*
1137          * To effect a poll, the timeout argument should be
1138          * non-nil, pointing to a zero-valued timeval structure.
1139          */
1140         if (uap->tv && sel->abstime == 0) {
1141                 goto done;
1142         }
1143
1144         /* No spurious wakeups due to colls,no need to check for them */
1145          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
1146                 sel_pass = SEL_FIRSTPASS;
1147                 goto retry;
1148         }
1149
1150         p->p_flag &= ~P_SELECT;
1151
1152         /* if the select is just for timeout skip check */
1153         if (sel->count &&(sel_pass == SEL_SECONDPASS))
1154                 panic("selprocess: 2nd pass assertwaiting");
1155
1156         /* Wait Queue Subordinate has waitqueue as first element */
1157         wait_result = wait_queue_assert_wait((wait_queue_t)sel->wqset,
1158                                              &selwait, THREAD_ABORTSAFE, sel->abstime);
1159         if (wait_result != THREAD_AWAKENED) {
1160                 /* there are no preposted events */
1161                 error = tsleep1(NULL, PSOCK | PCATCH,
1162                                 "select", 0, selcontinue);
1163         } else  {
1164                 prepost = 1;
1165                 error = 0;
1166         }
1167
1168         sel_pass = SEL_SECONDPASS;
1169         if (error == 0) {
1170                 if (!prepost)
1171                         somewakeup =1;
1172                 goto retry;
1173         }
1174 done:
1175         if (unwind) {
1176                 wait_subqueue_unlink_all(sel->wqset);
1177                 seldrop(p, sel->ibits, uap->nd);
1178         }
1179         p->p_flag &= ~P_SELECT;
1180         /* select is not restarted after signals... */
1181         if (error == ERESTART)
1182                 error = EINTR;
1183         if (error == EWOULDBLOCK)
1184                 error = 0;
1185         nw = howmany(uap->nd, NFDBITS);
1186         ni = nw * sizeof(fd_mask);
1187
1188 #define putbits(name, x) \
1189         do { \
1190                 if (uap->name && (error2 = \
1191                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
1192                         error = error2; \
1193         } while (0)
1194
1195         if (error == 0) {
1196                 int error2;
1197
1198                 putbits(in, 0);
1199                 putbits(ou, 1);
1200                 putbits(ex, 2);
1201 #undef putbits
1202         }
1203         return(error);
1204 }
1205
1206 static int
1207 selscan(p, sel, nfd, retval, sel_pass, wqsub)
1208         struct proc *p;
1209         struct _select *sel;
1210         int nfd;
1211         register_t *retval;
1212         int sel_pass;
1213         wait_queue_sub_t wqsub;
1214 {
1215         register struct filedesc *fdp = p->p_fd;
1216         register int msk, i, j, fd;
1217         register u_int32_t bits;
1218         struct fileproc *fp;
1219         int n = 0;
1220         int nc = 0;
1221         static int flag[3] = { FREAD, FWRITE, 0 };
1222         u_int32_t *iptr, *optr;
1223         u_int nw;
1224         u_int32_t *ibits, *obits;
1225         char * wql;
1226         char * wql_ptr;
1227
1228         /*
1229          * Problems when reboot; due to MacOSX signal probs
1230          * in Beaker1C ; verify that the p->p_fd is valid
1231          */
1232         if (fdp == NULL) {
1233                 *retval=0;
1234                 return(EIO);
1235         }
1236         ibits = sel->ibits;
1237         obits = sel->obits;
1238         wql = sel->wql;
1239
1240         nw = howmany(nfd, NFDBITS);
1241
1242         nc = 0;
1243         proc_fdlock(p);
1244
1245         if (sel->count) {
1246                 for (msk = 0; msk < 3; msk++) {
1247                         iptr = (u_int32_t *)&ibits[msk * nw];
1248                         optr = (u_int32_t *)&obits[msk * nw];
1249
1250                         for (i = 0; i < nfd; i += NFDBITS) {
1251                                 bits = iptr[i/NFDBITS];
1252
1253                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1254                                         bits &= ~(1 << j);
1255                                         fp = fdp->fd_ofiles[fd];
1256
1257                                         if (fp == NULL ||
1258                                                 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1259                                                 proc_fdunlock(p);
1260                                                 return(EBADF);
1261                                         }
1262                                         if (sel_pass == SEL_SECONDPASS) {
1263                                                 wql_ptr = (char *)0;
1264                                                 fp->f_flags &= ~FP_INSELECT;
1265                                                 fp->f_waddr = (void *)0;
1266                                         } else {
1267                                                 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
1268                                                 fp->f_flags |= FP_INSELECT;
1269                                                 fp->f_waddr = (void *)wqsub;
1270                                         }
1271                                         if (fp->f_ops && fo_select(fp, flag[msk], wql_ptr, p)) {
1272                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1273                                                 n++;
1274                                         }
1275                                         nc++;
1276                                 }
1277                         }
1278                 }
1279         }
1280         proc_fdunlock(p);
1281         *retval = n;
1282         return (0);
1283 }
1284
1285 static int poll_callback(struct kqueue *, struct kevent *, void *);
1286
1287 struct poll_continue_args {
1288         user_addr_t pca_fds;
1289         u_int pca_nfds;
1290         u_int pca_rfds;
1291 };
1292
1293 int
1294 poll(struct proc *p, struct poll_args *uap, register_t *retval)
1295 {
1296         struct poll_continue_args *cont;
1297         struct pollfd *fds;
1298         struct kqueue *kq;
1299         struct timeval atv;
1300         int ncoll, error = 0;
1301         u_int nfds = uap->nfds;
1302         u_int rfds = 0;
1303         u_int i;
1304         size_t ni;
1305
1306         /*
1307          * This is kinda bogus.  We have fd limits, but that is not
1308          * really related to the size of the pollfd array.  Make sure
1309          * we let the process use at least FD_SETSIZE entries and at
1310          * least enough for the current limits.  We want to be reasonably
1311          * safe, but not overly restrictive.
1312          */
1313         if (nfds > OPEN_MAX ||
1314             (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE))
1315                 return (EINVAL);
1316
1317         kq = kqueue_alloc(p);
1318         if (kq == NULL)
1319                 return (EAGAIN);
1320
1321         ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1322         MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1323         if (NULL == cont) {
1324                 error = EAGAIN;
1325                 goto out;
1326         }
1327
1328         fds = (struct pollfd *)&cont[1];
1329         error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1330         if (error)
1331                 goto out;
1332
1333         if (uap->timeout != -1) {
1334                 struct timeval rtv;
1335
1336                 atv.tv_sec = uap->timeout / 1000;
1337                 atv.tv_usec = (uap->timeout % 1000) * 1000;
1338                 if (itimerfix(&atv)) {
1339                         error = EINVAL;
1340                         goto out;
1341                 }
1342                 getmicrouptime(&rtv);
1343                 timevaladd(&atv, &rtv);
1344         } else {
1345                 atv.tv_sec = 0;
1346                 atv.tv_usec = 0;
1347         }
1348
1349         /* JMM - all this P_SELECT stuff is bogus */
1350         ncoll = nselcoll;
1351         p->p_flag |= P_SELECT;
1352
1353         for (i = 0; i < nfds; i++) {
1354                 short events = fds[i].events;
1355                 struct kevent kev;
1356                 int kerror = 0;
1357
1358                 /* per spec, ignore fd values below zero */
1359                 if (fds[i].fd < 0) {
1360                         fds[i].revents = 0;
1361                         continue;
1362                 }
1363
1364                 /* convert the poll event into a kqueue kevent */
1365                 kev.ident = fds[i].fd;
1366                 kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
1367                 kev.fflags = NOTE_LOWAT;
1368                 kev.data = 1; /* efficiency be damned: any data should trigger */
1369                 kev.udata = CAST_USER_ADDR_T(&fds[i]);
1370
1371                 /* Handle input events */
1372                 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND )) {
1373                         kev.filter = EVFILT_READ;
1374                         if (!(events & ( POLLIN | POLLRDNORM )))
1375                                 kev.flags |= EV_OOBAND;
1376                         kerror = kevent_register(kq, &kev, p);
1377                 }
1378
1379                 /* Handle output events */
1380                 if (kerror == 0 &&
1381                     events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
1382                         kev.filter = EVFILT_WRITE;
1383                         kerror = kevent_register(kq, &kev, p);
1384                 }
1385
1386                 /* Handle BSD extension vnode events */
1387                 if (kerror == 0 &&
1388                     events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
1389                         kev.filter = EVFILT_VNODE;
1390                         kev.fflags = 0;
1391                         if (events & POLLEXTEND)
1392                                 kev.fflags |= NOTE_EXTEND;
1393                         if (events & POLLATTRIB)
1394                                 kev.fflags |= NOTE_ATTRIB;
1395                         if (events & POLLNLINK)
1396                                 kev.fflags |= NOTE_LINK;
1397                         if (events & POLLWRITE)
1398                                 kev.fflags |= NOTE_WRITE;
1399                         kerror = kevent_register(kq, &kev, p);
1400                 }
1401
1402                 if (kerror != 0) {
1403                         fds[i].revents = POLLNVAL;
1404                         rfds++;
1405                 } else
1406                         fds[i].revents = 0;
1407         }
1408
1409         /* Did we have any trouble registering? */
1410         if (rfds > 0)
1411                 goto done;
1412
1413         /* scan for, and possibly wait for, the kevents to trigger */
1414         cont->pca_fds = uap->fds;
1415         cont->pca_nfds = nfds;
1416         cont->pca_rfds = rfds;
1417         error = kevent_scan(kq, poll_callback, NULL, cont, &atv, p);
1418         rfds = cont->pca_rfds;
1419
1420  done:
1421         p->p_flag &= ~P_SELECT;
1422         /* poll is not restarted after signals... */
1423         if (error == ERESTART)
1424                 error = EINTR;
1425         if (error == EWOULDBLOCK)
1426                 error = 0;
1427         if (error == 0) {
1428                 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1429                 *retval = rfds;
1430         }
1431
1432  out:
1433         if (NULL != cont)
1434                 FREE(cont, M_TEMP);
1435
1436         kqueue_dealloc(kq, p);
1437         return (error);
1438 }
1439
1440 static int
1441 poll_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data)
1442 {
1443         struct poll_continue_args *cont = (struct poll_continue_args *)data;
1444         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1445         short mask;
1446
1447         /* convert the results back into revents */
1448         if (kevp->flags & EV_EOF)
1449                 fds->revents |= POLLHUP;
1450         if (kevp->flags & EV_ERROR)
1451                 fds->revents |= POLLERR;
1452         cont->pca_rfds++;
1453
1454         switch (kevp->filter) {
1455         case EVFILT_READ:
1456                 if (fds->revents & POLLHUP)
1457                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1458                 else {
1459                         mask = 0;
1460                         if (kevp->data != 0)
1461                                 mask |= (POLLIN | POLLRDNORM );
1462                         if (kevp->flags & EV_OOBAND)
1463                                 mask |= ( POLLPRI | POLLRDBAND );
1464                 }
1465                 fds->revents |= (fds->events & mask);
1466                 break;
1467
1468         case EVFILT_WRITE:
1469                 if (!(fds->revents & POLLHUP))
1470                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1471                 break;
1472
1473         case EVFILT_PROC:
1474                 if (kevp->fflags & NOTE_EXTEND)
1475                         fds->revents |= (fds->events & POLLEXTEND);
1476                 if (kevp->fflags & NOTE_ATTRIB)
1477                         fds->revents |= (fds->events & POLLATTRIB);
1478                 if (kevp->fflags & NOTE_LINK)
1479                         fds->revents |= (fds->events & POLLNLINK);
1480                 if (kevp->fflags & NOTE_WRITE)
1481                         fds->revents |= (fds->events & POLLWRITE);
1482                 break;
1483         }
1484         return 0;
1485 }
1486
/*
 * Select routine that ignores all of its arguments and always answers 1.
 */
int
seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
{
	return 1;
}
1493
1494 static int
1495 selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits,
1496                  int nfd, int *count)
1497 {
1498         register struct filedesc *fdp = p->p_fd;
1499         register int msk, i, j, fd;
1500         register u_int32_t bits;
1501         struct fileproc *fp;
1502         int n = 0;
1503         u_int32_t *iptr;
1504         u_int nw;
1505         int error=0;
1506         int dropcount;
1507
1508         /*
1509          * Problems when reboot; due to MacOSX signal probs
1510          * in Beaker1C ; verify that the p->p_fd is valid
1511          */
1512         if (fdp == NULL) {
1513                 *count=0;
1514                 return(EIO);
1515         }
1516         nw = howmany(nfd, NFDBITS);
1517
1518         proc_fdlock(p);
1519         for (msk = 0; msk < 3; msk++) {
1520                 iptr = (u_int32_t *)&ibits[msk * nw];
1521                 for (i = 0; i < nfd; i += NFDBITS) {
1522                         bits = iptr[i/NFDBITS];
1523                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1524                                 bits &= ~(1 << j);
1525                                 fp = fdp->fd_ofiles[fd];
1526                                 if (fp == NULL ||
1527                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1528                                                 *count=0;
1529                                                 error = EBADF;
1530                                                 goto bad;
1531                                 }
1532                                 fp->f_iocount++;
1533                                 n++;
1534                         }
1535                 }
1536         }
1537         proc_fdunlock(p);
1538
1539         *count = n;
1540         return (0);
1541 bad:
1542         dropcount = 0;
1543
1544         if (n== 0)
1545                 goto out;
1546         /* undo the iocounts */
1547         for (msk = 0; msk < 3; msk++) {
1548                 iptr = (u_int32_t *)&ibits[msk * nw];
1549                 for (i = 0; i < nfd; i += NFDBITS) {
1550                         bits = iptr[i/NFDBITS];
1551                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1552                                 bits &= ~(1 << j);
1553                                 fp = fdp->fd_ofiles[fd];
1554                                 if (dropcount >= n)
1555                                         goto out;
1556                                 fp->f_iocount--;
1557
1558                                 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1559                                         p->p_fpdrainwait = 0;
1560                                         wakeup(&p->p_fpdrainwait);
1561                                 }
1562                                 dropcount++;
1563                         }
1564                 }
1565         }
1566 out:
1567         proc_fdunlock(p);
1568         return(error);
1569 }
1570
1571 static int
1572 seldrop(p, ibits, nfd)
1573         struct proc *p;
1574         u_int32_t *ibits;
1575         int nfd;
1576 {
1577         register struct filedesc *fdp = p->p_fd;
1578         register int msk, i, j, fd;
1579         register u_int32_t bits;
1580         struct fileproc *fp;
1581         int n = 0;
1582         u_int32_t *iptr;
1583         u_int nw;
1584
1585         /*
1586          * Problems when reboot; due to MacOSX signal probs
1587          * in Beaker1C ; verify that the p->p_fd is valid
1588          */
1589         if (fdp == NULL) {
1590                 return(EIO);
1591         }
1592
1593         nw = howmany(nfd, NFDBITS);
1594
1595
1596         proc_fdlock(p);
1597         for (msk = 0; msk < 3; msk++) {
1598                 iptr = (u_int32_t *)&ibits[msk * nw];
1599                 for (i = 0; i < nfd; i += NFDBITS) {
1600                         bits = iptr[i/NFDBITS];
1601                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1602                                 bits &= ~(1 << j);
1603                                 fp = fdp->fd_ofiles[fd];
1604                                 if (fp == NULL
1605 #if 0
1606                         /* if you are here then it is being closed */
1607                                         || (fdp->fd_ofileflags[fd] & UF_RESERVED)
1608 #endif
1609                                         ) {
1610                                                 proc_fdunlock(p);
1611                                                 return(EBADF);
1612                                 }
1613                                 n++;
1614                                 fp->f_iocount--;
1615                                 fp->f_flags &= ~FP_INSELECT;
1616
1617                                 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1618                                         p->p_fpdrainwait = 0;
1619                                         wakeup(&p->p_fpdrainwait);
1620                                 }
1621                         }
1622                 }
1623         }
1624         proc_fdunlock(p);
1625         return (0);
1626 }
1627
1628 /*
1629  * Record a select request.
1630  */
1631 void
1632 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
1633 {
1634         thread_t        cur_act = current_thread();
1635         struct uthread * ut = get_bsdthread_info(cur_act);
1636
1637         /* need to look at collisions */
1638
1639         if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) {
1640                 return;
1641         }
1642
1643         /*do not record if this is second pass of select */
1644         if((p_wql == (void *)0)) {
1645                 return;
1646         }
1647
1648         if ((sip->si_flags & SI_INITED) == 0) {
1649                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
1650                 sip->si_flags |= SI_INITED;
1651                 sip->si_flags &= ~SI_CLEAR;
1652         }
1653
1654         if (sip->si_flags & SI_RECORDED) {
1655                 sip->si_flags |= SI_COLL;
1656         } else
1657                 sip->si_flags &= ~SI_COLL;
1658
1659         sip->si_flags |= SI_RECORDED;
1660         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_select.wqset))
1661                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_select.wqset,
1662                                         (wait_queue_link_t)p_wql);
1663
1664         return;
1665 }
1666
1667 void
1668 selwakeup(sip)
1669         register struct selinfo *sip;
1670 {
1671
1672         if ((sip->si_flags & SI_INITED) == 0) {
1673                 return;
1674         }
1675
1676         if (sip->si_flags & SI_COLL) {
1677                 nselcoll++;
1678                 sip->si_flags &= ~SI_COLL;
1679 #if 0
1680                 /* will not  support */
1681                 //wakeup((caddr_t)&selwait);
1682 #endif
1683         }
1684
1685         if (sip->si_flags & SI_RECORDED) {
1686                 wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED);
1687                 sip->si_flags &= ~SI_RECORDED;
1688         }
1689
1690 }
1691
1692 void
1693 selthreadclear(sip)
1694         register struct selinfo *sip;
1695 {
1696
1697         if ((sip->si_flags & SI_INITED) == 0) {
1698                 return;
1699         }
1700         if (sip->si_flags & SI_RECORDED) {
1701                         selwakeup(sip);
1702                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
1703         }
1704         sip->si_flags |= SI_CLEAR;
1705         wait_queue_unlinkall_nofree(&sip->si_wait_queue);
1706 }
1707
1708
1709
1710
/*
 * Trace codes for the watchevent/waitevent/modwatch machinery.
 * DBG_EVENT is the first argument handed to MISCDBG_CODE(); each
 * operation below supplies the second argument.
 */
#define DBG_EVENT		0x10

/* postevent() / postpipeevent() */
#define DBG_POST		0x10
#define DBG_MISC_POST		MISCDBG_CODE(DBG_EVENT,DBG_POST)

/* watchevent() */
#define DBG_WATCH		0x11
#define DBG_MISC_WATCH		MISCDBG_CODE(DBG_EVENT,DBG_WATCH)

/* waitevent() */
#define DBG_WAIT		0x12
#define DBG_MISC_WAIT		MISCDBG_CODE(DBG_EVENT,DBG_WAIT)

/* modwatch() */
#define DBG_MOD			0x13
#define DBG_MISC_MOD		MISCDBG_CODE(DBG_EVENT,DBG_MOD)

/* not referenced in this part of the file */
#define DBG_EWAKEUP		0x14
#define DBG_MISC_EWAKEUP	MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)

/* evprocenque() */
#define DBG_ENQUEUE		0x15
#define DBG_MISC_ENQUEUE	MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)

/* not referenced in this part of the file */
#define DBG_DEQUEUE		0x16
#define DBG_MISC_DEQUEUE	MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
1728
1729
/*
 * EVPROCDEQUE(p, evq): under the proc lock of 'p', take 'evq' off p's
 * p_evlist if it is currently marked EV_QUEUED, and clear that mark.
 *
 * Expands to a single statement (no trailing semicolon is supplied here,
 * so the caller writes "EVPROCDEQUE(p, evq);" and it composes correctly
 * with unbraced if/else).  Both arguments are evaluated more than once;
 * do not pass expressions with side effects.
 */
#define EVPROCDEQUE(p, evq)	do {					\
	proc_lock(p);							\
	if ((evq)->ee_flags & EV_QUEUED) {				\
		TAILQ_REMOVE(&(p)->p_evlist, (evq), ee_plist);		\
		(evq)->ee_flags &= ~EV_QUEUED;				\
	}								\
	proc_unlock(p);							\
} while (0)
1738
1739
1740 /*
1741  * called upon socket close. deque and free all events for
1742  * the socket...  socket must be locked by caller.
1743  */
1744 void
1745 evsofree(struct socket *sp)
1746 {
1747         struct eventqelt *evq, *next;
1748         proc_t  p;
1749
1750         if (sp == NULL)
1751                 return;
1752
1753         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
1754                 next = evq->ee_slist.tqe_next;
1755                 p = evq->ee_proc;
1756
1757                 if (evq->ee_flags & EV_QUEUED) {
1758                         EVPROCDEQUE(p, evq);
1759                 }
1760                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
1761                 FREE(evq, M_TEMP);
1762         }
1763 }
1764
1765
1766 /*
1767  * called upon pipe close. deque and free all events for
1768  * the pipe... pipe must be locked by caller
1769  */
1770 void
1771 evpipefree(struct pipe *cpipe)
1772 {
1773         struct eventqelt *evq, *next;
1774         proc_t  p;
1775
1776         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
1777                 next = evq->ee_slist.tqe_next;
1778                 p = evq->ee_proc;
1779
1780                 EVPROCDEQUE(p, evq);
1781
1782                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
1783                 FREE(evq, M_TEMP);
1784         }
1785 }
1786
1787
1788 /*
1789  * enqueue this event if it's not already queued. wakeup
1790  * the proc if we do queue this event to it...
1791  * entered with proc lock held... we drop it before
1792  * doing the wakeup and return in that state
1793  */
1794 static void
1795 evprocenque(struct eventqelt *evq)
1796 {
1797         proc_t  p;
1798
1799         assert(evq);
1800         p = evq->ee_proc;
1801
1802         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, evq, evq->ee_flags, evq->ee_eventmask,0,0);
1803
1804         proc_lock(p);
1805
1806         if (evq->ee_flags & EV_QUEUED) {
1807                 proc_unlock(p);
1808
1809                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1810                 return;
1811         }
1812         evq->ee_flags |= EV_QUEUED;
1813
1814         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
1815
1816         proc_unlock(p);
1817
1818         wakeup(&p->p_evlist);
1819
1820         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1821 }
1822
1823
1824 /*
1825  * pipe lock must be taken by the caller
1826  */
1827 void
1828 postpipeevent(struct pipe *pipep, int event)
1829 {
1830         int     mask;
1831         struct eventqelt *evq;
1832
1833         if (pipep == NULL)
1834                 return;
1835         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
1836
1837         for (evq = pipep->pipe_evlist.tqh_first;
1838              evq != NULL; evq = evq->ee_slist.tqe_next) {
1839
1840                 if (evq->ee_eventmask == 0)
1841                         continue;
1842                 mask = 0;
1843
1844                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
1845
1846                 case EV_RWBYTES:
1847                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
1848                           mask |= EV_RE;
1849                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
1850                   }
1851                   if ((evq->ee_eventmask & EV_WR) &&
1852                       (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
1853
1854                           if (pipep->pipe_state & PIPE_EOF) {
1855                                   mask |= EV_WR|EV_RESET;
1856                                   break;
1857                           }
1858                           mask |= EV_WR;
1859                           evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt;
1860                   }
1861                   break;
1862
1863                 case EV_WCLOSED:
1864                 case EV_RCLOSED:
1865                   if ((evq->ee_eventmask & EV_RE)) {
1866                           mask |= EV_RE|EV_RCLOSED;
1867                   }
1868                   if ((evq->ee_eventmask & EV_WR)) {
1869                           mask |= EV_WR|EV_WCLOSED;
1870                   }
1871                   break;
1872
1873                 default:
1874                   return;
1875                 }
1876                 if (mask) {
1877                         /*
1878                          * disarm... postevents are nops until this event is 'read' via
1879                          * waitevent and then re-armed via modwatch
1880                          */
1881                         evq->ee_eventmask = 0;
1882
1883                         /*
1884                          * since events are disarmed until after the waitevent
1885                          * the ee_req.er_xxxx fields can't change once we've
1886                          * inserted this event into the proc queue...
1887                          * therefore, the waitevent will see a 'consistent'
1888                          * snapshot of the event, even though it won't hold
1889                          * the pipe lock, and we're updating the event outside
1890                          * of the proc lock, which it will hold
1891                          */
1892                         evq->ee_req.er_eventbits |= mask;
1893
1894                         KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask, 1,0);
1895
1896                         evprocenque(evq);
1897                 }
1898         }
1899         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
1900 }
1901
1902
/*
 * Tell whether a pending socket error means the connection is gone:
 * a SOCK_STREAM socket whose so_error is ECONNREFUSED or ECONNRESET and
 * whose so_pcb is absent, is in INPCB_STATE_DEAD, yields no tcpcb, or
 * yields a tcpcb in TCPS_CLOSED.  Returns non-zero in that case.
 */
static int
evso_conn_is_dead(struct socket *sp)
{
	struct tcpcb *tp;

	if (sp->so_error == 0)
		return (0);
	if (sp->so_type != SOCK_STREAM)
		return (0);
	if (sp->so_error != ECONNREFUSED && sp->so_error != ECONNRESET)
		return (0);

	if (sp->so_pcb == 0)
		return (1);
	if (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD)
		return (1);

	tp = sototcpcb(sp);
	return (tp == NULL || tp->t_state == TCPS_CLOSED);
}

/*
 * postevent: walk a socket's event list and queue, to the owning proc,
 * every armed event that 'event' satisfies.
 *
 * Either a socket or a sockbuf may be supplied; when 'sb' is non-NULL the
 * socket is taken from sb->sb_so.  With no socket the call is a no-op.
 * An 'event' whose EV_DMASK bits match none of the cases below ends the
 * walk immediately.  The socket must be locked by the caller.
 */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
	int	mask;
	struct	eventqelt *evq;

	if (sb)
		sp = sb->sb_so;
	if (sp == NULL)
		return;

	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

	evq = sp->so_evlist.tqh_first;

	for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {

		/* disarmed events are skipped until modwatch re-arms them */
		if (evq->ee_eventmask == 0)
			continue;
		mask = 0;

		/*
		 * readable when:
		 *   - byte cnt >= receive low water mark
		 *   - read-half of conn closed
		 *   - conn pending for listening sock
		 *   - socket error pending
		 *
		 * writable when:
		 *   - byte cnt avail >= send low water mark
		 *   - write half of conn closed
		 *   - socket error pending
		 *   - non-blocking conn completed successfully
		 *
		 * exception pending when:
		 *   - out of band data
		 *   - sock at out of band mark
		 */
		switch (event & EV_DMASK) {

		case EV_OOB:
			if ((evq->ee_eventmask & EV_EX)) {
				if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
					mask |= EV_EX|EV_OOB;
			}
			break;

		case EV_RWBYTES|EV_OOB:
			if ((evq->ee_eventmask & EV_EX)) {
				if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
					mask |= EV_EX|EV_OOB;
			}
			/* FALLTHROUGH: also evaluate the read/write conditions */
		case EV_RWBYTES:
			if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
				if (evso_conn_is_dead(sp)) {
					mask |= EV_RE|EV_RESET;
					break;
				}
				mask |= EV_RE;
				evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;

				if (sp->so_state & SS_CANTRCVMORE) {
					mask |= EV_FIN;
					break;
				}
			}
			if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
				if (evso_conn_is_dead(sp)) {
					mask |= EV_WR|EV_RESET;
					break;
				}
				mask |= EV_WR;
				evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
			}
			break;

		case EV_RCONN:
			if ((evq->ee_eventmask & EV_RE)) {
				mask |= EV_RE|EV_RCONN;
				/* count includes the connection being reported */
				evq->ee_req.er_rcnt = sp->so_qlen + 1;
			}
			break;

		case EV_WCONN:
			if ((evq->ee_eventmask & EV_WR))
				mask |= EV_WR|EV_WCONN;
			break;

		case EV_RCLOSED:
			if ((evq->ee_eventmask & EV_RE))
				mask |= EV_RE|EV_RCLOSED;
			break;

		case EV_WCLOSED:
			if ((evq->ee_eventmask & EV_WR))
				mask |= EV_WR|EV_WCLOSED;
			break;

		case EV_FIN:
			if (evq->ee_eventmask & EV_RE)
				mask |= EV_RE|EV_FIN;
			break;

		case EV_RESET:
		case EV_TIMEOUT:
			if (evq->ee_eventmask & EV_RE)
				mask |= EV_RE | event;
			if (evq->ee_eventmask & EV_WR)
				mask |= EV_WR | event;
			break;

		default:
			KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
			return;
		} /* switch */

		KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);

		if (mask == 0)
			continue;

		/*
		 * disarm... postevents are nops until this event is 'read' via
		 * waitevent and then re-armed via modwatch
		 */
		evq->ee_eventmask = 0;

		/*
		 * since events are disarmed until after the waitevent
		 * the ee_req.er_xxxx fields can't change once we've
		 * inserted this event into the proc queue...
		 * since waitevent can't see this event until we
		 * enqueue it, waitevent will see a 'consistent'
		 * snapshot of the event, even though it won't hold
		 * the socket lock, and we're updating the event outside
		 * of the proc lock, which it will hold
		 */
		evq->ee_req.er_eventbits |= mask;

		evprocenque(evq);
	}
	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}
2069
2070
2071 /*
2072  * watchevent system call. user passes us an event to watch
2073  * for. we malloc an event object, initialize it, and queue
2074  * it to the open socket. when the event occurs, postevent()
2075  * will enque it back to our proc where we can retrieve it
2076  * via waitevent().
2077  *
2078  * should this prevent duplicate events on same socket?
2079  */
2080 int
2081 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2082 {
2083         struct eventqelt *evq = (struct eventqelt *)0;
2084         struct eventqelt *np = NULL;
2085         struct eventreq *erp;
2086         struct fileproc *fp = NULL;
2087         int error;
2088
2089         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2090
2091         // get a qelt and fill with users req
2092         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2093
2094         if (evq == NULL)
2095                 panic("can't MALLOC evq");
2096         erp = &evq->ee_req;
2097
2098         // get users request pkt
2099         if ( (error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
2100                            sizeof(struct eventreq))) ) {
2101                 FREE(evq, M_TEMP);
2102
2103                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2104                 return(error);
2105         }
2106         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,evq,0,0);
2107
2108         // validate, freeing qelt if errors
2109         error = 0;
2110         proc_fdlock(p);
2111
2112         if (erp->er_type != EV_FD) {
2113                 error = EINVAL;
2114         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2115                 error = EBADF;
2116         } else if (fp->f_type == DTYPE_SOCKET) {
2117                 socket_lock((struct socket *)fp->f_data, 1);
2118                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2119         } else if (fp->f_type == DTYPE_PIPE) {
2120                 PIPE_LOCK((struct pipe *)fp->f_data);
2121                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2122         } else {
2123                 fp_drop(p, erp->er_handle, fp, 1);
2124                 error = EINVAL;
2125         }
2126         proc_fdunlock(p);
2127
2128         if (error) {
2129                 FREE(evq, M_TEMP);
2130
2131                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2132                 return(error);
2133         }
2134
2135         /*
2136          * only allow one watch per file per proc
2137          */
2138         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2139                 if (np->ee_proc == p) {
2140                         if (fp->f_type == DTYPE_SOCKET)
2141                                 socket_unlock((struct socket *)fp->f_data, 1);
2142                         else
2143                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2144                         fp_drop(p, erp->er_handle, fp, 0);
2145                         FREE(evq, M_TEMP);
2146
2147                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2148                         return(EINVAL);
2149                 }
2150         }
2151         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2152         evq->ee_proc = p;
2153         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2154         evq->ee_flags = 0;
2155
2156         if (fp->f_type == DTYPE_SOCKET) {
2157                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2158                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2159
2160                 socket_unlock((struct socket *)fp->f_data, 1);
2161         } else {
2162                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2163                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2164
2165                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2166         }
2167         fp_drop_event(p, erp->er_handle, fp);
2168
2169         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2170         return(0);
2171 }
2172
2173
2174
2175 /*
2176  * waitevent system call.
2177  * grabs the next waiting event for this proc and returns
2178  * it. if no events, user can request to sleep with timeout
2179  * or poll mode (tv=NULL);
2180  */
2181 int
2182 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2183 {
2184         int error = 0;
2185         struct eventqelt *evq;
2186         struct eventreq   er;
2187         uint64_t abstime, interval;
2188
2189         if (uap->tv) {
2190                 struct timeval atv;
2191
2192                 error = copyin(CAST_USER_ADDR_T(uap->tv), (caddr_t)&atv, sizeof (atv));
2193                 if (error)
2194                         return(error);
2195                 if (itimerfix(&atv)) {
2196                         error = EINVAL;
2197                         return(error);
2198                 }
2199                 interval = tvtoabstime(&atv);
2200         } else
2201                 interval = 0;
2202
2203         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2204
2205         proc_lock(p);
2206 retry:
2207         if ((evq = p->p_evlist.tqh_first) != NULL) {
2208                 /*
2209                  * found one... make a local copy while it's still on the queue
2210                  * to prevent it from changing while in the midst of copying
2211                  * don't want to hold the proc lock across a copyout because
2212                  * it might block on a page fault at the target in user space
2213                  */
2214                 bcopy((caddr_t)&evq->ee_req, (caddr_t)&er, sizeof (struct eventreq));
2215
2216                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2217
2218                 evq->ee_flags &= ~EV_QUEUED;
2219
2220                 proc_unlock(p);
2221
2222                 error = copyout((caddr_t)&er, CAST_USER_ADDR_T(uap->u_req), sizeof(struct eventreq));
2223
2224                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2225                              evq->ee_req.er_handle,evq->ee_req.er_eventbits,evq,0);
2226                 return (error);
2227         }
2228         else {
2229                 if (uap->tv && interval == 0) {
2230                         proc_unlock(p);
2231                         *retval = 1;  // poll failed
2232
2233                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2234                         return (error);
2235                 }
2236                 if (interval != 0)
2237                         clock_absolutetime_interval_to_deadline(interval, &abstime);
2238                 else
2239                         abstime = 0;
2240
2241                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0);
2242
2243                 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2244
2245                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,&p->p_evlist,0,0,0);
2246
2247                 if (error == 0)
2248                         goto retry;
2249                 if (error == ERESTART)
2250                         error = EINTR;
2251                 if (error == EWOULDBLOCK) {
2252                         *retval = 1;
2253                         error = 0;
2254                 }
2255         }
2256         proc_unlock(p);
2257
2258         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2259         return (error);
2260 }
2261
2262
2263 /*
2264  * modwatch system call. user passes in event to modify.
2265  * if we find it we reset the event bits and que/deque event
2266  * it needed.
2267  */
2268 int
2269 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2270 {
2271         struct eventreq er;
2272         struct eventreq *erp = &er;
2273         struct eventqelt *evq;
2274         int error;
2275         struct fileproc *fp;
2276         int flag;
2277
2278         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2279
2280         /*
2281          * get user's request pkt
2282          */
2283         if ((error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
2284                              sizeof(struct eventreq)))) {
2285                         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2286                 return(error);
2287         }
2288         proc_fdlock(p);
2289
2290         if (erp->er_type != EV_FD) {
2291                 error = EINVAL;
2292         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2293                 error = EBADF;
2294         } else if (fp->f_type == DTYPE_SOCKET) {
2295                 socket_lock((struct socket *)fp->f_data, 1);
2296                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2297         } else if (fp->f_type == DTYPE_PIPE) {
2298                 PIPE_LOCK((struct pipe *)fp->f_data);
2299                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2300         } else {
2301                 fp_drop(p, erp->er_handle, fp, 1);
2302                 error = EINVAL;
2303         }
2304
2305         if (error) {
2306                 proc_fdunlock(p);
2307                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2308                 return(error);
2309         }
2310
2311         if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2312                 fp->f_flags &= ~FP_WAITEVENT;
2313         }
2314         proc_fdunlock(p);
2315
2316         // locate event if possible
2317         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2318                 if (evq->ee_proc == p)
2319                         break;
2320         }
2321         if (evq == NULL) {
2322                 if (fp->f_type == DTYPE_SOCKET)
2323                         socket_unlock((struct socket *)fp->f_data, 1);
2324                 else
2325                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2326                 fp_drop(p, erp->er_handle, fp, 0);
2327                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2328                 return(EINVAL);
2329         }
2330         KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0);
2331
2332         if (uap->u_eventmask == EV_RM) {
2333                 EVPROCDEQUE(p, evq);
2334
2335                 if (fp->f_type == DTYPE_SOCKET) {
2336                         TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2337                         socket_unlock((struct socket *)fp->f_data, 1);
2338                 } else {
2339                         TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2340                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2341                 }
2342                 fp_drop(p, erp->er_handle, fp, 0);
2343                 FREE(evq, M_TEMP);
2344                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2345                 return(0);
2346         }
2347         switch (uap->u_eventmask & EV_MASK) {
2348
2349         case 0:
2350                 flag = 0;
2351                 break;
2352
2353         case EV_RE:
2354         case EV_WR:
2355         case EV_RE|EV_WR:
2356                 flag = EV_RWBYTES;
2357                 break;
2358
2359         case EV_EX:
2360                 flag = EV_OOB;
2361                 break;
2362
2363         case EV_EX|EV_RE:
2364         case EV_EX|EV_WR:
2365         case EV_EX|EV_RE|EV_WR:
2366                 flag = EV_OOB|EV_RWBYTES;
2367                 break;
2368
2369         default:
2370                 if (fp->f_type == DTYPE_SOCKET)
2371                         socket_unlock((struct socket *)fp->f_data, 1);
2372                 else
2373                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2374                 fp_drop(p, erp->er_handle, fp, 0);
2375                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2376                 return(EINVAL);
2377         }
2378         /*
2379          * since we're holding the socket/pipe lock, the event
2380          * cannot go from the unqueued state to the queued state
2381          * however, it can go from the queued state to the unqueued state
2382          * since that direction is protected by the proc_lock...
2383          * so do a quick check for EV_QUEUED w/o holding the proc lock
2384          * since by far the common case will be NOT EV_QUEUED, this saves
2385          * us taking the proc_lock the majority of the time
2386          */
2387         if (evq->ee_flags & EV_QUEUED) {
2388                 /*
2389                  * EVPROCDEQUE will recheck the state after it grabs the proc_lock
2390                  */
2391                 EVPROCDEQUE(p, evq);
2392         }
2393         /*
2394          * while the event is off the proc queue and
2395          * we're holding the socket/pipe lock
2396          * it's safe to update these fields...
2397          */
2398         evq->ee_req.er_eventbits = 0;
2399         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2400
2401         if (fp->f_type == DTYPE_SOCKET) {
2402                 postevent((struct socket *)fp->f_data, 0, flag);
2403                 socket_unlock((struct socket *)fp->f_data, 1);
2404         }
2405         else {
2406                 postpipeevent((struct pipe *)fp->f_data, flag);
2407                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2408         }
2409         fp_drop(p, erp->er_handle, fp, 0);
2410         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,fp->f_data,flag,0);
2411         return(0);
2412 }
2413
2414 /* this routine is called from the close of fd with proc_fdlock held */
2415 int
2416 waitevent_close(struct proc *p, struct fileproc *fp)
2417 {
2418         struct eventqelt *evq;
2419
2420
2421         fp->f_flags &= ~FP_WAITEVENT;
2422
2423         if (fp->f_type == DTYPE_SOCKET) {
2424                 socket_lock((struct socket *)fp->f_data, 1);
2425                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2426         }
2427         else if (fp->f_type == DTYPE_PIPE) {
2428                 PIPE_LOCK((struct pipe *)fp->f_data);
2429                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2430         }
2431         else {
2432                 return(EINVAL);
2433         }
2434         proc_fdunlock(p);
2435
2436
2437         // locate event if possible
2438         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2439                 if (evq->ee_proc == p)
2440                         break;
2441         }
2442         if (evq == NULL) {
2443                 if (fp->f_type == DTYPE_SOCKET)
2444                         socket_unlock((struct socket *)fp->f_data, 1);
2445                 else
2446                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2447
2448                 proc_fdlock(p);
2449
2450                 return(EINVAL);
2451         }
2452         EVPROCDEQUE(p, evq);
2453
2454         if (fp->f_type == DTYPE_SOCKET) {
2455                 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2456                 socket_unlock((struct socket *)fp->f_data, 1);
2457         } else {
2458                 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2459                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2460         }
2461         FREE(evq, M_TEMP);
2462
2463         proc_fdlock(p);
2464
2465         return(0);
2466 }
2467