bsd/kern/sys_generic.c

   1 /*
   2  * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1989, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  * (c) UNIX System Laboratories, Inc.
  33  * All or some portions of this file are derived from material licensed
  34  * to the University of California by American Telephone and Telegraph
  35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  36  * the permission of UNIX System Laboratories, Inc.
  37  *
  38  * Redistribution and use in source and binary forms, with or without
  39  * modification, are permitted provided that the following conditions
  40  * are met:
  41  * 1. Redistributions of source code must retain the above copyright
  42  *    notice, this list of conditions and the following disclaimer.
  43  * 2. Redistributions in binary form must reproduce the above copyright
  44  *    notice, this list of conditions and the following disclaimer in the
  45  *    documentation and/or other materials provided with the distribution.
  46  * 3. All advertising materials mentioning features or use of this software
  47  *    must display the following acknowledgement:
  48  *      This product includes software developed by the University of
  49  *      California, Berkeley and its contributors.
  50  * 4. Neither the name of the University nor the names of its contributors
  51  *    may be used to endorse or promote products derived from this software
  52  *    without specific prior written permission.
  53  *
  54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  64  * SUCH DAMAGE.
  65  *
  66  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
  67  */
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/filedesc.h>
  72 #include <sys/ioctl.h>
  73 #include <sys/file_internal.h>
  74 #include <sys/proc_internal.h>
  75 #include <sys/socketvar.h>
  76 #if KTRACE
  77 #include <sys/uio_internal.h>
  78 #else
  79 #include <sys/uio.h>
  80 #endif
  81 #include <sys/kernel.h>
  82 #include <sys/stat.h>
  83 #include <sys/malloc.h>
  84 #include <sys/sysproto.h>
  85
  86 #include <sys/mount_internal.h>
  87 #include <sys/protosw.h>
  88 #include <sys/ev.h>
  89 #include <sys/user.h>
  90 #include <sys/kdebug.h>
  91 #include <sys/poll.h>
  92 #include <sys/event.h>
  93 #include <sys/eventvar.h>
  94
  95 #include <mach/mach_types.h>
  96 #include <kern/kern_types.h>
  97 #include <kern/assert.h>
  98 #include <kern/kalloc.h>
  99 #include <kern/thread.h>
 100 #include <kern/clock.h>
 101
 102 #include <sys/mbuf.h>
 103 #include <sys/socket.h>
 104 #include <sys/socketvar.h>
 105 #include <sys/errno.h>
 106 #include <sys/syscall.h>
 107 #include <sys/pipe.h>
 108
 109 #include <bsm/audit_kernel.h>
 110
 111 #include <net/if.h>
 112 #include <net/route.h>
 113
 114 #include <netinet/in.h>
 115 #include <netinet/in_systm.h>
 116 #include <netinet/ip.h>
 117 #include <netinet/in_pcb.h>
 118 #include <netinet/ip_var.h>
 119 #include <netinet/ip6.h>
 120 #include <netinet/tcp.h>
 121 #include <netinet/tcp_fsm.h>
 122 #include <netinet/tcp_seq.h>
 123 #include <netinet/tcp_timer.h>
 124 #include <netinet/tcp_var.h>
 125 #include <netinet/tcpip.h>
 126 #include <netinet/tcp_debug.h>
 127 /* for wait queue based select */
 128 #include <kern/wait_queue.h>
 129 #include <kern/kalloc.h>
 130 #if KTRACE
 131 #include <sys/ktrace.h>
 132 #endif
 133 #include <sys/vnode_internal.h>
 134
 135 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 136 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
 137 extern void     *get_bsduthreadarg(thread_t);
 138 extern int      *get_bsduthreadrval(thread_t);
 139
 140 __private_extern__ int  dofileread(struct proc *p, struct fileproc *fp, int fd,
 141                                                                    user_addr_t bufp, user_size_t nbyte,
 142                                                                    off_t offset, int flags, user_ssize_t *retval);
 143 __private_extern__ int  dofilewrite(struct proc *p, struct fileproc *fp, int fd,
 144                                                                         user_addr_t bufp, user_size_t nbyte,
 145                                                                         off_t offset, int flags, user_ssize_t *retval);
 146 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
 147 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
 148
 149 #if NETAT
 150 extern int appletalk_inited;
 151 #endif /* NETAT */
 152
 153 #define f_flag f_fglob->fg_flag
 154 #define f_type f_fglob->fg_type
 155 #define f_msgcount f_fglob->fg_msgcount
 156 #define f_cred f_fglob->fg_cred
 157 #define f_ops f_fglob->fg_ops
 158 #define f_offset f_fglob->fg_offset
 159 #define f_data f_fglob->fg_data
 160 /*
 161  * Read system call.
 162  */
 163 int
 164 read(p, uap, retval)
 165         struct proc *p;
 166         register struct read_args *uap;
 167         user_ssize_t *retval;
 168 {
 169         struct fileproc *fp;
 170         int error;
 171         int fd = uap->fd;
 172
 173         if ( (error = preparefileread(p, &fp, fd, 0)) )
 174                 return (error);
 175
 176         error = dofileread(p, fp, uap->fd, uap->cbuf, uap->nbyte,
 177                            (off_t)-1, 0, retval);
 178
 179         donefileread(p, fp, fd);
 180
 181         return (error);
 182 }
 183
 184 /*
 185  * Pread system call
 186  */
 187 int
 188 pread(p, uap, retval)
 189         struct proc *p;
 190         register struct pread_args *uap;
 191         user_ssize_t *retval;
 192 {
 193         struct fileproc *fp;
 194         int fd = uap->fd;
 195         int error;
 196
 197         if ( (error = preparefileread(p, &fp, fd, 1)) )
 198                 return (error);
 199
 200         error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
 201                         uap->offset, FOF_OFFSET, retval);
 202
 203         donefileread(p, fp, fd);
 204
 205         if (!error)
 206             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
 207               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 208
 209         return (error);
 210 }
 211
 212 /*
 213  * Code common for read and pread
 214  */
 215
 216 void
 217 donefileread(struct proc *p, struct fileproc *fp, int fd)
 218 {
 219         proc_fdlock(p);
 220
 221         fp->f_flags &= ~FP_INCHRREAD;
 222
 223         fp_drop(p, fd, fp, 1);
 224         proc_fdunlock(p);
 225 }
 226
 227 int
 228 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
 229 {
 230         vnode_t vp;
 231         int     error;
 232         struct fileproc *fp;
 233
 234         proc_fdlock(p);
 235
 236         error = fp_lookup(p, fd, &fp, 1);
 237
 238         if (error) {
 239                 proc_fdunlock(p);
 240                 return (error);
 241         }
 242         if ((fp->f_flag & FREAD) == 0) {
 243                 error = EBADF;
 244                 goto out;
 245         }
 246         if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
 247                 error = ESPIPE;
 248                 goto out;
 249         }
 250         if (fp->f_type == DTYPE_VNODE) {
 251                 vp = (struct vnode *)fp->f_fglob->fg_data;
 252
 253                 if (vp->v_type == VCHR)
 254                         fp->f_flags |= FP_INCHRREAD;
 255         }
 256
 257         *fp_ret = fp;
 258
 259         proc_fdunlock(p);
 260         return (0);
 261
 262 out:
 263         fp_drop(p, fd, fp, 1);
 264         proc_fdunlock(p);
 265         return (error);
 266 }
 267
 268
 269 __private_extern__ int
 270 dofileread(p, fp, fd, bufp, nbyte, offset, flags, retval)
 271         struct proc *p;
 272         struct fileproc *fp;
 273         int fd, flags;
 274         user_addr_t bufp;
 275         user_size_t nbyte;
 276         off_t offset;
 277         user_ssize_t *retval;
 278 {
 279         uio_t auio;
 280         user_ssize_t bytecnt;
 281         long error = 0;
 282         char uio_buf[ UIO_SIZEOF(1) ];
 283 #if KTRACE
 284         uio_t ktruio = NULL;
 285         char ktr_uio_buf[ UIO_SIZEOF(1) ];
 286         int didktr = 0;
 287 #endif
 288
 289         // LP64todo - do we want to raise this?
 290         if (nbyte > INT_MAX)
 291                 return (EINVAL);
 292
 293         if (IS_64BIT_PROCESS(p)) {
 294                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 295                                                                           &uio_buf[0], sizeof(uio_buf));
 296         } else {
 297                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 298                                                                           &uio_buf[0], sizeof(uio_buf));
 299         }
 300         uio_addiov(auio, bufp, nbyte);
 301
 302 #if KTRACE
 303         /*
 304         * if tracing, save a copy of iovec
 305         */
 306         if (KTRPOINT(p, KTR_GENIO)) {
 307                 didktr = 1;
 308
 309                 if (IS_64BIT_PROCESS(p)) {
 310                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
 311                                                                           &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 312                 } else {
 313                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
 314                                                                           &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 315                 }
 316                 uio_addiov(ktruio, bufp, nbyte);
 317         }
 318 #endif
 319         bytecnt = nbyte;
 320
 321         if ((error = fo_read(fp, auio, fp->f_cred, flags, p))) {
 322                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 323                         error == EINTR || error == EWOULDBLOCK))
 324                         error = 0;
 325         }
 326         bytecnt -= uio_resid(auio);
 327 #if KTRACE
 328         if (didktr && error == 0) {
 329                 uio_setresid(ktruio, bytecnt);
 330                 ktrgenio(p->p_tracep, fd, UIO_READ, ktruio, error);
 331         }
 332 #endif
 333
 334         *retval = bytecnt;
 335
 336         return (error);
 337 }
 338
 339 /*
 340  * Scatter read system call.
 341  */
 342 int
 343 readv(p, uap, retval)
 344         struct proc *p;
 345         register struct readv_args *uap;
 346         user_ssize_t *retval;
 347 {
 348         uio_t auio = NULL;
 349         int error;
 350         int size_of_iovec;
 351         struct user_iovec *iovp;
 352
 353         /* Verify range bedfore calling uio_create() */
 354         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 355                 return (EINVAL);
 356
 357         /* allocate a uio large enough to hold the number of iovecs passed */
 358         auio = uio_create(uap->iovcnt, 0,
 359                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 360                                   UIO_READ);
 361
 362         /* get location of iovecs within the uio.  then copyin the iovecs from
 363          * user space.
 364          */
 365         iovp = uio_iovsaddr(auio);
 366         if (iovp == NULL) {
 367                 error = ENOMEM;
 368                 goto ExitThisRoutine;
 369         }
 370         size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
 371         error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
 372         if (error) {
 373                 goto ExitThisRoutine;
 374         }
 375
 376         /* finalize uio_t for use and do the IO
 377          */
 378         uio_calculateresid(auio);
 379         error = rd_uio(p, uap->fd, auio, retval);
 380
 381 ExitThisRoutine:
 382         if (auio != NULL) {
 383                 uio_free(auio);
 384         }
 385         return (error);
 386 }
 387
 388 /*
 389  * Write system call
 390  */
 391 int
 392 write(p, uap, retval)
 393         struct proc *p;
 394         register struct write_args *uap;
 395         user_ssize_t *retval;
 396 {
 397         struct fileproc *fp;
 398         int error;
 399         int fd = uap->fd;
 400
 401         error = fp_lookup(p,fd,&fp,0);
 402         if (error)
 403                 return(error);
 404         if ((fp->f_flag & FWRITE) == 0) {
 405                 error = EBADF;
 406         } else {
 407                 error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte,
 408                         (off_t)-1, 0, retval);
 409         }
 410         if (error == 0)
 411                 fp_drop_written(p, fd, fp);
 412         else
 413                 fp_drop(p, fd, fp, 0);
 414         return(error);
 415 }
 416
 417 /*
 418  * pwrite system call
 419  */
 420 int
 421 pwrite(p, uap, retval)
 422         struct proc *p;
 423         register struct pwrite_args *uap;
 424         user_ssize_t *retval;
 425 {
 426         struct fileproc *fp;
 427         int error;
 428         int fd = uap->fd;
 429
 430         error = fp_lookup(p,fd,&fp,0);
 431         if (error)
 432                 return(error);
 433
 434         if ((fp->f_flag & FWRITE) == 0) {
 435                 error = EBADF;
 436         } else {
 437                 if (fp->f_type != DTYPE_VNODE) {
 438                         error = ESPIPE;
 439                 } else {
 440                     error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
 441                         uap->offset, FOF_OFFSET, retval);
 442                 }
 443         }
 444         if (error == 0)
 445                 fp_drop_written(p, fd, fp);
 446         else
 447                 fp_drop(p, fd, fp, 0);
 448
 449         if (!error)
 450             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
 451               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
 452
 453         return(error);
 454 }
 455
 456 __private_extern__ int
 457 dofilewrite(p, fp, fd, bufp, nbyte, offset, flags, retval)
 458         struct proc *p;
 459         struct fileproc *fp;
 460         int fd, flags;
 461         user_addr_t bufp;
 462         user_size_t nbyte;
 463         off_t offset;
 464         user_ssize_t *retval;
 465 {
 466         uio_t auio;
 467         long error = 0;
 468         user_ssize_t bytecnt;
 469         char uio_buf[ UIO_SIZEOF(1) ];
 470 #if KTRACE
 471         uio_t ktruio;
 472         int didktr = 0;
 473         char ktr_uio_buf[ UIO_SIZEOF(1) ];
 474 #endif
 475
 476         // LP64todo - do we want to raise this?
 477         if (nbyte > INT_MAX)
 478                 return (EINVAL);
 479
 480         if (IS_64BIT_PROCESS(p)) {
 481                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 482                                                                           &uio_buf[0], sizeof(uio_buf));
 483         } else {
 484                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 485                                                                           &uio_buf[0], sizeof(uio_buf));
 486         }
 487         uio_addiov(auio, bufp, nbyte);
 488
 489 #if KTRACE
 490         /*
 491         * if tracing, save a copy of iovec and uio
 492         */
 493         if (KTRPOINT(p, KTR_GENIO)) {
 494                 didktr = 1;
 495
 496                 if (IS_64BIT_PROCESS(p)) {
 497                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
 498                                                                                   &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 499                 } else {
 500                         ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
 501                                                                                   &ktr_uio_buf[0], sizeof(ktr_uio_buf));
 502                 }
 503                 uio_addiov(ktruio, bufp, nbyte);
 504         }
 505 #endif
 506         bytecnt = nbyte;
 507         if ((error = fo_write(fp, auio, fp->f_cred, flags, p))) {
 508                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
 509                         error == EINTR || error == EWOULDBLOCK))
 510                         error = 0;
 511                 /* The socket layer handles SIGPIPE */
 512                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
 513                         psignal(p, SIGPIPE);
 514         }
 515         bytecnt -= uio_resid(auio);
 516 #if KTRACE
 517         if (didktr && error == 0) {
 518                 uio_setresid(ktruio, bytecnt);
 519                 ktrgenio(p->p_tracep, fd, UIO_WRITE, ktruio, error);
 520         }
 521 #endif
 522         *retval = bytecnt;
 523
 524         return (error);
 525 }
 526
 527 /*
 528  * Gather write system call
 529  */
 530 int
 531 writev(p, uap, retval)
 532         struct proc *p;
 533         register struct writev_args *uap;
 534         user_ssize_t *retval;
 535 {
 536         uio_t auio = NULL;
 537         int error;
 538         int size_of_iovec;
 539         struct user_iovec *iovp;
 540
 541         /* Verify range bedfore calling uio_create() */
 542         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
 543                 return (EINVAL);
 544
 545         /* allocate a uio large enough to hold the number of iovecs passed */
 546         auio = uio_create(uap->iovcnt, 0,
 547                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
 548                                   UIO_WRITE);
 549
 550         /* get location of iovecs within the uio.  then copyin the iovecs from
 551          * user space.
 552          */
 553         iovp = uio_iovsaddr(auio);
 554         if (iovp == NULL) {
 555                 error = ENOMEM;
 556                 goto ExitThisRoutine;
 557         }
 558         size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
 559         error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
 560         if (error) {
 561                 goto ExitThisRoutine;
 562         }
 563
 564         /* finalize uio_t for use and do the IO
 565          */
 566         uio_calculateresid(auio);
 567         error = wr_uio(p, uap->fd, auio, retval);
 568
 569 ExitThisRoutine:
 570         if (auio != NULL) {
 571                 uio_free(auio);
 572         }
 573         return (error);
 574 }
 575
 576
 577 int
 578 wr_uio(p, fdes, uio, retval)
 579         struct proc *p;
 580         int fdes;
 581         register uio_t uio;
 582         user_ssize_t *retval;
 583 {
 584         struct fileproc *fp;
 585         int error;
 586         user_ssize_t count;
 587 #if KTRACE
 588         struct iovec_64 *ktriov = NULL;
 589         struct uio ktruio;
 590         int didktr = 0;
 591         u_int iovlen;
 592 #endif
 593
 594         error = fp_lookup(p,fdes,&fp,0);
 595         if (error)
 596                 return(error);
 597
 598         if ((fp->f_flag & FWRITE) == 0) {
 599                 error = EBADF;
 600                 goto out;
 601         }
 602         count = uio_resid(uio);
 603 #if KTRACE
 604         /*
 605          * if tracing, save a copy of iovec
 606          */
 607         if (KTRPOINT(p, KTR_GENIO)) {
 608                 iovlen = uio->uio_iovcnt *
 609                         (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
 610                 MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
 611                 if (ktriov != NULL) {
 612                         bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
 613                         ktruio = *uio;
 614                         didktr = 1;
 615                 }
 616         }
 617 #endif
 618         error = fo_write(fp, uio, fp->f_cred, 0, p);
 619         if (error) {
 620                 if (uio_resid(uio) != count && (error == ERESTART ||
 621                                                 error == EINTR || error == EWOULDBLOCK))
 622                         error = 0;
 623                 /* The socket layer handles SIGPIPE */
 624                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
 625                         psignal(p, SIGPIPE);
 626         }
 627         *retval = count - uio_resid(uio);
 628
 629 #if KTRACE
 630         if (didktr) {
 631                 if (error == 0) {
 632                         ktruio.uio_iovs.iov64p = ktriov;
 633                         uio_setresid(&ktruio, *retval);
 634                         ktrgenio(p->p_tracep, fdes, UIO_WRITE, &ktruio, error);
 635                 }
 636                 FREE(ktriov, M_TEMP);
 637         }
 638 #endif
 639
 640 out:
 641         if ( (error == 0) )
 642                 fp_drop_written(p, fdes, fp);
 643         else
 644                 fp_drop(p, fdes, fp, 0);
 645         return(error);
 646 }
 647
 648
 649 int
 650 rd_uio(p, fdes, uio, retval)
 651         struct proc *p;
 652         int fdes;
 653         register uio_t uio;
 654         user_ssize_t *retval;
 655 {
 656         struct fileproc *fp;
 657         int error;
 658         user_ssize_t count;
 659 #if KTRACE
 660         struct iovec_64 *ktriov = NULL;
 661         struct uio ktruio;
 662         int didktr = 0;
 663         u_int iovlen;
 664 #endif
 665
 666         if ( (error = preparefileread(p, &fp, fdes, 0)) )
 667                 return (error);
 668
 669         count = uio_resid(uio);
 670 #if KTRACE
 671         /*
 672          * if tracing, save a copy of iovec
 673          */
 674         if (KTRPOINT(p, KTR_GENIO)) {
 675                 iovlen = uio->uio_iovcnt *
 676                         (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
 677                 MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
 678                 if (ktriov != NULL) {
 679                         bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
 680                         ktruio = *uio;
 681                         didktr = 1;
 682                 }
 683         }
 684 #endif
 685         error = fo_read(fp, uio, fp->f_cred, 0, p);
 686
 687         if (error) {
 688                 if (uio_resid(uio) != count && (error == ERESTART ||
 689                                                 error == EINTR || error == EWOULDBLOCK))
 690                         error = 0;
 691         }
 692         *retval = count - uio_resid(uio);
 693
 694 #if KTRACE
 695         if (didktr) {
 696                 if (error == 0) {
 697                         ktruio.uio_iovs.iov64p = ktriov;
 698                         uio_setresid(&ktruio, *retval);
 699                         ktrgenio(p->p_tracep, fdes, UIO_READ, &ktruio, error);
 700                 }
 701                 FREE(ktriov, M_TEMP);
 702         }
 703 #endif
 704         donefileread(p, fp, fdes);
 705
 706         return (error);
 707 }
 708
 709 /*
 710  * Ioctl system call
 711  *
 712  */
 713 int
 714 ioctl(struct proc *p, register struct ioctl_args *uap, __unused register_t *retval)
 715 {
 716         struct fileproc *fp;
 717         register u_long com;
 718         int error = 0;
 719         register u_int size;
 720         caddr_t datap, memp;
 721         boolean_t is64bit;
 722         int tmp;
 723 #define STK_PARAMS      128
 724         char stkbuf[STK_PARAMS];
 725         int fd = uap->fd;
 726
 727         AUDIT_ARG(fd, uap->fd);
 728         AUDIT_ARG(cmd, CAST_DOWN(int, uap->com)); /* LP64todo: uap->com is a user-land long */
 729         AUDIT_ARG(addr, uap->data);
 730
 731         is64bit = proc_is64bit(p);
 732
 733         proc_fdlock(p);
 734         error = fp_lookup(p,fd,&fp,1);
 735         if (error)  {
 736                 proc_fdunlock(p);
 737                 return(error);
 738         }
 739
 740         AUDIT_ARG(file, p, fp);
 741
 742         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
 743                         error = EBADF;
 744                         goto out;
 745         }
 746
 747 #if NETAT
 748         /*
 749          * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
 750          * while implementing an ATioctl system call
 751          */
 752         {
 753                 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
 754                         u_long  fixed_command;
 755 #ifdef APPLETALK_DEBUG
 756                         kprintf("ioctl: special AppleTalk \n");
 757 #endif
 758                         datap = &stkbuf[0];
 759                         *(user_addr_t *)datap = uap->data;
 760                         fixed_command = _IOW(0, 0xff99, uap->data);
 761                         error = fo_ioctl(fp, fixed_command, datap, p);
 762                         goto out;
 763                 }
 764         }
 765
 766 #endif /* NETAT */
 767
 768
 769         switch (com = uap->com) {
 770         case FIONCLEX:
 771                 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
 772                 error =0;
 773                 goto out;
 774         case FIOCLEX:
 775                 *fdflags(p, uap->fd) |= UF_EXCLOSE;
 776                 error =0;
 777                 goto out;
 778         }
 779
 780         /*
 781          * Interpret high order word to find amount of data to be
 782          * copied to/from the user's address space.
 783          */
 784         size = IOCPARM_LEN(com);
 785         if (size > IOCPARM_MAX) {
 786                         error = ENOTTY;
 787                         goto out;
 788         }
 789         memp = NULL;
 790         if (size > sizeof (stkbuf)) {
 791                 proc_fdunlock(p);
 792                 if ((memp = (caddr_t)kalloc(size)) == 0) {
 793                         proc_fdlock(p);
 794                         error = ENOMEM;
 795                         goto out;
 796                 }
 797                 proc_fdlock(p);
 798                 datap = memp;
 799         } else
 800                 datap = &stkbuf[0];
 801         if (com&IOC_IN) {
 802                 if (size) {
 803                         proc_fdunlock(p);
 804                         error = copyin(uap->data, datap, size);
 805                         if (error) {
 806                                 if (memp)
 807                                         kfree(memp, size);
 808                                 proc_fdlock(p);
 809                                 goto out;
 810                         }
 811                         proc_fdlock(p);
 812                 } else {
 813                         /* XXX - IOC_IN and no size?  we should proably return an error here!! */
 814                         if (is64bit) {
 815                                 *(user_addr_t *)datap = uap->data;
 816                         }
 817                         else {
 818                                 *(uint32_t *)datap = (uint32_t)uap->data;
 819                         }
 820                 }
 821         } else if ((com&IOC_OUT) && size)
 822                 /*
 823                  * Zero the buffer so the user always
 824                  * gets back something deterministic.
 825                  */
 826                 bzero(datap, size);
 827         else if (com&IOC_VOID) {
 828                 /* XXX - this is odd since IOC_VOID means no parameters */
 829                 if (is64bit) {
 830                         *(user_addr_t *)datap = uap->data;
 831                 }
 832                 else {
 833                         *(uint32_t *)datap = (uint32_t)uap->data;
 834                 }
 835         }
 836
 837         switch (com) {
 838
 839         case FIONBIO:
 840                 if ( (tmp = *(int *)datap) )
 841                         fp->f_flag |= FNONBLOCK;
 842                 else
 843                         fp->f_flag &= ~FNONBLOCK;
 844                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
 845                 break;
 846
 847         case FIOASYNC:
 848                 if ( (tmp = *(int *)datap) )
 849                         fp->f_flag |= FASYNC;
 850                 else
 851                         fp->f_flag &= ~FASYNC;
 852                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
 853                 break;
 854
 855         case FIOSETOWN:
 856                 tmp = *(int *)datap;
 857                 if (fp->f_type == DTYPE_SOCKET) {
 858                         ((struct socket *)fp->f_data)->so_pgid = tmp;
 859                         error = 0;
 860                         break;
 861                 }
 862                 if (fp->f_type == DTYPE_PIPE) {
 863                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
 864                         break;
 865                 }
 866                 if (tmp <= 0) {
 867                         tmp = -tmp;
 868                 } else {
 869                         struct proc *p1 = pfind(tmp);
 870                         if (p1 == 0) {
 871                                 error = ESRCH;
 872                                 break;
 873                         }
 874                         tmp = p1->p_pgrp->pg_id;
 875                 }
 876                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
 877                 break;
 878
 879         case FIOGETOWN:
 880                 if (fp->f_type == DTYPE_SOCKET) {
 881                         error = 0;
 882                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
 883                         break;
 884                 }
 885                 error = fo_ioctl(fp, TIOCGPGRP, datap, p);
 886                 *(int *)datap = -*(int *)datap;
 887                 break;
 888
 889         default:
 890                 error = fo_ioctl(fp, com, datap, p);
 891                 /*
 892                  * Copy any data to user, size was
 893                  * already set and checked above.
 894                  */
 895                 if (error == 0 && (com&IOC_OUT) && size)
 896                         error = copyout(datap, uap->data, (u_int)size);
 897                 break;
 898         }
 899         proc_fdunlock(p);
 900         if (memp)
 901                 kfree(memp, size);
 902         proc_fdlock(p);
 903 out:
 904         fp_drop(p, fd, fp, 1);
 905         proc_fdunlock(p);
 906         return(error);
 907 }
 908
 909 int     selwait, nselcoll;
     /*
      * selwait:  its address is the wait event passed to
      *           wait_queue_assert_wait() by selprocess() and to
      *           wait_queue_wakeup_all() by selwakeup().
      * nselcoll: incremented by selwakeup() each time it finds SI_COLL set
      *           on a selinfo.
      */
     /*
      * Pass identifiers handed to selprocess() and selscan(): select() starts
      * with SEL_FIRSTPASS, selcontinue() resumes with SEL_SECONDPASS.
      */
 910 #define SEL_FIRSTPASS 1
 911 #define SEL_SECONDPASS 2
 912 extern int selcontinue(int error);
 913 extern int selprocess(int error, int sel_pass);
     /* File-local helpers that walk the three fd bitmaps of a select() call. */
 914 static int selscan(struct proc *p, struct _select * sel,
 915                         int nfd, register_t *retval, int sel_pass, wait_queue_sub_t wqsub);
 916 static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
 917                         int nfd, int * count);
 918 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
     /* Defined elsewhere; select() uses it to turn a timeval into an interval. */
 919 extern uint64_t tvtoabstime(struct timeval      *tvp);
 920
 921 /*
 922  * Select system call.
 923  */
 924 int
 925 select(struct proc *p, struct select_args *uap, register_t *retval)
 926 {
 927         int error = 0;
 928         u_int ni, nw, size;
 929         thread_t th_act;
 930         struct uthread  *uth;
 931         struct _select *sel;
 932         int needzerofill = 1;
 933         int count = 0;
 934
 935         th_act = current_thread();
 936         uth = get_bsdthread_info(th_act);
 937         sel = &uth->uu_select;
 938         retval = (int *)get_bsduthreadrval(th_act);
 939         *retval = 0;
 940
 941         if (uap->nd < 0) {
 942                 return (EINVAL);
 943         }
 944
 945         if (uap->nd > p->p_fd->fd_nfiles)
 946                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
 947
 948         nw = howmany(uap->nd, NFDBITS);
 949         ni = nw * sizeof(fd_mask);
 950
 951         /*
 952          * if this is the first select by the thread
 953          * allocate the space for bits.
 954          */
 955         if (sel->nbytes == 0) {
 956                 sel->nbytes = 3 * ni;
 957                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 958                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 959                 if ((sel->ibits == NULL) || (sel->obits == NULL))
 960                         panic("select out of memory");
 961                 needzerofill = 0;
 962         }
 963
 964         /*
 965          * if the previously allocated space for the bits
 966          * is smaller than what is requested. Reallocate.
 967          */
 968         if (sel->nbytes < (3 * ni)) {
 969                 sel->nbytes = (3 * ni);
 970                 FREE(sel->ibits, M_TEMP);
 971                 FREE(sel->obits, M_TEMP);
 972                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 973                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
 974                 if ((sel->ibits == NULL) || (sel->obits == NULL))
 975                         panic("select out of memory");
 976                 needzerofill = 0;
 977         }
 978
 979         if (needzerofill) {
 980                 bzero((caddr_t)sel->ibits, sel->nbytes);
 981                 bzero((caddr_t)sel->obits, sel->nbytes);
 982         }
 983
 984         /*
 985          * get the bits from the user address space
 986          */
 987 #define getbits(name, x) \
 988         do { \
 989                 if (uap->name && (error = copyin(uap->name, \
 990                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
 991                         goto continuation; \
 992         } while (0)
 993
 994         getbits(in, 0);
 995         getbits(ou, 1);
 996         getbits(ex, 2);
 997 #undef  getbits
 998
 999         if (uap->tv) {
1000                 struct timeval atv;
1001                 if (IS_64BIT_PROCESS(p)) {
1002                         struct user_timeval atv64;
1003                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
1004                         /* Loses resolution - assume timeout < 68 years */
1005                         atv.tv_sec = atv64.tv_sec;
1006                         atv.tv_usec = atv64.tv_usec;
1007                 } else {
1008                         error = copyin(uap->tv, (caddr_t)&atv, sizeof(atv));
1009                 }
1010                 if (error)
1011                         goto continuation;
1012                 if (itimerfix(&atv)) {
1013                         error = EINVAL;
1014                         goto continuation;
1015                 }
1016
1017                 clock_absolutetime_interval_to_deadline(
1018                                                                                 tvtoabstime(&atv), &sel->abstime);
1019         }
1020         else
1021                 sel->abstime = 0;
1022
1023         if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count)) ) {
1024                         goto continuation;
1025         }
1026
1027         sel->count = count;
1028         size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
1029         if (sel->allocsize) {
1030                 if (sel->wqset == 0)
1031                         panic("select: wql memory smashed");
1032                 /* needed for the select now */
1033                 if (size > sel->allocsize) {
1034                         kfree(sel->wqset,  sel->allocsize);
1035                         sel->allocsize = size;
1036                         sel->wqset = (wait_queue_set_t)kalloc(size);
1037                         if (sel->wqset == (wait_queue_set_t)NULL)
1038                                 panic("failed to allocate memory for waitqueue\n");
1039                 }
1040         } else {
1041                 sel->count = count;
1042                 sel->allocsize = size;
1043                 sel->wqset = (wait_queue_set_t)kalloc(sel->allocsize);
1044                 if (sel->wqset == (wait_queue_set_t)NULL)
1045                         panic("failed to allocate memory for waitqueue\n");
1046         }
1047         bzero(sel->wqset, size);
1048         sel->wql = (char *)sel->wqset + SIZEOF_WAITQUEUE_SET;
1049         wait_queue_set_init(sel->wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
1050
1051 continuation:
1052         return selprocess(error, SEL_FIRSTPASS);
1053 }
1054
1055 int
1056 selcontinue(int error)
1057 {
1058         return selprocess(error, SEL_SECONDPASS);
1059 }
1060
 1061 int
 1062 selprocess(int error, int sel_pass)
 1063 {
              /*
               * Back half of select(): scans the selected descriptors and, if
               * nothing is ready and time remains, blocks on the thread's wait
               * queue set.  Entered from select() with SEL_FIRSTPASS and from
               * selcontinue() with SEL_SECONDPASS.  The process, the system
               * call arguments, the return-value slot and the select state are
               * all re-fetched from the current thread instead of being passed
               * in.
               *
               * error:    error carried in by the caller; non-zero goes
               *           straight to the cleanup at "done".
               * sel_pass: SEL_FIRSTPASS or SEL_SECONDPASS.
               *
               * Returns 0 or an errno value.  ERESTART is reported as EINTR
               * and EWOULDBLOCK as 0; when the result is 0 the output bitmaps
               * are copied out to the user's fd sets.
               */
 1064         int ncoll;
 1065         u_int ni, nw;
 1066         thread_t th_act;
 1067         struct uthread  *uth;
 1068         struct proc *p;
 1069         struct select_args *uap;
 1070         int *retval;
 1071         struct _select *sel;
 1072         int unwind = 1;
 1073         int prepost = 0;
 1074         int somewakeup = 0;
 1075         int doretry = 0;
 1076         wait_result_t wait_result;
 1077
 1078         p = current_proc();
 1079         th_act = current_thread();
 1080         uap = (struct select_args *)get_bsduthreadarg(th_act);
 1081         retval = (int *)get_bsduthreadrval(th_act);
 1082         uth = get_bsdthread_info(th_act);
 1083         sel = &uth->uu_select;
 1084
 1085         /* if it is first pass wait queue is not setup yet */
 1086         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
 1087                         unwind = 0;
              /* no descriptors were selected, so there is nothing to unlink or drop */
 1088         if (sel->count == 0)
 1089                         unwind = 0;
 1090 retry:
 1091         if (error != 0) {
 1092           goto done;
 1093         }
 1094
              /* ncoll is sampled here but not read again in this function */
 1095         ncoll = nselcoll;
 1096         p->p_flag |= P_SELECT;
 1097         /* skip scans if the select is just for timeouts */
 1098         if (sel->count) {
 1099                 if (sel_pass == SEL_FIRSTPASS)
 1100                         wait_queue_sub_clearrefs(sel->wqset);
 1101
 1102                 error = selscan(p, sel, uap->nd, retval, sel_pass, sel->wqset);
                      /* finished on a scan error or as soon as any descriptor is ready */
 1103                 if (error || *retval) {
 1104                         goto done;
 1105                 }
 1106                 if (prepost) {
 1107                         /* if this is a select of the log, we can wake up and discover that
 1108                         * someone else already read the data; go to select again if time permits
 1109                         */
 1110                         prepost = 0;
 1111                         doretry = 1;
 1112                 }
 1113                 if (somewakeup) {
 1114                         somewakeup = 0;
 1115                         doretry = 1;
 1116                 }
 1117         }
 1118
              /* a timeout was supplied: give up once the deadline has passed */
 1119         if (uap->tv) {
 1120                 uint64_t        now;
 1121
 1122                 clock_get_uptime(&now);
 1123                 if (now >= sel->abstime)
 1124                         goto done;
 1125         }
 1126
 1127         if (doretry) {
 1128                 /* cleanup obits and try again */
 1129                 doretry = 0;
 1130                 sel_pass = SEL_FIRSTPASS;
 1131                 goto retry;
 1132         }
 1133
 1134         /*
 1135          * To effect a poll, the timeout argument should be
 1136          * non-nil, pointing to a zero-valued timeval structure.
 1137          */
 1138         if (uap->tv && sel->abstime == 0) {
 1139                 goto done;
 1140         }
 1141
 1142         /* No spurious wakeups due to colls,no need to check for them */
 1143          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
 1144                 sel_pass = SEL_FIRSTPASS;
 1145                 goto retry;
 1146         }
 1147
 1148         p->p_flag &= ~P_SELECT;
 1149
 1150         /* if the select is just for timeout skip check */
              /* (the test above already sent every second pass back to retry) */
 1151         if (sel->count &&(sel_pass == SEL_SECONDPASS))
 1152                 panic("selprocess: 2nd pass assertwaiting");
 1153
 1154         /* Wait Queue Subordinate has waitqueue as first element */
 1155         wait_result = wait_queue_assert_wait((wait_queue_t)sel->wqset,
 1156                                              &selwait, THREAD_ABORTSAFE, sel->abstime);
 1157         if (wait_result != THREAD_AWAKENED) {
 1158                 /* there are no preposted events */
                      /* sleep, naming selcontinue as the continuation for the second pass */
 1159                 error = tsleep1(NULL, PSOCK | PCATCH,
 1160                                 "select", 0, selcontinue);
 1161         } else  {
 1162                 prepost = 1;
 1163                 error = 0;
 1164         }
 1165
 1166         sel_pass = SEL_SECONDPASS;
 1167         if (error == 0) {
 1168                 if (!prepost)
 1169                         somewakeup =1;
 1170                 goto retry;
 1171         }
 1172 done:
              /* undo the wait queue links and the references taken by selcount() */
 1173         if (unwind) {
 1174                 wait_subqueue_unlink_all(sel->wqset);
 1175                 seldrop(p, sel->ibits, uap->nd);
 1176         }
 1177         p->p_flag &= ~P_SELECT;
 1178         /* select is not restarted after signals... */
 1179         if (error == ERESTART)
 1180                 error = EINTR;
 1181         if (error == EWOULDBLOCK)
 1182                 error = 0;
 1183         nw = howmany(uap->nd, NFDBITS);
 1184         ni = nw * sizeof(fd_mask);
 1185
              /* copy one result bitmap out; a copyout failure becomes the return value */
 1186 #define putbits(name, x) \
 1187         do { \
 1188                 if (uap->name && (error2 = \
 1189                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
 1190                         error = error2; \
 1191         } while (0)
 1192
 1193         if (error == 0) {
 1194                 int error2;
 1195
 1196                 putbits(in, 0);
 1197                 putbits(ou, 1);
 1198                 putbits(ex, 2);
 1199 #undef putbits
 1200         }
 1201         return(error);
 1202 }
1203
1204 static int
1205 selscan(p, sel, nfd, retval, sel_pass, wqsub)
1206         struct proc *p;
1207         struct _select *sel;
1208         int nfd;
1209         register_t *retval;
1210         int sel_pass;
1211         wait_queue_sub_t wqsub;
1212 {
1213         register struct filedesc *fdp = p->p_fd;
1214         register int msk, i, j, fd;
1215         register u_int32_t bits;
1216         struct fileproc *fp;
1217         int n = 0;
1218         int nc = 0;
1219         static int flag[3] = { FREAD, FWRITE, 0 };
1220         u_int32_t *iptr, *optr;
1221         u_int nw;
1222         u_int32_t *ibits, *obits;
1223         char * wql;
1224         char * wql_ptr;
1225
1226         /*
1227          * Problems when reboot; due to MacOSX signal probs
1228          * in Beaker1C ; verify that the p->p_fd is valid
1229          */
1230         if (fdp == NULL) {
1231                 *retval=0;
1232                 return(EIO);
1233         }
1234         ibits = sel->ibits;
1235         obits = sel->obits;
1236         wql = sel->wql;
1237
1238         nw = howmany(nfd, NFDBITS);
1239
1240         nc = 0;
1241         proc_fdlock(p);
1242
1243         if (sel->count) {
1244                 for (msk = 0; msk < 3; msk++) {
1245                         iptr = (u_int32_t *)&ibits[msk * nw];
1246                         optr = (u_int32_t *)&obits[msk * nw];
1247
1248                         for (i = 0; i < nfd; i += NFDBITS) {
1249                                 bits = iptr[i/NFDBITS];
1250
1251                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1252                                         bits &= ~(1 << j);
1253                                         fp = fdp->fd_ofiles[fd];
1254
1255                                         if (fp == NULL ||
1256                                                 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1257                                                 proc_fdunlock(p);
1258                                                 return(EBADF);
1259                                         }
1260                                         if (sel_pass == SEL_SECONDPASS) {
1261                                                 wql_ptr = (char *)0;
1262                                                 fp->f_flags &= ~FP_INSELECT;
1263                                                 fp->f_waddr = (void *)0;
1264                                         } else {
1265                                                 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
1266                                                 fp->f_flags |= FP_INSELECT;
1267                                                 fp->f_waddr = (void *)wqsub;
1268                                         }
1269                                         if (fp->f_ops && fo_select(fp, flag[msk], wql_ptr, p)) {
1270                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
1271                                                 n++;
1272                                         }
1273                                         nc++;
1274                                 }
1275                         }
1276                 }
1277         }
1278         proc_fdunlock(p);
1279         *retval = n;
1280         return (0);
1281 }
1282
 1283 static int poll_callback(struct kqueue *, struct kevent *, void *);
 1284
      /*
       * State that poll() passes to kevent_scan() and that comes back to
       * poll_callback() as its `data' argument.  poll() allocates it with the
       * kernel copy of the pollfd array placed directly behind it.
       */
 1285 struct poll_continue_args {
 1286         user_addr_t pca_fds;    /* set by poll() to uap->fds */
 1287         u_int pca_nfds;         /* set by poll() to the number of pollfd entries */
 1288         u_int pca_rfds;         /* result count: incremented by poll_callback(), read back by poll() */
 1289 };
1290
1291 int
1292 poll(struct proc *p, struct poll_args *uap, register_t *retval)
1293 {
1294         struct poll_continue_args *cont;
1295         struct pollfd *fds;
1296         struct kqueue *kq;
1297         struct timeval atv;
1298         int ncoll, error = 0;
1299         u_int nfds = uap->nfds;
1300         u_int rfds = 0;
1301         u_int i;
1302         size_t ni;
1303
1304         /*
1305          * This is kinda bogus.  We have fd limits, but that is not
1306          * really related to the size of the pollfd array.  Make sure
1307          * we let the process use at least FD_SETSIZE entries and at
1308          * least enough for the current limits.  We want to be reasonably
1309          * safe, but not overly restrictive.
1310          */
1311         if (nfds > OPEN_MAX ||
1312             (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE))
1313                 return (EINVAL);
1314
1315         kq = kqueue_alloc(p);
1316         if (kq == NULL)
1317                 return (EAGAIN);
1318
1319         ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
1320         MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
1321         if (NULL == cont) {
1322                 error = EAGAIN;
1323                 goto out;
1324         }
1325
1326         fds = (struct pollfd *)&cont[1];
1327         error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
1328         if (error)
1329                 goto out;
1330
1331         if (uap->timeout != -1) {
1332                 struct timeval rtv;
1333
1334                 atv.tv_sec = uap->timeout / 1000;
1335                 atv.tv_usec = (uap->timeout % 1000) * 1000;
1336                 if (itimerfix(&atv)) {
1337                         error = EINVAL;
1338                         goto out;
1339                 }
1340                 getmicrouptime(&rtv);
1341                 timevaladd(&atv, &rtv);
1342         } else {
1343                 atv.tv_sec = 0;
1344                 atv.tv_usec = 0;
1345         }
1346
1347         /* JMM - all this P_SELECT stuff is bogus */
1348         ncoll = nselcoll;
1349         p->p_flag |= P_SELECT;
1350
1351         for (i = 0; i < nfds; i++) {
1352                 short events = fds[i].events;
1353                 struct kevent kev;
1354                 int kerror = 0;
1355
1356                 /* per spec, ignore fd values below zero */
1357                 if (fds[i].fd < 0) {
1358                         fds[i].revents = 0;
1359                         continue;
1360                 }
1361
1362                 /* convert the poll event into a kqueue kevent */
1363                 kev.ident = fds[i].fd;
1364                 kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
1365                 kev.fflags = NOTE_LOWAT;
1366                 kev.data = 1; /* efficiency be damned: any data should trigger */
1367                 kev.udata = CAST_USER_ADDR_T(&fds[i]);
1368
1369                 /* Handle input events */
1370                 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND )) {
1371                         kev.filter = EVFILT_READ;
1372                         if (!(events & ( POLLIN | POLLRDNORM )))
1373                                 kev.flags |= EV_OOBAND;
1374                         kerror = kevent_register(kq, &kev, p);
1375                 }
1376
1377                 /* Handle output events */
1378                 if (kerror == 0 &&
1379                     events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
1380                         kev.filter = EVFILT_WRITE;
1381                         kerror = kevent_register(kq, &kev, p);
1382                 }
1383
1384                 /* Handle BSD extension vnode events */
1385                 if (kerror == 0 &&
1386                     events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
1387                         kev.filter = EVFILT_VNODE;
1388                         kev.fflags = 0;
1389                         if (events & POLLEXTEND)
1390                                 kev.fflags |= NOTE_EXTEND;
1391                         if (events & POLLATTRIB)
1392                                 kev.fflags |= NOTE_ATTRIB;
1393                         if (events & POLLNLINK)
1394                                 kev.fflags |= NOTE_LINK;
1395                         if (events & POLLWRITE)
1396                                 kev.fflags |= NOTE_WRITE;
1397                         kerror = kevent_register(kq, &kev, p);
1398                 }
1399
1400                 if (kerror != 0) {
1401                         fds[i].revents = POLLNVAL;
1402                         rfds++;
1403                 } else
1404                         fds[i].revents = 0;
1405         }
1406
1407         /* Did we have any trouble registering? */
1408         if (rfds > 0)
1409                 goto done;
1410
1411         /* scan for, and possibly wait for, the kevents to trigger */
1412         cont->pca_fds = uap->fds;
1413         cont->pca_nfds = nfds;
1414         cont->pca_rfds = rfds;
1415         error = kevent_scan(kq, poll_callback, NULL, cont, &atv, p);
1416         rfds = cont->pca_rfds;
1417
1418  done:
1419         p->p_flag &= ~P_SELECT;
1420         /* poll is not restarted after signals... */
1421         if (error == ERESTART)
1422                 error = EINTR;
1423         if (error == EWOULDBLOCK)
1424                 error = 0;
1425         if (error == 0) {
1426                 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
1427                 *retval = rfds;
1428         }
1429
1430  out:
1431         if (NULL != cont)
1432                 FREE(cont, M_TEMP);
1433
1434         kqueue_dealloc(kq, p);
1435         return (error);
1436 }
1437
1438 static int
1439 poll_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data)
1440 {
1441         struct poll_continue_args *cont = (struct poll_continue_args *)data;
1442         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
1443         short mask;
1444
1445         /* convert the results back into revents */
1446         if (kevp->flags & EV_EOF)
1447                 fds->revents |= POLLHUP;
1448         if (kevp->flags & EV_ERROR)
1449                 fds->revents |= POLLERR;
1450         cont->pca_rfds++;
1451
1452         switch (kevp->filter) {
1453         case EVFILT_READ:
1454                 if (fds->revents & POLLHUP)
1455                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
1456                 else {
1457                         mask = 0;
1458                         if (kevp->data != 0)
1459                                 mask |= (POLLIN | POLLRDNORM );
1460                         if (kevp->flags & EV_OOBAND)
1461                                 mask |= ( POLLPRI | POLLRDBAND );
1462                 }
1463                 fds->revents |= (fds->events & mask);
1464                 break;
1465
1466         case EVFILT_WRITE:
1467                 if (!(fds->revents & POLLHUP))
1468                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
1469                 break;
1470
1471         case EVFILT_PROC:
1472                 if (kevp->fflags & NOTE_EXTEND)
1473                         fds->revents |= (fds->events & POLLEXTEND);
1474                 if (kevp->fflags & NOTE_ATTRIB)
1475                         fds->revents |= (fds->events & POLLATTRIB);
1476                 if (kevp->fflags & NOTE_LINK)
1477                         fds->revents |= (fds->events & POLLNLINK);
1478                 if (kevp->fflags & NOTE_WRITE)
1479                         fds->revents |= (fds->events & POLLWRITE);
1480                 break;
1481         }
1482         return 0;
1483 }
1484
1485 int
1486 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
1487 {
1488
1489         return (1);
1490 }
1491
1492 static int
1493 selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits,
1494                  int nfd, int *count)
1495 {
1496         register struct filedesc *fdp = p->p_fd;
1497         register int msk, i, j, fd;
1498         register u_int32_t bits;
1499         struct fileproc *fp;
1500         int n = 0;
1501         u_int32_t *iptr;
1502         u_int nw;
1503         int error=0;
1504         int dropcount;
1505
1506         /*
1507          * Problems when reboot; due to MacOSX signal probs
1508          * in Beaker1C ; verify that the p->p_fd is valid
1509          */
1510         if (fdp == NULL) {
1511                 *count=0;
1512                 return(EIO);
1513         }
1514         nw = howmany(nfd, NFDBITS);
1515
1516         proc_fdlock(p);
1517         for (msk = 0; msk < 3; msk++) {
1518                 iptr = (u_int32_t *)&ibits[msk * nw];
1519                 for (i = 0; i < nfd; i += NFDBITS) {
1520                         bits = iptr[i/NFDBITS];
1521                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1522                                 bits &= ~(1 << j);
1523                                 fp = fdp->fd_ofiles[fd];
1524                                 if (fp == NULL ||
1525                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
1526                                                 *count=0;
1527                                                 error = EBADF;
1528                                                 goto bad;
1529                                 }
1530                                 fp->f_iocount++;
1531                                 n++;
1532                         }
1533                 }
1534         }
1535         proc_fdunlock(p);
1536
1537         *count = n;
1538         return (0);
1539 bad:
1540         dropcount = 0;
1541
1542         if (n== 0)
1543                 goto out;
1544         /* undo the iocounts */
1545         for (msk = 0; msk < 3; msk++) {
1546                 iptr = (u_int32_t *)&ibits[msk * nw];
1547                 for (i = 0; i < nfd; i += NFDBITS) {
1548                         bits = iptr[i/NFDBITS];
1549                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1550                                 bits &= ~(1 << j);
1551                                 fp = fdp->fd_ofiles[fd];
1552                                 if (dropcount >= n)
1553                                         goto out;
1554                                 fp->f_iocount--;
1555
1556                                 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1557                                         p->p_fpdrainwait = 0;
1558                                         wakeup(&p->p_fpdrainwait);
1559                                 }
1560                                 dropcount++;
1561                         }
1562                 }
1563         }
1564 out:
1565         proc_fdunlock(p);
1566         return(error);
1567 }
1568
1569 static int
1570 seldrop(p, ibits, nfd)
1571         struct proc *p;
1572         u_int32_t *ibits;
1573         int nfd;
1574 {
1575         register struct filedesc *fdp = p->p_fd;
1576         register int msk, i, j, fd;
1577         register u_int32_t bits;
1578         struct fileproc *fp;
1579         int n = 0;
1580         u_int32_t *iptr;
1581         u_int nw;
1582
1583         /*
1584          * Problems when reboot; due to MacOSX signal probs
1585          * in Beaker1C ; verify that the p->p_fd is valid
1586          */
1587         if (fdp == NULL) {
1588                 return(EIO);
1589         }
1590
1591         nw = howmany(nfd, NFDBITS);
1592
1593
1594         proc_fdlock(p);
1595         for (msk = 0; msk < 3; msk++) {
1596                 iptr = (u_int32_t *)&ibits[msk * nw];
1597                 for (i = 0; i < nfd; i += NFDBITS) {
1598                         bits = iptr[i/NFDBITS];
1599                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
1600                                 bits &= ~(1 << j);
1601                                 fp = fdp->fd_ofiles[fd];
1602                                 if (fp == NULL
1603 #if 0
1604                         /* if you are here then it is being closed */
1605                                         || (fdp->fd_ofileflags[fd] & UF_RESERVED)
1606 #endif
1607                                         ) {
1608                                                 proc_fdunlock(p);
1609                                                 return(EBADF);
1610                                 }
1611                                 n++;
1612                                 fp->f_iocount--;
1613                                 fp->f_flags &= ~FP_INSELECT;
1614
1615                                 if (p->p_fpdrainwait && fp->f_iocount == 0) {
1616                                         p->p_fpdrainwait = 0;
1617                                         wakeup(&p->p_fpdrainwait);
1618                                 }
1619                         }
1620                 }
1621         }
1622         proc_fdunlock(p);
1623         return (0);
1624 }
1625
1626 /*
1627  * Record a select request.
1628  */
1629 void
1630 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
1631 {
1632         thread_t        cur_act = current_thread();
1633         struct uthread * ut = get_bsdthread_info(cur_act);
1634
1635         /* need to look at collisions */
1636
1637         if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) {
1638                 return;
1639         }
1640
1641         /*do not record if this is second pass of select */
1642         if((p_wql == (void *)0)) {
1643                 return;
1644         }
1645
1646         if ((sip->si_flags & SI_INITED) == 0) {
1647                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
1648                 sip->si_flags |= SI_INITED;
1649                 sip->si_flags &= ~SI_CLEAR;
1650         }
1651
1652         if (sip->si_flags & SI_RECORDED) {
1653                 sip->si_flags |= SI_COLL;
1654         } else
1655                 sip->si_flags &= ~SI_COLL;
1656
1657         sip->si_flags |= SI_RECORDED;
1658         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_select.wqset))
1659                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_select.wqset,
1660                                         (wait_queue_link_t)p_wql);
1661
1662         return;
1663 }
1664
1665 void
1666 selwakeup(sip)
1667         register struct selinfo *sip;
1668 {
1669
1670         if ((sip->si_flags & SI_INITED) == 0) {
1671                 return;
1672         }
1673
1674         if (sip->si_flags & SI_COLL) {
1675                 nselcoll++;
1676                 sip->si_flags &= ~SI_COLL;
1677 #if 0
1678                 /* will not  support */
1679                 //wakeup((caddr_t)&selwait);
1680 #endif
1681         }
1682
1683         if (sip->si_flags & SI_RECORDED) {
1684                 wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED);
1685                 sip->si_flags &= ~SI_RECORDED;
1686         }
1687
1688 }
1689
1690 void
1691 selthreadclear(sip)
1692         register struct selinfo *sip;
1693 {
1694
1695         if ((sip->si_flags & SI_INITED) == 0) {
1696                 return;
1697         }
1698         if (sip->si_flags & SI_RECORDED) {
1699                         selwakeup(sip);
1700                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
1701         }
1702         sip->si_flags |= SI_CLEAR;
1703         wait_queue_unlinkall_nofree(&sip->si_wait_queue);
1704 }
1705
1706
1707
1708
 1709 #define DBG_POST        0x10
 1710 #define DBG_WATCH       0x11
 1711 #define DBG_WAIT        0x12
 1712 #define DBG_MOD         0x13
 1713 #define DBG_EWAKEUP     0x14
 1714 #define DBG_ENQUEUE     0x15
 1715 #define DBG_DEQUEUE     0x16
 1716
      /*
       * Each sub-code above combined with the DBG_EVENT class through
       * MISCDBG_CODE().  NOTE(review): the users of these codes are outside
       * this part of the file; presumably they are trace points of the
       * event code that follows -- confirm.
       */
 1717 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
 1718 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
 1719 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
 1720 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
 1721 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
 1722 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
 1723 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
1724
1725
/*
 * EVPROCDEQUE(p, evq)
 *
 * Under the proc lock of 'p', take 'evq' off p's pending-event list
 * (p_evlist) if it is currently marked EV_QUEUED, and clear that flag.
 * The EV_QUEUED test is made while holding the lock, so callers may do
 * an unlocked pre-check purely as an optimization.
 *
 * Expands to a single statement: the caller supplies the terminating
 * semicolon, which keeps the macro safe inside an unbraced if/else.
 * Both arguments are evaluated more than once, so they must be free of
 * side effects.
 */
#define EVPROCDEQUE(p, evq)	do {					\
	proc_lock(p);							\
	if ((evq)->ee_flags & EV_QUEUED) {				\
		TAILQ_REMOVE(&(p)->p_evlist, (evq), ee_plist);		\
		(evq)->ee_flags &= ~EV_QUEUED;				\
	}								\
	proc_unlock(p);							\
} while (0)
1734
1735
1736 /*
1737  * called upon socket close. deque and free all events for
1738  * the socket...  socket must be locked by caller.
1739  */
1740 void
1741 evsofree(struct socket *sp)
1742 {
1743         struct eventqelt *evq, *next;
1744         proc_t  p;
1745
1746         if (sp == NULL)
1747                 return;
1748
1749         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
1750                 next = evq->ee_slist.tqe_next;
1751                 p = evq->ee_proc;
1752
1753                 if (evq->ee_flags & EV_QUEUED) {
1754                         EVPROCDEQUE(p, evq);
1755                 }
1756                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
1757                 FREE(evq, M_TEMP);
1758         }
1759 }
1760
1761
1762 /*
1763  * called upon pipe close. deque and free all events for
1764  * the pipe... pipe must be locked by caller
1765  */
1766 void
1767 evpipefree(struct pipe *cpipe)
1768 {
1769         struct eventqelt *evq, *next;
1770         proc_t  p;
1771
1772         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
1773                 next = evq->ee_slist.tqe_next;
1774                 p = evq->ee_proc;
1775
1776                 EVPROCDEQUE(p, evq);
1777
1778                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
1779                 FREE(evq, M_TEMP);
1780         }
1781 }
1782
1783
1784 /*
1785  * enqueue this event if it's not already queued. wakeup
1786  * the proc if we do queue this event to it...
1787  * entered with proc lock held... we drop it before
1788  * doing the wakeup and return in that state
1789  */
1790 static void
1791 evprocenque(struct eventqelt *evq)
1792 {
1793         proc_t  p;
1794
1795         assert(evq);
1796         p = evq->ee_proc;
1797
1798         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, evq, evq->ee_flags, evq->ee_eventmask,0,0);
1799
1800         proc_lock(p);
1801
1802         if (evq->ee_flags & EV_QUEUED) {
1803                 proc_unlock(p);
1804
1805                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1806                 return;
1807         }
1808         evq->ee_flags |= EV_QUEUED;
1809
1810         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
1811
1812         proc_unlock(p);
1813
1814         wakeup(&p->p_evlist);
1815
1816         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
1817 }
1818
1819
1820 /*
1821  * pipe lock must be taken by the caller
1822  */
1823 void
1824 postpipeevent(struct pipe *pipep, int event)
1825 {
1826         int     mask;
1827         struct eventqelt *evq;
1828
1829         if (pipep == NULL)
1830                 return;
1831         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
1832
1833         for (evq = pipep->pipe_evlist.tqh_first;
1834              evq != NULL; evq = evq->ee_slist.tqe_next) {
1835
1836                 if (evq->ee_eventmask == 0)
1837                         continue;
1838                 mask = 0;
1839
1840                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
1841
1842                 case EV_RWBYTES:
1843                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
1844                           mask |= EV_RE;
1845                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
1846                   }
1847                   if ((evq->ee_eventmask & EV_WR) &&
1848                       (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
1849
1850                           if (pipep->pipe_state & PIPE_EOF) {
1851                                   mask |= EV_WR|EV_RESET;
1852                                   break;
1853                           }
1854                           mask |= EV_WR;
1855                           evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt;
1856                   }
1857                   break;
1858
1859                 case EV_WCLOSED:
1860                 case EV_RCLOSED:
1861                   if ((evq->ee_eventmask & EV_RE)) {
1862                           mask |= EV_RE|EV_RCLOSED;
1863                   }
1864                   if ((evq->ee_eventmask & EV_WR)) {
1865                           mask |= EV_WR|EV_WCLOSED;
1866                   }
1867                   break;
1868
1869                 default:
1870                   return;
1871                 }
1872                 if (mask) {
1873                         /*
1874                          * disarm... postevents are nops until this event is 'read' via
1875                          * waitevent and then re-armed via modwatch
1876                          */
1877                         evq->ee_eventmask = 0;
1878
1879                         /*
1880                          * since events are disarmed until after the waitevent
1881                          * the ee_req.er_xxxx fields can't change once we've
1882                          * inserted this event into the proc queue...
1883                          * therefore, the waitevent will see a 'consistent'
1884                          * snapshot of the event, even though it won't hold
1885                          * the pipe lock, and we're updating the event outside
1886                          * of the proc lock, which it will hold
1887                          */
1888                         evq->ee_req.er_eventbits |= mask;
1889
1890                         KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask, 1,0);
1891
1892                         evprocenque(evq);
1893                 }
1894         }
1895         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
1896 }
1897
1898
/*
 * Decide whether a pending socket error must be reported as EV_RESET.
 *
 * Returns non-zero only when the socket is a SOCK_STREAM socket whose
 * so_error is ECONNREFUSED or ECONNRESET and whose protocol state is
 * gone: no pcb, an inpcb in INPCB_STATE_DEAD, no tcpcb, or a tcpcb in
 * TCPS_CLOSED.  Returns 0 in every other case, including so_error == 0.
 */
static int
evsock_reset_pending(struct socket *sp)
{
	struct tcpcb *tp;

	if (sp->so_error == 0)
		return (0);
	if (sp->so_type != SOCK_STREAM)
		return (0);
	if ((sp->so_error != ECONNREFUSED) && (sp->so_error != ECONNRESET))
		return (0);

	if (sp->so_pcb == 0)
		return (1);
	if (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD)
		return (1);

	tp = sototcpcb(sp);

	return ((tp == NULL) || (tp->t_state == TCPS_CLOSED));
}


/*
 * given either a sockbuf or a socket run down the
 * event list and queue ready events found...
 * the socket must be locked by the caller
 *
 * when 'sb' is non-NULL the socket is taken from sb->sb_so and 'sp'
 * is ignored; a NULL socket is a nop.  'event' is examined through
 * EV_DMASK; an unrecognized value ends the scan as soon as an armed
 * watch is encountered.
 */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
	int	mask;
	struct	eventqelt *evq;

	if (sb)
		sp = sb->sb_so;
	if (sp == NULL)
		return;

	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

	for (evq = sp->so_evlist.tqh_first;
	     evq != NULL; evq = evq->ee_slist.tqe_next) {

		/* disarmed watches are skipped */
		if (evq->ee_eventmask == 0)
			continue;
		mask = 0;

		/* ready for reading:
		   - byte cnt >= receive low water mark
		   - read-half of conn closed
		   - conn pending for listening sock
		   - socket error pending

		   ready for writing
		   - byte cnt avail >= send low water mark
		   - write half of conn closed
		   - socket error pending
		   - non-blocking conn completed successfully

		   exception pending
		   - out of band data
		   - sock at out of band mark
		*/

		switch (event & EV_DMASK) {

		case EV_OOB:
			if ((evq->ee_eventmask & EV_EX)) {
				if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
					mask |= EV_EX|EV_OOB;
			}
			break;

		case EV_RWBYTES|EV_OOB:
			if ((evq->ee_eventmask & EV_EX)) {
				if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
					mask |= EV_EX|EV_OOB;
			}
			/*
			 * fall into the next case
			 */
		case EV_RWBYTES:
			if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
				if (evsock_reset_pending(sp)) {
					mask |= EV_RE|EV_RESET;
					break;
				}
				mask |= EV_RE;
				evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;

				if (sp->so_state & SS_CANTRCVMORE) {
					/* read side is finished; the write side is not examined */
					mask |= EV_FIN;
					break;
				}
			}
			if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
				if (evsock_reset_pending(sp)) {
					mask |= EV_WR|EV_RESET;
					break;
				}
				mask |= EV_WR;
				evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
			}
			break;

		case EV_RCONN:
			if ((evq->ee_eventmask & EV_RE)) {
				mask |= EV_RE|EV_RCONN;
				evq->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
			}
			break;

		case EV_WCONN:
			if ((evq->ee_eventmask & EV_WR)) {
				mask |= EV_WR|EV_WCONN;
			}
			break;

		case EV_RCLOSED:
			if ((evq->ee_eventmask & EV_RE)) {
				mask |= EV_RE|EV_RCLOSED;
			}
			break;

		case EV_WCLOSED:
			if ((evq->ee_eventmask & EV_WR)) {
				mask |= EV_WR|EV_WCLOSED;
			}
			break;

		case EV_FIN:
			if (evq->ee_eventmask & EV_RE) {
				mask |= EV_RE|EV_FIN;
			}
			break;

		case EV_RESET:
		case EV_TIMEOUT:
			if (evq->ee_eventmask & EV_RE) {
				mask |= EV_RE | event;
			}
			if (evq->ee_eventmask & EV_WR) {
				mask |= EV_WR | event;
			}
			break;

		default:
			KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
			return;
		} /* switch */

		KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);

		if (mask) {
			/*
			 * disarm... postevents are nops until this event is 'read' via
			 * waitevent and then re-armed via modwatch
			 */
			evq->ee_eventmask = 0;

			/*
			 * since events are disarmed until after the waitevent
			 * the ee_req.er_xxxx fields can't change once we've
			 * inserted this event into the proc queue...
			 * since waitevent can't see this event until we
			 * enqueue it, waitevent will see a 'consistent'
			 * snapshot of the event, even though it won't hold
			 * the socket lock, and we're updating the event outside
			 * of the proc lock, which it will hold
			 */
			evq->ee_req.er_eventbits |= mask;

			evprocenque(evq);
		}
	}
	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}
2065
2066
2067 /*
2068  * watchevent system call. user passes us an event to watch
2069  * for. we malloc an event object, initialize it, and queue
2070  * it to the open socket. when the event occurs, postevent()
2071  * will enque it back to our proc where we can retrieve it
2072  * via waitevent().
2073  *
2074  * should this prevent duplicate events on same socket?
2075  */
2076 int
2077 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
2078 {
2079         struct eventqelt *evq = (struct eventqelt *)0;
2080         struct eventqelt *np = NULL;
2081         struct eventreq *erp;
2082         struct fileproc *fp = NULL;
2083         int error;
2084
2085         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
2086
2087         // get a qelt and fill with users req
2088         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
2089
2090         if (evq == NULL)
2091                 panic("can't MALLOC evq");
2092         erp = &evq->ee_req;
2093
2094         // get users request pkt
2095         if ( (error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
2096                            sizeof(struct eventreq))) ) {
2097                 FREE(evq, M_TEMP);
2098
2099                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2100                 return(error);
2101         }
2102         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,evq,0,0);
2103
2104         // validate, freeing qelt if errors
2105         error = 0;
2106         proc_fdlock(p);
2107
2108         if (erp->er_type != EV_FD) {
2109                 error = EINVAL;
2110         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2111                 error = EBADF;
2112         } else if (fp->f_type == DTYPE_SOCKET) {
2113                 socket_lock((struct socket *)fp->f_data, 1);
2114                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2115         } else if (fp->f_type == DTYPE_PIPE) {
2116                 PIPE_LOCK((struct pipe *)fp->f_data);
2117                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2118         } else {
2119                 fp_drop(p, erp->er_handle, fp, 1);
2120                 error = EINVAL;
2121         }
2122         proc_fdunlock(p);
2123
2124         if (error) {
2125                 FREE(evq, M_TEMP);
2126
2127                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
2128                 return(error);
2129         }
2130
2131         /*
2132          * only allow one watch per file per proc
2133          */
2134         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
2135                 if (np->ee_proc == p) {
2136                         if (fp->f_type == DTYPE_SOCKET)
2137                                 socket_unlock((struct socket *)fp->f_data, 1);
2138                         else
2139                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2140                         fp_drop(p, erp->er_handle, fp, 0);
2141                         FREE(evq, M_TEMP);
2142
2143                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2144                         return(EINVAL);
2145                 }
2146         }
2147         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
2148         evq->ee_proc = p;
2149         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2150         evq->ee_flags = 0;
2151
2152         if (fp->f_type == DTYPE_SOCKET) {
2153                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2154                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
2155
2156                 socket_unlock((struct socket *)fp->f_data, 1);
2157         } else {
2158                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2159                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
2160
2161                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2162         }
2163         fp_drop_event(p, erp->er_handle, fp);
2164
2165         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
2166         return(0);
2167 }
2168
2169
2170
2171 /*
2172  * waitevent system call.
2173  * grabs the next waiting event for this proc and returns
2174  * it. if no events, user can request to sleep with timeout
2175  * or poll mode (tv=NULL);
2176  */
2177 int
2178 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
2179 {
2180         int error = 0;
2181         struct eventqelt *evq;
2182         struct eventreq   er;
2183         uint64_t abstime, interval;
2184
2185         if (uap->tv) {
2186                 struct timeval atv;
2187
2188                 error = copyin(CAST_USER_ADDR_T(uap->tv), (caddr_t)&atv, sizeof (atv));
2189                 if (error)
2190                         return(error);
2191                 if (itimerfix(&atv)) {
2192                         error = EINVAL;
2193                         return(error);
2194                 }
2195                 interval = tvtoabstime(&atv);
2196         } else
2197                 interval = 0;
2198
2199         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
2200
2201         proc_lock(p);
2202 retry:
2203         if ((evq = p->p_evlist.tqh_first) != NULL) {
2204                 /*
2205                  * found one... make a local copy while it's still on the queue
2206                  * to prevent it from changing while in the midst of copying
2207                  * don't want to hold the proc lock across a copyout because
2208                  * it might block on a page fault at the target in user space
2209                  */
2210                 bcopy((caddr_t)&evq->ee_req, (caddr_t)&er, sizeof (struct eventreq));
2211
2212                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
2213
2214                 evq->ee_flags &= ~EV_QUEUED;
2215
2216                 proc_unlock(p);
2217
2218                 error = copyout((caddr_t)&er, CAST_USER_ADDR_T(uap->u_req), sizeof(struct eventreq));
2219
2220                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
2221                              evq->ee_req.er_handle,evq->ee_req.er_eventbits,evq,0);
2222                 return (error);
2223         }
2224         else {
2225                 if (uap->tv && interval == 0) {
2226                         proc_unlock(p);
2227                         *retval = 1;  // poll failed
2228
2229                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
2230                         return (error);
2231                 }
2232                 if (interval != 0)
2233                         clock_absolutetime_interval_to_deadline(interval, &abstime);
2234                 else
2235                         abstime = 0;
2236
2237                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0);
2238
2239                 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
2240
2241                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,&p->p_evlist,0,0,0);
2242
2243                 if (error == 0)
2244                         goto retry;
2245                 if (error == ERESTART)
2246                         error = EINTR;
2247                 if (error == EWOULDBLOCK) {
2248                         *retval = 1;
2249                         error = 0;
2250                 }
2251         }
2252         proc_unlock(p);
2253
2254         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
2255         return (error);
2256 }
2257
2258
2259 /*
2260  * modwatch system call. user passes in event to modify.
2261  * if we find it we reset the event bits and que/deque event
2262  * it needed.
2263  */
2264 int
2265 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
2266 {
2267         struct eventreq er;
2268         struct eventreq *erp = &er;
2269         struct eventqelt *evq;
2270         int error;
2271         struct fileproc *fp;
2272         int flag;
2273
2274         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
2275
2276         /*
2277          * get user's request pkt
2278          */
2279         if ((error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
2280                              sizeof(struct eventreq)))) {
2281                         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2282                 return(error);
2283         }
2284         proc_fdlock(p);
2285
2286         if (erp->er_type != EV_FD) {
2287                 error = EINVAL;
2288         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
2289                 error = EBADF;
2290         } else if (fp->f_type == DTYPE_SOCKET) {
2291                 socket_lock((struct socket *)fp->f_data, 1);
2292                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2293         } else if (fp->f_type == DTYPE_PIPE) {
2294                 PIPE_LOCK((struct pipe *)fp->f_data);
2295                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2296         } else {
2297                 fp_drop(p, erp->er_handle, fp, 1);
2298                 error = EINVAL;
2299         }
2300
2301         if (error) {
2302                 proc_fdunlock(p);
2303                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
2304                 return(error);
2305         }
2306
2307         if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
2308                 fp->f_flags &= ~FP_WAITEVENT;
2309         }
2310         proc_fdunlock(p);
2311
2312         // locate event if possible
2313         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2314                 if (evq->ee_proc == p)
2315                         break;
2316         }
2317         if (evq == NULL) {
2318                 if (fp->f_type == DTYPE_SOCKET)
2319                         socket_unlock((struct socket *)fp->f_data, 1);
2320                 else
2321                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2322                 fp_drop(p, erp->er_handle, fp, 0);
2323                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
2324                 return(EINVAL);
2325         }
2326         KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0);
2327
2328         if (uap->u_eventmask == EV_RM) {
2329                 EVPROCDEQUE(p, evq);
2330
2331                 if (fp->f_type == DTYPE_SOCKET) {
2332                         TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2333                         socket_unlock((struct socket *)fp->f_data, 1);
2334                 } else {
2335                         TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2336                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2337                 }
2338                 fp_drop(p, erp->er_handle, fp, 0);
2339                 FREE(evq, M_TEMP);
2340                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
2341                 return(0);
2342         }
2343         switch (uap->u_eventmask & EV_MASK) {
2344
2345         case 0:
2346                 flag = 0;
2347                 break;
2348
2349         case EV_RE:
2350         case EV_WR:
2351         case EV_RE|EV_WR:
2352                 flag = EV_RWBYTES;
2353                 break;
2354
2355         case EV_EX:
2356                 flag = EV_OOB;
2357                 break;
2358
2359         case EV_EX|EV_RE:
2360         case EV_EX|EV_WR:
2361         case EV_EX|EV_RE|EV_WR:
2362                 flag = EV_OOB|EV_RWBYTES;
2363                 break;
2364
2365         default:
2366                 if (fp->f_type == DTYPE_SOCKET)
2367                         socket_unlock((struct socket *)fp->f_data, 1);
2368                 else
2369                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2370                 fp_drop(p, erp->er_handle, fp, 0);
2371                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
2372                 return(EINVAL);
2373         }
2374         /*
2375          * since we're holding the socket/pipe lock, the event
2376          * cannot go from the unqueued state to the queued state
2377          * however, it can go from the queued state to the unqueued state
2378          * since that direction is protected by the proc_lock...
2379          * so do a quick check for EV_QUEUED w/o holding the proc lock
2380          * since by far the common case will be NOT EV_QUEUED, this saves
2381          * us taking the proc_lock the majority of the time
2382          */
2383         if (evq->ee_flags & EV_QUEUED) {
2384                 /*
2385                  * EVPROCDEQUE will recheck the state after it grabs the proc_lock
2386                  */
2387                 EVPROCDEQUE(p, evq);
2388         }
2389         /*
2390          * while the event is off the proc queue and
2391          * we're holding the socket/pipe lock
2392          * it's safe to update these fields...
2393          */
2394         evq->ee_req.er_eventbits = 0;
2395         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
2396
2397         if (fp->f_type == DTYPE_SOCKET) {
2398                 postevent((struct socket *)fp->f_data, 0, flag);
2399                 socket_unlock((struct socket *)fp->f_data, 1);
2400         }
2401         else {
2402                 postpipeevent((struct pipe *)fp->f_data, flag);
2403                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2404         }
2405         fp_drop(p, erp->er_handle, fp, 0);
2406         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,fp->f_data,flag,0);
2407         return(0);
2408 }
2409
2410 /* this routine is called from the close of fd with proc_fdlock held */
2411 int
2412 waitevent_close(struct proc *p, struct fileproc *fp)
2413 {
2414         struct eventqelt *evq;
2415
2416
2417         fp->f_flags &= ~FP_WAITEVENT;
2418
2419         if (fp->f_type == DTYPE_SOCKET) {
2420                 socket_lock((struct socket *)fp->f_data, 1);
2421                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
2422         }
2423         else if (fp->f_type == DTYPE_PIPE) {
2424                 PIPE_LOCK((struct pipe *)fp->f_data);
2425                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
2426         }
2427         else {
2428                 return(EINVAL);
2429         }
2430         proc_fdunlock(p);
2431
2432
2433         // locate event if possible
2434         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
2435                 if (evq->ee_proc == p)
2436                         break;
2437         }
2438         if (evq == NULL) {
2439                 if (fp->f_type == DTYPE_SOCKET)
2440                         socket_unlock((struct socket *)fp->f_data, 1);
2441                 else
2442                         PIPE_UNLOCK((struct pipe *)fp->f_data);
2443
2444                 proc_fdlock(p);
2445
2446                 return(EINVAL);
2447         }
2448         EVPROCDEQUE(p, evq);
2449
2450         if (fp->f_type == DTYPE_SOCKET) {
2451                 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
2452                 socket_unlock((struct socket *)fp->f_data, 1);
2453         } else {
2454                 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
2455                 PIPE_UNLOCK((struct pipe *)fp->f_data);
2456         }
2457         FREE(evq, M_TEMP);
2458
2459         proc_fdlock(p);
2460
2461         return(0);
2462 }
2463