bsd/kern/kern_event.c

   1 /*
   2  * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  *
  28  */
  29 /*-
  30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  31  * All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  *
  42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  52  * SUCH DAMAGE.
  53  */
  54 /*
  55  *      @(#)kern_event.c       1.0 (3/31/2000)
  56  */
  57 #include <stdint.h>
  58
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/filedesc.h>
  62 #include <sys/kernel.h>
  63 #include <sys/proc_internal.h>
  64 #include <sys/kauth.h>
  65 #include <sys/malloc.h>
  66 #include <sys/unistd.h>
  67 #include <sys/file_internal.h>
  68 #include <sys/fcntl.h>
  69 #include <sys/select.h>
  70 #include <sys/queue.h>
  71 #include <sys/event.h>
  72 #include <sys/eventvar.h>
  73 #include <sys/protosw.h>
  74 #include <sys/socket.h>
  75 #include <sys/socketvar.h>
  76 #include <sys/stat.h>
  77 #include <sys/sysctl.h>
  78 #include <sys/uio.h>
  79 #include <sys/sysproto.h>
  80 #include <sys/user.h>
  81 #include <sys/vnode_internal.h>
  82 #include <string.h>
  83 #include <sys/proc_info.h>
  84
  85 #include <kern/lock.h>
  86 #include <kern/clock.h>
  87 #include <kern/thread_call.h>
  88 #include <kern/sched_prim.h>
  89 #include <kern/zalloc.h>
  90 #include <kern/assert.h>
  91
  92 #include <libkern/libkern.h>
  93 #include "net/net_str_id.h"
  94
  95 #include <mach/task.h>
  96 #include <kern/vm_pressure.h>
  97
  98 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
  99
 100 #define KQ_EVENT NULL
 101
 102 static inline void kqlock(struct kqueue *kq);
 103 static inline void kqunlock(struct kqueue *kq);
 104
 105 static int      kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
 106 static int      kqlock2knoteusewait(struct kqueue *kq, struct knote *kn);
 107 static int      kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
 108 static int      knoteuse2kqlock(struct kqueue *kq, struct knote *kn);
 109
 110 static void     kqueue_wakeup(struct kqueue *kq, int closed);
 111 static int      kqueue_read(struct fileproc *fp, struct uio *uio,
 112                     int flags, vfs_context_t ctx);
 113 static int      kqueue_write(struct fileproc *fp, struct uio *uio,
 114                     int flags, vfs_context_t ctx);
 115 static int      kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
 116                     vfs_context_t ctx);
 117 static int      kqueue_select(struct fileproc *fp, int which, void *wql,
 118                     vfs_context_t ctx);
 119 static int      kqueue_close(struct fileglob *fg, vfs_context_t ctx);
 120 static int      kqueue_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx);
 121 static int      kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
 122 extern int      kqueue_stat(struct fileproc *fp, void  *ub, int isstat64, vfs_context_t ctx);
 123
 124 static struct fileops kqueueops = {
 125         .fo_read = kqueue_read,
 126         .fo_write = kqueue_write,
 127         .fo_ioctl = kqueue_ioctl,
 128         .fo_select = kqueue_select,
 129         .fo_close = kqueue_close,
 130         .fo_kqfilter = kqueue_kqfilter,
 131         .fo_drain = kqueue_drain,
 132 };
 133
 134 static int kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
 135                 int nchanges, user_addr_t eventlist, int nevents, int fd,
 136                 user_addr_t utimeout, unsigned int flags, int32_t *retval);
 137 static int kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp, struct proc *p, int iskev64);
 138 static int kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp, struct proc *p, int iskev64);
 139 char * kevent_description(struct kevent64_s *kevp, char *s, size_t n);
 140
 141 static int      kevent_callback(struct kqueue *kq, struct kevent64_s *kevp, void *data);
 142 static void     kevent_continue(struct kqueue *kq, void *data, int error);
 143 static void     kqueue_scan_continue(void *contp, wait_result_t wait_result);
 144 static int      kqueue_process(struct kqueue *kq, kevent_callback_t callback,
 145                                void *data, int *countp, struct proc *p);
 146 static int      kqueue_begin_processing(struct kqueue *kq);
 147 static void     kqueue_end_processing(struct kqueue *kq);
 148 static int      knote_process(struct knote *kn, kevent_callback_t callback,
 149                               void *data, struct kqtailq *inprocessp, struct proc *p);
 150 static void     knote_put(struct knote *kn);
 151 static int      knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p);
 152 static void     knote_drop(struct knote *kn, struct proc *p);
 153 static void     knote_activate(struct knote *kn, int);
 154 static void     knote_deactivate(struct knote *kn);
 155 static void     knote_enqueue(struct knote *kn);
 156 static void     knote_dequeue(struct knote *kn);
 157 static struct   knote *knote_alloc(void);
 158 static void     knote_free(struct knote *kn);
 159
 160 static int      filt_fileattach(struct knote *kn);
 161 static struct filterops file_filtops = {
 162         .f_isfd = 1,
 163         .f_attach = filt_fileattach,
 164 };
 165
 166 static void     filt_kqdetach(struct knote *kn);
 167 static int      filt_kqueue(struct knote *kn, long hint);
 168 static struct filterops kqread_filtops = {
 169         .f_isfd = 1,
 170         .f_detach = filt_kqdetach,
 171         .f_event = filt_kqueue,
 172 };
 173
 174 /*
 175  * placeholder for not-yet-implemented filters
 176  */
 177 static int      filt_badattach(struct knote *kn);
 178 static struct filterops bad_filtops = {
 179         .f_attach = filt_badattach,
 180 };
 181
 182 static int      filt_procattach(struct knote *kn);
 183 static void     filt_procdetach(struct knote *kn);
 184 static int      filt_proc(struct knote *kn, long hint);
 185 static struct filterops proc_filtops = {
 186         .f_attach = filt_procattach,
 187         .f_detach = filt_procdetach,
 188         .f_event = filt_proc,
 189 };
 190
 191 static int filt_vmattach(struct knote *kn);
 192 static void filt_vmdetach(struct knote *kn);
 193 static int filt_vm(struct knote *kn, long hint);
 194 static struct filterops vm_filtops = {
 195         .f_attach = filt_vmattach,
 196         .f_detach = filt_vmdetach,
 197         .f_event = filt_vm,
 198 };
 199
 200 extern struct filterops fs_filtops;
 201
 202 extern struct filterops sig_filtops;
 203
 204 /* Timer filter */
 205 static int      filt_timerattach(struct knote *kn);
 206 static void     filt_timerdetach(struct knote *kn);
 207 static int      filt_timer(struct knote *kn, long hint);
 208 static void     filt_timertouch(struct knote *kn, struct kevent64_s *kev,
 209                 long type);
 210 static struct filterops timer_filtops = {
 211         .f_attach = filt_timerattach,
 212         .f_detach = filt_timerdetach,
 213         .f_event = filt_timer,
 214         .f_touch = filt_timertouch,
 215 };
 216
 217 /* Helpers */
 218
 219 static void     filt_timerexpire(void *knx, void *param1);
 220 static int      filt_timervalidate(struct knote *kn);
 221 static void     filt_timerupdate(struct knote *kn);
 222 static void     filt_timercancel(struct knote *kn);
 223
 224 #define TIMER_RUNNING           0x1
 225 #define TIMER_CANCELWAIT        0x2
 226
 227 static lck_mtx_t _filt_timerlock;
 228 static void     filt_timerlock(void);
 229 static void     filt_timerunlock(void);
 230
 231 static zone_t   knote_zone;
 232
 233 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
 234
 235 #if 0
 236 extern struct filterops aio_filtops;
 237 #endif
 238
 239 /* Mach portset filter */
 240 extern struct filterops machport_filtops;
 241
 242 /* User filter */
 243 static int      filt_userattach(struct knote *kn);
 244 static void     filt_userdetach(struct knote *kn);
 245 static int      filt_user(struct knote *kn, long hint);
 246 static void     filt_usertouch(struct knote *kn, struct kevent64_s *kev,
 247                 long type);
 248 static struct filterops user_filtops = {
 249         .f_attach = filt_userattach,
 250         .f_detach = filt_userdetach,
 251         .f_event = filt_user,
 252         .f_touch = filt_usertouch,
 253 };
 254
 255 /*
 256  * Table for for all system-defined filters.
 257  */
 258 static struct filterops *sysfilt_ops[] = {
 259         &file_filtops,                  /* EVFILT_READ */
 260         &file_filtops,                  /* EVFILT_WRITE */
 261 #if 0
 262         &aio_filtops,                   /* EVFILT_AIO */
 263 #else
 264         &bad_filtops,                   /* EVFILT_AIO */
 265 #endif
 266         &file_filtops,                  /* EVFILT_VNODE */
 267         &proc_filtops,                  /* EVFILT_PROC */
 268         &sig_filtops,                   /* EVFILT_SIGNAL */
 269         &timer_filtops,                 /* EVFILT_TIMER */
 270         &machport_filtops,              /* EVFILT_MACHPORT */
 271         &fs_filtops,                    /* EVFILT_FS */
 272         &user_filtops,                  /* EVFILT_USER */
 273         &bad_filtops,                   /* unused */
 274         &vm_filtops,                    /* EVFILT_VM */
 275 };
 276
 277 /*
 278  * kqueue/note lock attributes and implementations
 279  *
 280  *      kqueues have locks, while knotes have use counts
 281  *      Most of the knote state is guarded by the object lock.
 282  *      the knote "inuse" count and status use the kqueue lock.
 283  */
 284 lck_grp_attr_t * kq_lck_grp_attr;
 285 lck_grp_t * kq_lck_grp;
 286 lck_attr_t * kq_lck_attr;
 287
 288 static inline void
 289 kqlock(struct kqueue *kq)
 290 {
 291         lck_spin_lock(&kq->kq_lock);
 292 }
 293
 294 static inline void
 295 kqunlock(struct kqueue *kq)
 296 {
 297         lck_spin_unlock(&kq->kq_lock);
 298 }
 299
 300 /*
 301  * Convert a kq lock to a knote use referece.
 302  *
 303  *      If the knote is being dropped, we can't get
 304  *      a use reference, so just return with it
 305  *      still locked.
 306  *
 307  *      - kq locked at entry
 308  *      - unlock on exit if we get the use reference
 309  */
 310 static int
 311 kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
 312 {
 313         if (kn->kn_status & KN_DROPPING)
 314                 return 0;
 315         kn->kn_inuse++;
 316         kqunlock(kq);
 317         return 1;
 318  }
 319
 320 /*
 321  * Convert a kq lock to a knote use referece,
 322  * but wait for attach and drop events to complete.
 323  *
 324  *      If the knote is being dropped, we can't get
 325  *      a use reference, so just return with it
 326  *      still locked.
 327  *
 328  *      - kq locked at entry
 329  *      - kq always unlocked on exit
 330  */
 331 static int
 332 kqlock2knoteusewait(struct kqueue *kq, struct knote *kn)
 333 {
 334         if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
 335                 kn->kn_status |= KN_USEWAIT;
 336                 wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kn->kn_status, THREAD_UNINT, 0);
 337                 kqunlock(kq);
 338                 thread_block(THREAD_CONTINUE_NULL);
 339                 return 0;
 340         }
 341         kn->kn_inuse++;
 342         kqunlock(kq);
 343         return 1;
 344  }
 345
 346
 347 /*
 348  * Convert from a knote use reference back to kq lock.
 349  *
 350  *      Drop a use reference and wake any waiters if
 351  *      this is the last one.
 352  *
 353  *      The exit return indicates if the knote is
 354  *      still alive - but the kqueue lock is taken
 355  *      unconditionally.
 356  */
 357 static int
 358 knoteuse2kqlock(struct kqueue *kq, struct knote *kn)
 359 {
 360         kqlock(kq);
 361         if (--kn->kn_inuse == 0) {
 362                 if ((kn->kn_status & KN_ATTACHING) != 0) {
 363                         kn->kn_status &= ~KN_ATTACHING;
 364                 }
 365                 if ((kn->kn_status & KN_USEWAIT) != 0) {
 366                         kn->kn_status &= ~KN_USEWAIT;
 367                         wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_status, THREAD_AWAKENED);
 368                 }
 369         }
 370         return ((kn->kn_status & KN_DROPPING) == 0);
 371  }
 372
 373 /*
 374  * Convert a kq lock to a knote drop referece.
 375  *
 376  *      If the knote is in use, wait for the use count
 377  *      to subside.  We first mark our intention to drop
 378  *      it - keeping other users from "piling on."
 379  *      If we are too late, we have to wait for the
 380  *      other drop to complete.
 381  *
 382  *      - kq locked at entry
 383  *      - always unlocked on exit.
 384  *      - caller can't hold any locks that would prevent
 385  *        the other dropper from completing.
 386  */
 387 static int
 388 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
 389 {
 390         int oktodrop;
 391
 392         oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
 393         kn->kn_status |= KN_DROPPING;
 394         if (oktodrop) {
 395                 if (kn->kn_inuse == 0) {
 396                         kqunlock(kq);
 397                         return oktodrop;
 398                 }
 399         }
 400         kn->kn_status |= KN_USEWAIT;
 401         wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kn->kn_status, THREAD_UNINT, 0);
 402         kqunlock(kq);
 403         thread_block(THREAD_CONTINUE_NULL);
 404         return oktodrop;
 405 }
 406
 407 /*
 408  * Release a knote use count reference.
 409  */
 410 static void
 411 knote_put(struct knote *kn)
 412 {
 413         struct kqueue *kq = kn->kn_kq;
 414
 415         kqlock(kq);
 416         if (--kn->kn_inuse == 0) {
 417                 if ((kn->kn_status & KN_USEWAIT) != 0) {
 418                         kn->kn_status &= ~KN_USEWAIT;
 419                         wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_status, THREAD_AWAKENED);
 420                 }
 421         }
 422         kqunlock(kq);
 423  }
 424
 425 static int
 426 filt_fileattach(struct knote *kn)
 427 {
 428
 429         return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
 430 }
 431
 432 #define f_flag f_fglob->fg_flag
 433 #define f_type f_fglob->fg_type
 434 #define f_msgcount f_fglob->fg_msgcount
 435 #define f_cred f_fglob->fg_cred
 436 #define f_ops f_fglob->fg_ops
 437 #define f_offset f_fglob->fg_offset
 438 #define f_data f_fglob->fg_data
 439
 440 static void
 441 filt_kqdetach(struct knote *kn)
 442 {
 443         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 444
 445         kqlock(kq);
 446         KNOTE_DETACH(&kq->kq_sel.si_note, kn);
 447         kqunlock(kq);
 448 }
 449
 450 /*ARGSUSED*/
 451 static int
 452 filt_kqueue(struct knote *kn, __unused long hint)
 453 {
 454         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 455
 456         kn->kn_data = kq->kq_count;
 457         return (kn->kn_data > 0);
 458 }
 459
 460 static int
 461 filt_procattach(struct knote *kn)
 462 {
 463         struct proc *p;
 464         pid_t selfpid = (pid_t)0;
 465
 466         assert(PID_MAX < NOTE_PDATAMASK);
 467
 468         if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0)
 469                 return(ENOTSUP);
 470
 471         p = proc_find(kn->kn_id);
 472         if (p == NULL) {
 473                 return (ESRCH);
 474         }
 475
 476         if ((kn->kn_sfflags & NOTE_EXIT) != 0) {
 477                 selfpid = proc_selfpid();
 478                 /* check for validity of NOTE_EXISTATUS */
 479                 if (((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) &&
 480                         ((p->p_ppid != selfpid) && (((p->p_lflag & P_LTRACED) == 0) || (p->p_oppid != selfpid)))) {
 481                         proc_rele(p);
 482                         return(EACCES);
 483                 }
 484         }
 485
 486         proc_klist_lock();
 487
 488         kn->kn_flags |= EV_CLEAR;       /* automatically set */
 489         kn->kn_ptr.p_proc = p;          /* store the proc handle */
 490
 491         KNOTE_ATTACH(&p->p_klist, kn);
 492
 493         proc_klist_unlock();
 494
 495         proc_rele(p);
 496
 497         return (0);
 498 }
 499
 500 /*
 501  * The knote may be attached to a different process, which may exit,
 502  * leaving nothing for the knote to be attached to.  In that case,
 503  * the pointer to the process will have already been nulled out.
 504  */
 505 static void
 506 filt_procdetach(struct knote *kn)
 507 {
 508         struct proc *p;
 509
 510         proc_klist_lock();
 511
 512         p = kn->kn_ptr.p_proc;
 513         if (p != PROC_NULL) {
 514                 kn->kn_ptr.p_proc = PROC_NULL;
 515                 KNOTE_DETACH(&p->p_klist, kn);
 516         }
 517
 518         proc_klist_unlock();
 519 }
 520
 521 static int
 522 filt_proc(struct knote *kn, long hint)
 523 {
 524         /* hint is 0 when called from above */
 525         if (hint != 0) {
 526                 u_int event;
 527
 528                 /* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */
 529
 530                 /*
 531                  * mask off extra data
 532                  */
 533                 event = (u_int)hint & NOTE_PCTRLMASK;
 534
 535                 /*
 536                  * if the user is interested in this event, record it.
 537                  */
 538                 if (kn->kn_sfflags & event)
 539                         kn->kn_fflags |= event;
 540
 541                 if (event == NOTE_REAP || (event == NOTE_EXIT && !(kn->kn_sfflags & NOTE_REAP))) {
 542                         kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 543                 }
 544                 if ((event == NOTE_EXIT) && ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0)) {
 545                         kn->kn_fflags |= NOTE_EXITSTATUS;
 546                         kn->kn_data = (hint & NOTE_PDATAMASK);
 547                 }
 548                 if ((event == NOTE_RESOURCEEND) && ((kn->kn_sfflags & NOTE_RESOURCEEND) != 0)) {
 549                         kn->kn_fflags |= NOTE_RESOURCEEND;
 550                         kn->kn_data = (hint & NOTE_PDATAMASK);
 551                 }
 552         }
 553
 554         /* atomic check, no locking need when called from above */
 555         return (kn->kn_fflags != 0);
 556 }
 557
 558 /*
 559  * Virtual memory kevents
 560  *
 561  * author: Matt Jacobson [matthew_jacobson@apple.com]
 562  */
 563
 564 static int
 565 filt_vmattach(struct knote *kn)
 566 {
 567         /*
 568          * The note will be cleared once the information has been flushed to the client.
 569          * If there is still pressure, we will be re-alerted.
 570          */
 571         kn->kn_flags |= EV_CLEAR;
 572
 573         return vm_knote_register(kn);
 574 }
 575
 576 static void
 577 filt_vmdetach(struct knote *kn)
 578 {
 579         vm_knote_unregister(kn);
 580 }
 581
 582 static int
 583 filt_vm(struct knote *kn, long hint)
 584 {
 585         /* hint == 0 means this is just an alive? check (always true) */
 586         if (hint != 0) {
 587                 /* If this knote is interested in the event specified in hint... */
 588                 if ((kn->kn_sfflags & hint) != 0) {
 589                         kn->kn_fflags |= hint;
 590                 }
 591         }
 592
 593         return (kn->kn_fflags != 0);
 594 }
 595
 596 /*
 597  * filt_timervalidate - process data from user
 598  *
 599  *      Converts to either interval or deadline format.
 600  *
 601  *      The saved-data field in the knote contains the
 602  *      time value.  The saved filter-flags indicates
 603  *      the unit of measurement.
 604  *
 605  *      After validation, either the saved-data field
 606  *      contains the interval in absolute time, or ext[0]
 607  *      contains the expected deadline. If that deadline
 608  *      is in the past, ext[0] is 0.
 609  *
 610  *      Returns EINVAL for unrecognized units of time.
 611  *
 612  *      Timer filter lock is held.
 613  *
 614  */
 615 static int
 616 filt_timervalidate(struct knote *kn)
 617 {
 618         uint64_t multiplier;
 619         uint64_t raw;
 620
 621         switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
 622         case NOTE_SECONDS:
 623                 multiplier = NSEC_PER_SEC;
 624                 break;
 625         case NOTE_USECONDS:
 626                 multiplier = NSEC_PER_USEC;
 627                 break;
 628         case NOTE_NSECONDS:
 629                 multiplier = 1;
 630                 break;
 631         case 0: /* milliseconds (default) */
 632                 multiplier = NSEC_PER_SEC / 1000;
 633                 break;
 634         default:
 635                 return EINVAL;
 636         }
 637
 638         nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
 639
 640         kn->kn_ext[0] = 0;
 641         kn->kn_sdata = 0;
 642
 643         if (kn->kn_sfflags & NOTE_ABSOLUTE) {
 644                 clock_sec_t seconds;
 645                 clock_nsec_t nanoseconds;
 646                 uint64_t now;
 647
 648                 clock_get_calendar_nanotime(&seconds, &nanoseconds);
 649                 nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
 650                                 nanoseconds, &now);
 651
 652                 if (raw < now) {
 653                         /* time has already passed */
 654                         kn->kn_ext[0] = 0;
 655                 } else {
 656                         raw -= now;
 657                         clock_absolutetime_interval_to_deadline(raw,
 658                                         &kn->kn_ext[0]);
 659                 }
 660         } else {
 661                 kn->kn_sdata = raw;
 662         }
 663
 664         return 0;
 665 }
 666
 667 /*
 668  * filt_timerupdate - compute the next deadline
 669  *
 670  *      Repeating timers store their interval in kn_sdata. Absolute
 671  *      timers have already calculated the deadline, stored in ext[0].
 672  *
 673  *      On return, the next deadline (or zero if no deadline is needed)
 674  *      is stored in kn_ext[0].
 675  *
 676  *      Timer filter lock is held.
 677  */
 678 static void
 679 filt_timerupdate(struct knote *kn)
 680 {
 681         /* if there's no interval, deadline is just in kn_ext[0] */
 682         if (kn->kn_sdata == 0)
 683                 return;
 684
 685         /* if timer hasn't fired before, fire in interval nsecs */
 686         if (kn->kn_ext[0] == 0) {
 687                 clock_absolutetime_interval_to_deadline(kn->kn_sdata,
 688                                 &kn->kn_ext[0]);
 689         } else {
 690                 /*
 691                  * If timer has fired before, schedule the next pop
 692                  * relative to the last intended deadline.
 693                  *
 694                  * We could check for whether the deadline has expired,
 695                  * but the thread call layer can handle that.
 696                  */
 697                 kn->kn_ext[0] += kn->kn_sdata;
 698         }
 699 }
 700
 701 /*
 702  * filt_timerexpire - the timer callout routine
 703  *
 704  *      Just propagate the timer event into the knote
 705  *      filter routine (by going through the knote
 706  *      synchronization point).  Pass a hint to
 707  *      indicate this is a real event, not just a
 708  *      query from above.
 709  */
 710 static void
 711 filt_timerexpire(void *knx, __unused void *spare)
 712 {
 713         struct klist timer_list;
 714         struct knote *kn = knx;
 715
 716         filt_timerlock();
 717
 718         kn->kn_hookid &= ~TIMER_RUNNING;
 719
 720         /* no "object" for timers, so fake a list */
 721         SLIST_INIT(&timer_list);
 722         SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
 723         KNOTE(&timer_list, 1);
 724
 725         /* if someone is waiting for timer to pop */
 726         if (kn->kn_hookid & TIMER_CANCELWAIT) {
 727                 struct kqueue *kq = kn->kn_kq;
 728                 wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_hook,
 729                                 THREAD_AWAKENED);
 730         }
 731
 732         filt_timerunlock();
 733 }
 734
 735 /*
 736  * Cancel a running timer (or wait for the pop).
 737  * Timer filter lock is held.
 738  */
 739 static void
 740 filt_timercancel(struct knote *kn)
 741 {
 742         struct kqueue *kq = kn->kn_kq;
 743         thread_call_t callout = kn->kn_hook;
 744         boolean_t cancelled;
 745
 746         if (kn->kn_hookid & TIMER_RUNNING) {
 747                 /* cancel the callout if we can */
 748                 cancelled = thread_call_cancel(callout);
 749                 if (cancelled) {
 750                         kn->kn_hookid &= ~TIMER_RUNNING;
 751                 } else {
 752                         /* we have to wait for the expire routine.  */
 753                         kn->kn_hookid |= TIMER_CANCELWAIT;
 754                         wait_queue_assert_wait((wait_queue_t)kq->kq_wqs,
 755                                         &kn->kn_hook, THREAD_UNINT, 0);
 756                         filt_timerunlock();
 757                         thread_block(THREAD_CONTINUE_NULL);
 758                         filt_timerlock();
 759                         assert((kn->kn_hookid & TIMER_RUNNING) == 0);
 760                 }
 761         }
 762 }
 763
 764 /*
 765  * Allocate a thread call for the knote's lifetime, and kick off the timer.
 766  */
 767 static int
 768 filt_timerattach(struct knote *kn)
 769 {
 770         thread_call_t callout;
 771         int error;
 772
 773         callout = thread_call_allocate(filt_timerexpire, kn);
 774         if (NULL == callout)
 775                 return (ENOMEM);
 776
 777         filt_timerlock();
 778         error = filt_timervalidate(kn);
 779         if (error) {
 780                 filt_timerunlock();
 781                 return (error);
 782         }
 783
 784         kn->kn_hook = (void*)callout;
 785         kn->kn_hookid = 0;
 786
 787         /* absolute=EV_ONESHOT */
 788         if (kn->kn_sfflags & NOTE_ABSOLUTE)
 789                 kn->kn_flags |= EV_ONESHOT;
 790
 791         filt_timerupdate(kn);
 792         if (kn->kn_ext[0]) {
 793                 kn->kn_flags |= EV_CLEAR;
 794                 thread_call_enter_delayed(callout, kn->kn_ext[0]);
 795                 kn->kn_hookid |= TIMER_RUNNING;
 796         } else {
 797                 /* fake immediate */
 798                 kn->kn_data = 1;
 799         }
 800
 801         filt_timerunlock();
 802         return (0);
 803 }
 804
 805 /*
 806  * Shut down the timer if it's running, and free the callout.
 807  */
 808 static void
 809 filt_timerdetach(struct knote *kn)
 810 {
 811         thread_call_t callout;
 812
 813         filt_timerlock();
 814
 815         callout = (thread_call_t)kn->kn_hook;
 816         filt_timercancel(kn);
 817
 818         filt_timerunlock();
 819
 820         thread_call_free(callout);
 821 }
 822
 823
 824
 825 static int
 826 filt_timer(struct knote *kn, long hint)
 827 {
 828         int result;
 829
 830         if (hint) {
 831                 /* real timer pop -- timer lock held by filt_timerexpire */
 832
 833                 kn->kn_data++;
 834
 835                 if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
 836                                 ((kn->kn_flags & EV_ONESHOT) == 0)) {
 837
 838                         /* evaluate next time to fire */
 839                         filt_timerupdate(kn);
 840
 841                         if (kn->kn_ext[0]) {
 842                                 /* keep the callout and re-arm */
 843                                 thread_call_enter_delayed(kn->kn_hook,
 844                                                 kn->kn_ext[0]);
 845                                 kn->kn_hookid |= TIMER_RUNNING;
 846                         }
 847                 }
 848
 849                 return 1;
 850         }
 851
 852         /* user-query */
 853         filt_timerlock();
 854
 855         result = (kn->kn_data != 0);
 856
 857         filt_timerunlock();
 858         return result;
 859 }
 860
 861
 862 /*
 863  * filt_timertouch - update knote with new user input
 864  *
 865  *      Cancel and restart the timer based on new user data. When
 866  *      the user picks up a knote, clear the count of how many timer
 867  *      pops have gone off (in kn_data).
 868  */
 869 static void
 870 filt_timertouch(struct knote *kn, struct kevent64_s *kev, long type)
 871 {
 872         int error;
 873         filt_timerlock();
 874
 875         switch (type) {
 876         case EVENT_REGISTER:
 877                 /* cancel current call */
 878                 filt_timercancel(kn);
 879
 880                 /* recalculate deadline */
 881                 kn->kn_sdata = kev->data;
 882                 kn->kn_sfflags = kev->fflags;
 883
 884                 error = filt_timervalidate(kn);
 885                 if (error) {
 886                         /* no way to report error, so mark it in the knote */
 887                         kn->kn_flags |= EV_ERROR;
 888                         kn->kn_data = error;
 889                         break;
 890                 }
 891
 892                 /* start timer if necessary */
 893                 filt_timerupdate(kn);
 894                 if (kn->kn_ext[0]) {
 895                         thread_call_enter_delayed(kn->kn_hook, kn->kn_ext[0]);
 896                         kn->kn_hookid |= TIMER_RUNNING;
 897                 } else {
 898                         /* pretend the timer has fired */
 899                         kn->kn_data = 1;
 900                 }
 901
 902                 break;
 903
 904         case EVENT_PROCESS:
 905                 /* reset the timer pop count in kn_data */
 906                 *kev = kn->kn_kevent;
 907                 kev->ext[0] = 0;
 908                 kn->kn_data = 0;
 909                 if (kn->kn_flags & EV_CLEAR)
 910                         kn->kn_fflags = 0;
 911                 break;
 912         default:
 913                 panic("filt_timertouch() - invalid type (%ld)", type);
 914                 break;
 915         }
 916
 917         filt_timerunlock();
 918 }
 919
 920 static void
 921 filt_timerlock(void)
 922 {
 923         lck_mtx_lock(&_filt_timerlock);
 924 }
 925
 926 static void
 927 filt_timerunlock(void)
 928 {
 929         lck_mtx_unlock(&_filt_timerlock);
 930 }
 931
 932 static int
 933 filt_userattach(struct knote *kn)
 934 {
 935         /* EVFILT_USER knotes are not attached to anything in the kernel */
 936         kn->kn_hook = NULL;
 937         if (kn->kn_fflags & NOTE_TRIGGER) {
 938                 kn->kn_hookid = 1;
 939         } else {
 940                 kn->kn_hookid = 0;
 941         }
 942         return 0;
 943 }
 944
 945 static void
 946 filt_userdetach(__unused struct knote *kn)
 947 {
 948         /* EVFILT_USER knotes are not attached to anything in the kernel */
 949 }
 950
 951 static int
 952 filt_user(struct knote *kn, __unused long hint)
 953 {
 954         return kn->kn_hookid;
 955 }
 956
 957 static void
 958 filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type)
 959 {
 960         uint32_t ffctrl;
 961         switch (type) {
 962         case EVENT_REGISTER:
 963                 if (kev->fflags & NOTE_TRIGGER) {
 964                         kn->kn_hookid = 1;
 965                 }
 966
 967                 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
 968                 kev->fflags &= NOTE_FFLAGSMASK;
 969                 switch (ffctrl) {
 970                 case NOTE_FFNOP:
 971                         break;
 972                 case NOTE_FFAND:
 973                         OSBitAndAtomic(kev->fflags, &kn->kn_sfflags);
 974                         break;
 975                 case NOTE_FFOR:
 976                         OSBitOrAtomic(kev->fflags, &kn->kn_sfflags);
 977                         break;
 978                 case NOTE_FFCOPY:
 979                         kn->kn_sfflags = kev->fflags;
 980                         break;
 981                 }
 982                 kn->kn_sdata = kev->data;
 983                 break;
 984         case EVENT_PROCESS:
 985                 *kev = kn->kn_kevent;
 986                 kev->fflags = (volatile UInt32)kn->kn_sfflags;
 987                 kev->data = kn->kn_sdata;
 988                 if (kn->kn_flags & EV_CLEAR) {
 989                         kn->kn_hookid = 0;
 990                         kn->kn_data = 0;
 991                         kn->kn_fflags = 0;
 992                 }
 993                 break;
 994         default:
 995                 panic("filt_usertouch() - invalid type (%ld)", type);
 996                 break;
 997         }
 998 }
 999
1000 /*
1001  * JMM - placeholder for not-yet-implemented filters
1002  */
1003 static int
1004 filt_badattach(__unused struct knote *kn)
1005 {
1006         return(ENOTSUP);
1007 }
1008
1009
1010 struct kqueue *
1011 kqueue_alloc(struct proc *p)
1012 {
1013         struct filedesc *fdp = p->p_fd;
1014         struct kqueue *kq;
1015
1016         MALLOC_ZONE(kq, struct kqueue *, sizeof(struct kqueue), M_KQUEUE, M_WAITOK);
1017         if (kq != NULL) {
1018                 wait_queue_set_t wqs;
1019
1020                 wqs = wait_queue_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST);
1021                 if (wqs != NULL) {
1022                         bzero(kq, sizeof(struct kqueue));
1023                         lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1024                         TAILQ_INIT(&kq->kq_head);
1025                         kq->kq_wqs = wqs;
1026                         kq->kq_p = p;
1027                 } else {
1028                         FREE_ZONE(kq, sizeof(struct kqueue), M_KQUEUE);
1029                 }
1030         }
1031
1032         if (fdp->fd_knlistsize < 0) {
1033                 proc_fdlock(p);
1034                 if (fdp->fd_knlistsize < 0)
1035                         fdp->fd_knlistsize = 0;         /* this process has had a kq */
1036                 proc_fdunlock(p);
1037         }
1038
1039         return kq;
1040 }
1041
1042
1043 /*
1044  * kqueue_dealloc - detach all knotes from a kqueue and free it
1045  *
1046  *      We walk each list looking for knotes referencing this
1047  *      this kqueue.  If we find one, we try to drop it.  But
1048  *      if we fail to get a drop reference, that will wait
1049  *      until it is dropped.  So, we can just restart again
1050  *      safe in the assumption that the list will eventually
1051  *      not contain any more references to this kqueue (either
1052  *      we dropped them all, or someone else did).
1053  *
1054  *      Assumes no new events are being added to the kqueue.
1055  *      Nothing locked on entry or exit.
1056  */
1057 void
1058 kqueue_dealloc(struct kqueue *kq)
1059 {
1060         struct proc *p = kq->kq_p;
1061         struct filedesc *fdp = p->p_fd;
1062         struct knote *kn;
1063         int i;
1064
1065         proc_fdlock(p);
1066         for (i = 0; i < fdp->fd_knlistsize; i++) {
1067                 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1068                 while (kn != NULL) {
1069                         if (kq == kn->kn_kq) {
1070                                 kqlock(kq);
1071                                 proc_fdunlock(p);
1072                                 /* drop it ourselves or wait */
1073                                 if (kqlock2knotedrop(kq, kn)) {
1074                                         kn->kn_fop->f_detach(kn);
1075                                         knote_drop(kn, p);
1076                                 }
1077                                 proc_fdlock(p);
1078                                 /* start over at beginning of list */
1079                                 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1080                                 continue;
1081                         }
1082                         kn = SLIST_NEXT(kn, kn_link);
1083                 }
1084         }
1085         if (fdp->fd_knhashmask != 0) {
1086                 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1087                         kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1088                         while (kn != NULL) {
1089                                 if (kq == kn->kn_kq) {
1090                                         kqlock(kq);
1091                                         proc_fdunlock(p);
1092                                         /* drop it ourselves or wait */
1093                                         if (kqlock2knotedrop(kq, kn)) {
1094                                                 kn->kn_fop->f_detach(kn);
1095                                                 knote_drop(kn, p);
1096                                         }
1097                                         proc_fdlock(p);
1098                                         /* start over at beginning of list */
1099                                         kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1100                                         continue;
1101                                 }
1102                                 kn = SLIST_NEXT(kn, kn_link);
1103                         }
1104                 }
1105         }
1106         proc_fdunlock(p);
1107
1108         /*
1109          * before freeing the wait queue set for this kqueue,
1110          * make sure it is unlinked from all its containing (select) sets.
1111          */
1112         wait_queue_unlink_all((wait_queue_t)kq->kq_wqs);
1113         wait_queue_set_free(kq->kq_wqs);
1114         lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1115         FREE_ZONE(kq, sizeof(struct kqueue), M_KQUEUE);
1116 }
1117
1118 int
1119 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1120 {
1121         struct kqueue *kq;
1122         struct fileproc *fp;
1123         int fd, error;
1124
1125         error = falloc(p, &fp, &fd, vfs_context_current());
1126         if (error) {
1127                 return (error);
1128         }
1129
1130         kq = kqueue_alloc(p);
1131         if (kq == NULL) {
1132                 fp_free(p, fd, fp);
1133                 return (ENOMEM);
1134         }
1135
1136         fp->f_flag = FREAD | FWRITE;
1137         fp->f_type = DTYPE_KQUEUE;
1138         fp->f_ops = &kqueueops;
1139         fp->f_data = (caddr_t)kq;
1140
1141         proc_fdlock(p);
1142         procfdtbl_releasefd(p, fd, NULL);
1143         fp_drop(p, fd, fp, 1);
1144         proc_fdunlock(p);
1145
1146         *retval = fd;
1147         return (error);
1148 }
1149
1150 static int
1151 kevent_copyin(user_addr_t *addrp, struct kevent64_s *kevp, struct proc *p, int iskev64)
1152 {
1153         int advance;
1154         int error;
1155
1156         if (iskev64) {
1157                 advance = sizeof(struct kevent64_s);
1158                 error = copyin(*addrp, (caddr_t)kevp, advance);
1159         } else if (IS_64BIT_PROCESS(p)) {
1160                 struct user64_kevent kev64;
1161                 bzero(kevp, sizeof(struct kevent64_s));
1162
1163                 advance = sizeof(kev64);
1164                 error = copyin(*addrp, (caddr_t)&kev64, advance);
1165                 if (error)
1166                         return error;
1167                 kevp->ident = kev64.ident;
1168                 kevp->filter = kev64.filter;
1169                 kevp->flags = kev64.flags;
1170                 kevp->fflags = kev64.fflags;
1171                 kevp->data = kev64.data;
1172                 kevp->udata = kev64.udata;
1173         } else {
1174                 struct user32_kevent kev32;
1175                 bzero(kevp, sizeof(struct kevent64_s));
1176
1177                 advance = sizeof(kev32);
1178                 error = copyin(*addrp, (caddr_t)&kev32, advance);
1179                 if (error)
1180                         return error;
1181                 kevp->ident = (uintptr_t)kev32.ident;
1182                 kevp->filter = kev32.filter;
1183                 kevp->flags = kev32.flags;
1184                 kevp->fflags = kev32.fflags;
1185                 kevp->data = (intptr_t)kev32.data;
1186                 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1187         }
1188         if (!error)
1189                 *addrp += advance;
1190         return error;
1191 }
1192
1193 static int
1194 kevent_copyout(struct kevent64_s *kevp, user_addr_t *addrp, struct proc *p, int iskev64)
1195 {
1196         int advance;
1197         int error;
1198
1199         if (iskev64) {
1200                 advance = sizeof(struct kevent64_s);
1201                 error = copyout((caddr_t)kevp, *addrp, advance);
1202         } else if (IS_64BIT_PROCESS(p)) {
1203                 struct user64_kevent kev64;
1204
1205                 /*
1206                  * deal with the special case of a user-supplied
1207                  * value of (uintptr_t)-1.
1208                  */
1209                 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1210                            (uint64_t)-1LL : (uint64_t)kevp->ident;
1211
1212                 kev64.filter = kevp->filter;
1213                 kev64.flags = kevp->flags;
1214                 kev64.fflags = kevp->fflags;
1215                 kev64.data = (int64_t) kevp->data;
1216                 kev64.udata = kevp->udata;
1217                 advance = sizeof(kev64);
1218                 error = copyout((caddr_t)&kev64, *addrp, advance);
1219         } else {
1220                 struct user32_kevent kev32;
1221
1222                 kev32.ident = (uint32_t)kevp->ident;
1223                 kev32.filter = kevp->filter;
1224                 kev32.flags = kevp->flags;
1225                 kev32.fflags = kevp->fflags;
1226                 kev32.data = (int32_t)kevp->data;
1227                 kev32.udata = kevp->udata;
1228                 advance = sizeof(kev32);
1229                 error = copyout((caddr_t)&kev32, *addrp, advance);
1230         }
1231         if (!error)
1232                 *addrp += advance;
1233         return error;
1234 }
1235
1236 /*
1237  * kevent_continue - continue a kevent syscall after blocking
1238  *
1239  *      assume we inherit a use count on the kq fileglob.
1240  */
1241
1242 static void
1243 kevent_continue(__unused struct kqueue *kq, void *data, int error)
1244 {
1245         struct _kevent *cont_args;
1246         struct fileproc *fp;
1247         int32_t *retval;
1248         int noutputs;
1249         int fd;
1250         struct proc *p = current_proc();
1251
1252         cont_args = (struct _kevent *)data;
1253         noutputs = cont_args->eventout;
1254         retval = cont_args->retval;
1255         fd = cont_args->fd;
1256         fp = cont_args->fp;
1257
1258         fp_drop(p, fd, fp, 0);
1259
1260         /* don't restart after signals... */
1261         if (error == ERESTART)
1262                 error = EINTR;
1263         else if (error == EWOULDBLOCK)
1264                 error = 0;
1265         if (error == 0)
1266                 *retval = noutputs;
1267         unix_syscall_return(error);
1268 }
1269
1270 /*
1271  * kevent - [syscall] register and wait for kernel events
1272  *
1273  */
1274 int
1275 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
1276 {
1277         return kevent_internal(p,
1278                         0,
1279                         uap->changelist,
1280                         uap->nchanges,
1281                         uap->eventlist,
1282                         uap->nevents,
1283                         uap->fd,
1284                         uap->timeout,
1285                         0, /* no flags from old kevent() call */
1286                         retval);
1287 }
1288
1289 int
1290 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
1291 {
1292         return kevent_internal(p,
1293                         1,
1294                         uap->changelist,
1295                         uap->nchanges,
1296                         uap->eventlist,
1297                         uap->nevents,
1298                         uap->fd,
1299                         uap->timeout,
1300                         uap->flags,
1301                         retval);
1302 }
1303
1304 static int
1305 kevent_internal(struct proc *p, int iskev64, user_addr_t changelist,
1306                 int nchanges, user_addr_t ueventlist, int nevents, int fd,
1307                 user_addr_t utimeout, __unused unsigned int flags,
1308                 int32_t *retval)
1309 {
1310         struct _kevent *cont_args;
1311         uthread_t ut;
1312         struct kqueue *kq;
1313         struct fileproc *fp;
1314         struct kevent64_s kev;
1315         int error, noutputs;
1316         struct timeval atv;
1317
1318         /* convert timeout to absolute - if we have one */
1319         if (utimeout != USER_ADDR_NULL) {
1320                 struct timeval rtv;
1321                 if (IS_64BIT_PROCESS(p)) {
1322                         struct user64_timespec ts;
1323                         error = copyin(utimeout, &ts, sizeof(ts));
1324                         if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
1325                                 error = EINVAL;
1326                         else
1327                                 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1328                 } else {
1329                         struct user32_timespec ts;
1330                         error = copyin(utimeout, &ts, sizeof(ts));
1331                         TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1332                 }
1333                 if (error)
1334                         return error;
1335                 if (itimerfix(&rtv))
1336                         return EINVAL;
1337                 getmicrouptime(&atv);
1338                 timevaladd(&atv, &rtv);
1339         } else {
1340                 atv.tv_sec = 0;
1341                 atv.tv_usec = 0;
1342         }
1343
1344         /* get a usecount for the kq itself */
1345         if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
1346                 return(error);
1347
1348         /* each kq should only be used for events of one type */
1349         kqlock(kq);
1350         if (kq->kq_state & (KQ_KEV32 | KQ_KEV64)) {
1351                 if (((iskev64 && (kq->kq_state & KQ_KEV32)) ||
1352                         (!iskev64 && (kq->kq_state & KQ_KEV64)))) {
1353                         error = EINVAL;
1354                         kqunlock(kq);
1355                         goto errorout;
1356                 }
1357         } else {
1358                 kq->kq_state |= (iskev64 ? KQ_KEV64 : KQ_KEV32);
1359         }
1360         kqunlock(kq);
1361
1362         /* register all the change requests the user provided... */
1363         noutputs = 0;
1364         while (nchanges > 0 && error == 0) {
1365                 error = kevent_copyin(&changelist, &kev, p, iskev64);
1366                 if (error)
1367                         break;
1368
1369                 kev.flags &= ~EV_SYSFLAGS;
1370                 error = kevent_register(kq, &kev, p);
1371                 if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) {
1372                         kev.flags = EV_ERROR;
1373                         kev.data = error;
1374                         error = kevent_copyout(&kev, &ueventlist, p, iskev64);
1375                         if (error == 0) {
1376                                 nevents--;
1377                                 noutputs++;
1378                         }
1379                 }
1380                 nchanges--;
1381         }
1382
1383         /* store the continuation/completion data in the uthread */
1384         ut = (uthread_t)get_bsdthread_info(current_thread());
1385         cont_args = &ut->uu_kevent.ss_kevent;
1386         cont_args->fp = fp;
1387         cont_args->fd = fd;
1388         cont_args->retval = retval;
1389         cont_args->eventlist = ueventlist;
1390         cont_args->eventcount = nevents;
1391         cont_args->eventout = noutputs;
1392         cont_args->eventsize = iskev64;
1393
1394         if (nevents > 0 && noutputs == 0 && error == 0)
1395                 error = kqueue_scan(kq, kevent_callback,
1396                                     kevent_continue, cont_args,
1397                                     &atv, p);
1398         kevent_continue(kq, cont_args, error);
1399
1400 errorout:
1401         fp_drop(p, fd, fp, 0);
1402         return error;
1403 }
1404
1405
1406 /*
1407  * kevent_callback - callback for each individual event
1408  *
1409  *      called with nothing locked
1410  *      caller holds a reference on the kqueue
1411  */
1412
1413 static int
1414 kevent_callback(__unused struct kqueue *kq, struct kevent64_s *kevp,
1415                 void *data)
1416 {
1417         struct _kevent *cont_args;
1418         int error;
1419         int iskev64;
1420
1421         cont_args = (struct _kevent *)data;
1422         assert(cont_args->eventout < cont_args->eventcount);
1423
1424         iskev64 = cont_args->eventsize;
1425
1426         /*
1427          * Copy out the appropriate amount of event data for this user.
1428          */
1429         error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(), iskev64);
1430
1431         /*
1432          * If there isn't space for additional events, return
1433          * a harmless error to stop the processing here
1434          */
1435         if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
1436                         error = EWOULDBLOCK;
1437         return error;
1438 }
1439
1440 /*
1441  * kevent_description - format a description of a kevent for diagnostic output
1442  *
1443  *      called with a 128-byte string buffer
1444  */
1445
1446 char *
1447 kevent_description(struct kevent64_s *kevp, char *s, size_t n)
1448 {
1449         snprintf(s, n,
1450                  "kevent="
1451                  "{.ident=%#llx, .filter=%d, .flags=%#x, .fflags=%#x, .data=%#llx, .udata=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
1452                  kevp->ident,
1453                  kevp->filter,
1454                  kevp->flags,
1455                  kevp->fflags,
1456                  kevp->data,
1457                  kevp->udata,
1458                  kevp->ext[0],
1459                  kevp->ext[1]);
1460         return s;
1461 }
1462
1463 /*
1464  * kevent_register - add a new event to a kqueue
1465  *
1466  *      Creates a mapping between the event source and
1467  *      the kqueue via a knote data structure.
1468  *
1469  *      Because many/most the event sources are file
1470  *      descriptor related, the knote is linked off
1471  *      the filedescriptor table for quick access.
1472  *
1473  *      called with nothing locked
1474  *      caller holds a reference on the kqueue
1475  */
1476
1477 int
1478 kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc *ctxp)
1479 {
1480         struct proc *p = kq->kq_p;
1481         struct filedesc *fdp = p->p_fd;
1482         struct filterops *fops;
1483         struct fileproc *fp = NULL;
1484         struct knote *kn = NULL;
1485         int error = 0;
1486
1487         if (kev->filter < 0) {
1488                 if (kev->filter + EVFILT_SYSCOUNT < 0)
1489                         return (EINVAL);
1490                 fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
1491         } else {
1492                 /*
1493                  * XXX
1494                  * filter attach routine is responsible for insuring that
1495                  * the identifier can be attached to it.
1496                  */
1497                 printf("unknown filter: %d\n", kev->filter);
1498                 return (EINVAL);
1499         }
1500
1501  restart:
1502         /* this iocount needs to be dropped if it is not registered */
1503         proc_fdlock(p);
1504         if (fops->f_isfd && (error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
1505                 proc_fdunlock(p);
1506                 return(error);
1507         }
1508
1509         if (fops->f_isfd) {
1510                 /* fd-based knotes are linked off the fd table */
1511                 if (kev->ident < (u_int)fdp->fd_knlistsize) {
1512                         SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link)
1513                                 if (kq == kn->kn_kq &&
1514                                     kev->filter == kn->kn_filter)
1515                                         break;
1516                 }
1517         } else {
1518                 /* hash non-fd knotes here too */
1519                 if (fdp->fd_knhashmask != 0) {
1520                         struct klist *list;
1521
1522                         list = &fdp->fd_knhash[
1523                             KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1524                         SLIST_FOREACH(kn, list, kn_link)
1525                                 if (kev->ident == kn->kn_id &&
1526                                     kq == kn->kn_kq &&
1527                                     kev->filter == kn->kn_filter)
1528                                         break;
1529                 }
1530         }
1531
1532         /*
1533          * kn now contains the matching knote, or NULL if no match
1534          */
1535         if (kn == NULL) {
1536                 if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) {
1537                         kn = knote_alloc();
1538                         if (kn == NULL) {
1539                                 proc_fdunlock(p);
1540                                 error = ENOMEM;
1541                                 goto done;
1542                         }
1543                         kn->kn_fp = fp;
1544                         kn->kn_kq = kq;
1545                         kn->kn_tq = &kq->kq_head;
1546                         kn->kn_fop = fops;
1547                         kn->kn_sfflags = kev->fflags;
1548                         kn->kn_sdata = kev->data;
1549                         kev->fflags = 0;
1550                         kev->data = 0;
1551                         kn->kn_kevent = *kev;
1552                         kn->kn_inuse = 1;  /* for f_attach() */
1553                         kn->kn_status = KN_ATTACHING;
1554
1555                         /* before anyone can find it */
1556                         if (kev->flags & EV_DISABLE)
1557                                 kn->kn_status |= KN_DISABLED;
1558
1559                         error = knote_fdpattach(kn, fdp, p);
1560                         proc_fdunlock(p);
1561
1562                         if (error) {
1563                                 knote_free(kn);
1564                                 goto done;
1565                         }
1566
1567                         /*
1568                          * apply reference count to knote structure, and
1569                          * do not release it at the end of this routine.
1570                          */
1571                         fp = NULL;
1572
1573                         error = fops->f_attach(kn);
1574
1575                         kqlock(kq);
1576
1577                         if (error != 0) {
1578                                 /*
1579                                  * Failed to attach correctly, so drop.
1580                                  * All other possible users/droppers
1581                                  * have deferred to us.
1582                                  */
1583                                 kn->kn_status |= KN_DROPPING;
1584                                 kqunlock(kq);
1585                                 knote_drop(kn, p);
1586                                 goto done;
1587                         } else if (kn->kn_status & KN_DROPPING) {
1588                                 /*
1589                                  * Attach succeeded, but someone else
1590                                  * deferred their drop - now we have
1591                                  * to do it for them (after detaching).
1592                                  */
1593                                 kqunlock(kq);
1594                                 kn->kn_fop->f_detach(kn);
1595                                 knote_drop(kn, p);
1596                                 goto done;
1597                         }
1598                         kn->kn_status &= ~KN_ATTACHING;
1599                         kqunlock(kq);
1600                 } else {
1601                         proc_fdunlock(p);
1602                         error = ENOENT;
1603                         goto done;
1604                 }
1605         } else {
1606                 /* existing knote - get kqueue lock */
1607                 kqlock(kq);
1608                 proc_fdunlock(p);
1609
1610                 if (kev->flags & EV_DELETE) {
1611                         knote_dequeue(kn);
1612                         kn->kn_status |= KN_DISABLED;
1613                         if (kqlock2knotedrop(kq, kn)) {
1614                                 kn->kn_fop->f_detach(kn);
1615                                 knote_drop(kn, p);
1616                         }
1617                         goto done;
1618                 }
1619
1620                 /* update status flags for existing knote */
1621                 if (kev->flags & EV_DISABLE) {
1622                         knote_dequeue(kn);
1623                         kn->kn_status |= KN_DISABLED;
1624                 } else if (kev->flags & EV_ENABLE) {
1625                         kn->kn_status &= ~KN_DISABLED;
1626                         if (kn->kn_status & KN_ACTIVE)
1627                                 knote_enqueue(kn);
1628                 }
1629
1630                 /*
1631                  * The user may change some filter values after the
1632                  * initial EV_ADD, but doing so will not reset any
1633                  * filter which have already been triggered.
1634                  */
1635                 kn->kn_kevent.udata = kev->udata;
1636                 if (fops->f_isfd || fops->f_touch == NULL) {
1637                         kn->kn_sfflags = kev->fflags;
1638                         kn->kn_sdata = kev->data;
1639                 }
1640
1641                 /*
1642                  * If somebody is in the middle of dropping this
1643                  * knote - go find/insert a new one.  But we have
1644                  * wait for this one to go away first. Attaches
1645                  * running in parallel may also drop/modify the
1646                  * knote.  Wait for those to complete as well and
1647                  * then start over if we encounter one.
1648                  */
1649                 if (!kqlock2knoteusewait(kq, kn)) {
1650                         /* kqueue, proc_fdlock both unlocked */
1651                         goto restart;
1652                 }
1653
1654                 /*
1655                  * Call touch routine to notify filter of changes
1656                  * in filter values.
1657                  */
1658                 if (!fops->f_isfd && fops->f_touch != NULL)
1659                         fops->f_touch(kn, kev, EVENT_REGISTER);
1660         }
1661         /* still have use ref on knote */
1662
1663         /*
1664          * If the knote is not marked to always stay enqueued,
1665          * invoke the filter routine to see if it should be
1666          * enqueued now.
1667          */
1668         if ((kn->kn_status & KN_STAYQUEUED) == 0 && kn->kn_fop->f_event(kn, 0)) {
1669                 if (knoteuse2kqlock(kq, kn))
1670                         knote_activate(kn, 1);
1671                 kqunlock(kq);
1672         } else {
1673                 knote_put(kn);
1674         }
1675
1676 done:
1677         if (fp != NULL)
1678                 fp_drop(p, kev->ident, fp, 0);
1679         return (error);
1680 }
1681
1682
1683 /*
1684  * knote_process - process a triggered event
1685  *
1686  *      Validate that it is really still a triggered event
1687  *      by calling the filter routines (if necessary).  Hold
1688  *      a use reference on the knote to avoid it being detached.
1689  *      If it is still considered triggered, invoke the callback
1690  *      routine provided and move it to the provided inprocess
1691  *      queue.
1692  *
1693  *      caller holds a reference on the kqueue.
1694  *      kqueue locked on entry and exit - but may be dropped
1695  */
1696 static int
1697 knote_process(struct knote      *kn,
1698               kevent_callback_t callback,
1699               void              *data,
1700               struct kqtailq    *inprocessp,
1701               struct proc       *p)
1702 {
1703         struct kqueue *kq = kn->kn_kq;
1704         struct kevent64_s kev;
1705         int touch;
1706         int result;
1707         int error;
1708
1709         /*
1710          * Determine the kevent state we want to return.
1711          *
1712          * Some event states need to be revalidated before returning
1713          * them, others we take the snapshot at the time the event
1714          * was enqueued.
1715          *
1716          * Events with non-NULL f_touch operations must be touched.
1717          * Triggered events must fill in kev for the callback.
1718          *
1719          * Convert our lock to a use-count and call the event's
1720          * filter routine(s) to update.
1721          */
1722         if ((kn->kn_status & KN_DISABLED) != 0) {
1723                 result = 0;
1724                 touch = 0;
1725         } else {
1726                 int revalidate;
1727
1728                 result = 1;
1729                 revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 ||
1730                               (kn->kn_flags & EV_ONESHOT) == 0);
1731                 touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);
1732
1733                 if (revalidate || touch) {
1734                         if (revalidate)
1735                                 knote_deactivate(kn);
1736
1737                         /* call the filter/touch routines with just a ref */
1738                         if (kqlock2knoteuse(kq, kn)) {
1739
1740                                 /* if we have to revalidate, call the filter */
1741                                 if (revalidate) {
1742                                         result = kn->kn_fop->f_event(kn, 0);
1743                                 }
1744
1745                                 /* capture the kevent data - using touch if specified */
1746                                 if (result && touch) {
1747                                         kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS);
1748                                 }
1749
1750                                 /* convert back to a kqlock - bail if the knote went away */
1751                                 if (!knoteuse2kqlock(kq, kn)) {
1752                                         return EJUSTRETURN;
1753                                 } else if (result) {
1754                                         /* if revalidated as alive, make sure it's active */
1755                                         if (!(kn->kn_status & KN_ACTIVE)) {
1756                                                 knote_activate(kn, 0);
1757                                         }
1758
1759                                         /* capture all events that occurred during filter */
1760                                         if (!touch) {
1761                                                 kev = kn->kn_kevent;
1762                                         }
1763
1764                                 } else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
1765                                         /* was already dequeued, so just bail on this one */
1766                                         return EJUSTRETURN;
1767                                 }
1768                         } else {
1769                                 return EJUSTRETURN;
1770                         }
1771                 } else {
1772                         kev = kn->kn_kevent;
1773                 }
1774         }
1775
1776         /* move knote onto inprocess queue */
1777         assert(kn->kn_tq == &kq->kq_head);
1778         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1779         kn->kn_tq = inprocessp;
1780         TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe);
1781
1782         /*
1783          * Determine how to dispatch the knote for future event handling.
1784          * not-fired: just return (do not callout).
1785          * One-shot: deactivate it.
1786          * Clear: deactivate and clear the state.
1787          * Dispatch: don't clear state, just deactivate it and mark it disabled.
1788          * All others: just leave where they are.
1789          */
1790
1791         if (result == 0) {
1792                 return EJUSTRETURN;
1793         } else if ((kn->kn_flags & EV_ONESHOT) != 0) {
1794                 knote_deactivate(kn);
1795                 if (kqlock2knotedrop(kq, kn)) {
1796                         kn->kn_fop->f_detach(kn);
1797                         knote_drop(kn, p);
1798                 }
1799         } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
1800                 if ((kn->kn_flags & EV_DISPATCH) != 0) {
1801                         /* deactivate and disable all dispatch knotes */
1802                         knote_deactivate(kn);
1803                         kn->kn_status |= KN_DISABLED;
1804                 } else if (!touch || kn->kn_fflags == 0) {
1805                         /* only deactivate if nothing since the touch */
1806                         knote_deactivate(kn);
1807                 }
1808                 if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
1809                         /* manually clear non-touch knotes */
1810                         kn->kn_data = 0;
1811                         kn->kn_fflags = 0;
1812                 }
1813                 kqunlock(kq);
1814         } else {
1815                 /*
1816                  * leave on inprocess queue.  We'll
1817                  * move all the remaining ones back
1818                  * the kq queue and wakeup any
1819                  * waiters when we are done.
1820                  */
1821                 kqunlock(kq);
1822         }
1823
1824         /* callback to handle each event as we find it */
1825         error = (callback)(kq, &kev, data);
1826
1827         kqlock(kq);
1828         return error;
1829 }
1830
1831 /*
1832  * Return 0 to indicate that processing should proceed,
1833  * -1 if there is nothing to process.
1834  *
1835  * Called with kqueue locked and returns the same way,
1836  * but may drop lock temporarily.
1837  */
1838 static int
1839 kqueue_begin_processing(struct kqueue *kq)
1840 {
1841         for (;;) {
1842                 if (kq->kq_count == 0) {
1843                         return -1;
1844                 }
1845
1846                 /* if someone else is processing the queue, wait */
1847                 if (kq->kq_nprocess != 0) {
1848                         wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0);
1849                         kq->kq_state |= KQ_PROCWAIT;
1850                         kqunlock(kq);
1851                         thread_block(THREAD_CONTINUE_NULL);
1852                         kqlock(kq);
1853                 } else {
1854                         kq->kq_nprocess = 1;
1855                         return 0;
1856                 }
1857         }
1858 }
1859
1860 /*
1861  * Called with kqueue lock held.
1862  */
1863 static void
1864 kqueue_end_processing(struct kqueue *kq)
1865 {
1866         kq->kq_nprocess = 0;
1867         if (kq->kq_state & KQ_PROCWAIT) {
1868                 kq->kq_state &= ~KQ_PROCWAIT;
1869                 wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED);
1870         }
1871 }
1872
1873 /*
1874  * kqueue_process - process the triggered events in a kqueue
1875  *
1876  *      Walk the queued knotes and validate that they are
1877  *      really still triggered events by calling the filter
1878  *      routines (if necessary).  Hold a use reference on
1879  *      the knote to avoid it being detached. For each event
1880  *      that is still considered triggered, invoke the
1881  *      callback routine provided.
1882  *
1883  *      caller holds a reference on the kqueue.
1884  *      kqueue locked on entry and exit - but may be dropped
1885  *      kqueue list locked (held for duration of call)
1886  */
1887
1888 static int
1889 kqueue_process(struct kqueue *kq,
1890                kevent_callback_t callback,
1891                void *data,
1892                int *countp,
1893                struct proc *p)
1894 {
1895         struct kqtailq inprocess;
1896         struct knote *kn;
1897         int nevents;
1898         int error;
1899
1900         TAILQ_INIT(&inprocess);
1901
1902         if (kqueue_begin_processing(kq) == -1) {
1903                 *countp = 0;
1904                 /* Nothing to process */
1905                 return 0;
1906         }
1907
1908         /*
1909          * Clear any pre-posted status from previous runs, so we only
1910          * detect events that occur during this run.
1911          */
1912         wait_queue_sub_clearrefs(kq->kq_wqs);
1913
1914         /*
1915          * loop through the enqueued knotes, processing each one and
1916          * revalidating those that need it. As they are processed,
1917          * they get moved to the inprocess queue (so the loop can end).
1918          */
1919         error = 0;
1920         nevents = 0;
1921
1922         while (error == 0 &&
1923                (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
1924                 error = knote_process(kn, callback, data, &inprocess, p);
1925                 if (error == EJUSTRETURN)
1926                         error = 0;
1927                 else
1928                         nevents++;
1929         }
1930
1931         /*
1932          * With the kqueue still locked, move any knotes
1933          * remaining on the inprocess queue back to the
1934          * kq's queue and wake up any waiters.
1935          */
1936         while ((kn = TAILQ_FIRST(&inprocess)) != NULL) {
1937                 assert(kn->kn_tq == &inprocess);
1938                 TAILQ_REMOVE(&inprocess, kn, kn_tqe);
1939                 kn->kn_tq = &kq->kq_head;
1940                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1941         }
1942
1943         kqueue_end_processing(kq);
1944
1945         *countp = nevents;
1946         return error;
1947 }
1948
1949
1950 static void
1951 kqueue_scan_continue(void *data, wait_result_t wait_result)
1952 {
1953         thread_t self = current_thread();
1954         uthread_t ut = (uthread_t)get_bsdthread_info(self);
1955         struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
1956         struct kqueue *kq = (struct kqueue *)data;
1957         int error;
1958         int count;
1959
1960         /* convert the (previous) wait_result to a proper error */
1961         switch (wait_result) {
1962         case THREAD_AWAKENED:
1963                 kqlock(kq);
1964                 error = kqueue_process(kq, cont_args->call, cont_args, &count, current_proc());
1965                 if (error == 0 && count == 0) {
1966                         wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, KQ_EVENT,
1967                                                THREAD_ABORTSAFE, cont_args->deadline);
1968                         kq->kq_state |= KQ_SLEEP;
1969                         kqunlock(kq);
1970                         thread_block_parameter(kqueue_scan_continue, kq);
1971                         /* NOTREACHED */
1972                 }
1973                 kqunlock(kq);
1974                 break;
1975         case THREAD_TIMED_OUT:
1976                 error = EWOULDBLOCK;
1977                 break;
1978         case THREAD_INTERRUPTED:
1979                 error = EINTR;
1980                 break;
1981         default:
1982                 panic("kevent_scan_cont() - invalid wait_result (%d)", wait_result);
1983                 error = 0;
1984         }
1985
1986         /* call the continuation with the results */
1987         assert(cont_args->cont != NULL);
1988         (cont_args->cont)(kq, cont_args->data, error);
1989 }
1990
1991
1992 /*
1993  * kqueue_scan - scan and wait for events in a kqueue
1994  *
1995  *      Process the triggered events in a kqueue.
1996  *
1997  *      If there are no events triggered arrange to
1998  *      wait for them. If the caller provided a
1999  *      continuation routine, then kevent_scan will
2000  *      also.
2001  *
2002  *      The callback routine must be valid.
2003  *      The caller must hold a use-count reference on the kq.
2004  */
2005
2006 int
2007 kqueue_scan(struct kqueue *kq,
2008             kevent_callback_t callback,
2009             kqueue_continue_t continuation,
2010             void *data,
2011             struct timeval *atvp,
2012             struct proc *p)
2013 {
2014         thread_continue_t cont = THREAD_CONTINUE_NULL;
2015         uint64_t deadline;
2016         int error;
2017         int first;
2018
2019         assert(callback != NULL);
2020
2021         first = 1;
2022         for (;;) {
2023                 wait_result_t wait_result;
2024                 int count;
2025
2026                 /*
2027                  * Make a pass through the kq to find events already
2028                  * triggered.
2029                  */
2030                 kqlock(kq);
2031                 error = kqueue_process(kq, callback, data, &count, p);
2032                 if (error || count)
2033                         break; /* lock still held */
2034
2035                 /* looks like we have to consider blocking */
2036                 if (first) {
2037                         first = 0;
2038                         /* convert the timeout to a deadline once */
2039                         if (atvp->tv_sec || atvp->tv_usec) {
2040                                 uint64_t now;
2041
2042                                 clock_get_uptime(&now);
2043                                 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
2044                                                             atvp->tv_usec * NSEC_PER_USEC,
2045                                                             &deadline);
2046                                 if (now >= deadline) {
2047                                         /* non-blocking call */
2048                                         error = EWOULDBLOCK;
2049                                         break; /* lock still held */
2050                                 }
2051                                 deadline -= now;
2052                                 clock_absolutetime_interval_to_deadline(deadline, &deadline);
2053                         } else {
2054                                 deadline = 0;   /* block forever */
2055                         }
2056
2057                         if (continuation) {
2058                                 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
2059                                 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
2060
2061                                 cont_args->call = callback;
2062                                 cont_args->cont = continuation;
2063                                 cont_args->deadline = deadline;
2064                                 cont_args->data = data;
2065                                 cont = kqueue_scan_continue;
2066                         }
2067                 }
2068
2069                 /* go ahead and wait */
2070                 wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, KQ_EVENT, THREAD_ABORTSAFE, deadline);
2071                 kq->kq_state |= KQ_SLEEP;
2072                 kqunlock(kq);
2073                 wait_result = thread_block_parameter(cont, kq);
2074                 /* NOTREACHED if (continuation != NULL) */
2075
2076                 switch (wait_result) {
2077                 case THREAD_AWAKENED:
2078                         continue;
2079                 case THREAD_TIMED_OUT:
2080                         return EWOULDBLOCK;
2081                 case THREAD_INTERRUPTED:
2082                         return EINTR;
2083                 default:
2084                         panic("kevent_scan - bad wait_result (%d)",
2085                               wait_result);
2086                         error = 0;
2087                 }
2088         }
2089         kqunlock(kq);
2090         return error;
2091 }
2092
2093
2094 /*
2095  * XXX
2096  * This could be expanded to call kqueue_scan, if desired.
2097  */
2098 /*ARGSUSED*/
2099 static int
2100 kqueue_read(__unused struct fileproc *fp,
2101                         __unused struct uio *uio,
2102                         __unused int flags,
2103                         __unused vfs_context_t ctx)
2104 {
2105         return (ENXIO);
2106 }
2107
2108 /*ARGSUSED*/
2109 static int
2110 kqueue_write(__unused struct fileproc *fp,
2111                          __unused struct uio *uio,
2112                          __unused int flags,
2113                          __unused vfs_context_t ctx)
2114 {
2115         return (ENXIO);
2116 }
2117
2118 /*ARGSUSED*/
2119 static int
2120 kqueue_ioctl(__unused struct fileproc *fp,
2121                          __unused u_long com,
2122                          __unused caddr_t data,
2123                          __unused vfs_context_t ctx)
2124 {
2125         return (ENOTTY);
2126 }
2127
2128 /*ARGSUSED*/
2129 static int
2130 kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
2131 {
2132         struct kqueue *kq = (struct kqueue *)fp->f_data;
2133         struct knote *kn;
2134         struct kqtailq inprocessq;
2135         int retnum = 0;
2136
2137         if (which != FREAD)
2138                 return 0;
2139
2140         TAILQ_INIT(&inprocessq);
2141
2142         kqlock(kq);
2143         /*
2144          * If this is the first pass, link the wait queue associated with the
2145          * the kqueue onto the wait queue set for the select().  Normally we
2146          * use selrecord() for this, but it uses the wait queue within the
2147          * selinfo structure and we need to use the main one for the kqueue to
2148          * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
2149          * (The select() call will unlink them when it ends).
2150          */
2151         if (wql != NULL) {
2152                 thread_t        cur_act = current_thread();
2153                 struct uthread * ut = get_bsdthread_info(cur_act);
2154
2155                 kq->kq_state |= KQ_SEL;
2156                 wait_queue_link_noalloc((wait_queue_t)kq->kq_wqs, ut->uu_wqset,
2157                                         (wait_queue_link_t)wql);
2158         }
2159
2160         if (kqueue_begin_processing(kq) == -1) {
2161                 kqunlock(kq);
2162                 return 0;
2163         }
2164
2165         if (kq->kq_count != 0) {
2166                 /*
2167                  * there is something queued - but it might be a
2168                  * KN_STAYQUEUED knote, which may or may not have
2169                  * any events pending.  So, we have to walk the
2170                  * list of knotes to see, and peek at the stay-
2171                  * queued ones to be really sure.
2172                  */
2173                 while ((kn = (struct knote*)TAILQ_FIRST(&kq->kq_head)) != NULL) {
2174                         if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2175                                 retnum = 1;
2176                                 goto out;
2177                         }
2178
2179                         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2180                         TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);
2181
2182                         if (kqlock2knoteuse(kq, kn)) {
2183                                 unsigned peek;
2184
2185                                 peek = kn->kn_fop->f_peek(kn);
2186                                 if (knoteuse2kqlock(kq, kn)) {
2187                                         if (peek > 0) {
2188                                                 retnum = 1;
2189                                                 goto out;
2190                                         }
2191                                 } else {
2192                                         retnum = 0;
2193                                 }
2194                         }
2195                 }
2196         }
2197
2198 out:
2199         /* Return knotes to active queue */
2200         while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
2201                 TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
2202                 kn->kn_tq = &kq->kq_head;
2203                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2204         }
2205
2206         kqueue_end_processing(kq);
2207         kqunlock(kq);
2208         return retnum;
2209 }
2210
2211 /*
2212  * kqueue_close -
2213  */
2214 /*ARGSUSED*/
2215 static int
2216 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
2217 {
2218         struct kqueue *kq = (struct kqueue *)fg->fg_data;
2219
2220         kqueue_dealloc(kq);
2221         fg->fg_data = NULL;
2222         return (0);
2223 }
2224
2225 /*ARGSUSED*/
2226 /*
2227  * The callers has taken a use-count reference on this kqueue and will donate it
2228  * to the kqueue we are being added to.  This keeps the kqueue from closing until
2229  * that relationship is torn down.
2230  */
2231 static int
2232 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
2233 {
2234         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
2235         struct kqueue *parentkq = kn->kn_kq;
2236
2237         if (parentkq == kq ||
2238             kn->kn_filter != EVFILT_READ)
2239                 return (1);
2240
2241         /*
2242          * We have to avoid creating a cycle when nesting kqueues
2243          * inside another.  Rather than trying to walk the whole
2244          * potential DAG of nested kqueues, we just use a simple
2245          * ceiling protocol.  When a kqueue is inserted into another,
2246          * we check that the (future) parent is not already nested
2247          * into another kqueue at a lower level than the potenial
2248          * child (because it could indicate a cycle).  If that test
2249          * passes, we just mark the nesting levels accordingly.
2250          */
2251
2252         kqlock(parentkq);
2253         if (parentkq->kq_level > 0 &&
2254             parentkq->kq_level < kq->kq_level)
2255         {
2256                 kqunlock(parentkq);
2257                 return (1);
2258         } else {
2259                 /* set parent level appropriately */
2260                 if (parentkq->kq_level == 0)
2261                         parentkq->kq_level = 2;
2262                 if (parentkq->kq_level < kq->kq_level + 1)
2263                         parentkq->kq_level = kq->kq_level + 1;
2264                 kqunlock(parentkq);
2265
2266                 kn->kn_fop = &kqread_filtops;
2267                 kqlock(kq);
2268                 KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
2269                 /* indicate nesting in child, if needed */
2270                 if (kq->kq_level == 0)
2271                         kq->kq_level = 1;
2272                 kqunlock(kq);
2273                 return (0);
2274         }
2275 }
2276
2277 /*
2278  * kqueue_drain - called when kq is closed
2279  */
2280 /*ARGSUSED*/
2281 static int
2282 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
2283 {
2284         struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
2285         kqlock(kq);
2286         kqueue_wakeup(kq, 1);
2287         kqunlock(kq);
2288         return 0;
2289 }
2290
2291 /*ARGSUSED*/
2292 int
2293 kqueue_stat(struct fileproc *fp, void *ub, int isstat64,  __unused vfs_context_t ctx)
2294 {
2295
2296         struct kqueue *kq = (struct kqueue *)fp->f_data;
2297         if (isstat64 != 0) {
2298                 struct stat64 *sb64 = (struct stat64 *)ub;
2299
2300                 bzero((void *)sb64, sizeof(*sb64));
2301                 sb64->st_size = kq->kq_count;
2302                 if (kq->kq_state & KQ_KEV64)
2303                         sb64->st_blksize = sizeof(struct kevent64_s);
2304                 else
2305                         sb64->st_blksize = sizeof(struct kevent);
2306                 sb64->st_mode = S_IFIFO;
2307         } else {
2308                 struct stat *sb = (struct stat *)ub;
2309
2310                 bzero((void *)sb, sizeof(*sb));
2311                 sb->st_size = kq->kq_count;
2312                 if (kq->kq_state & KQ_KEV64)
2313                         sb->st_blksize = sizeof(struct kevent64_s);
2314                 else
2315                         sb->st_blksize = sizeof(struct kevent);
2316                 sb->st_mode = S_IFIFO;
2317         }
2318
2319         return (0);
2320 }
2321
2322 /*
2323  * Called with the kqueue locked
2324  */
2325 static void
2326 kqueue_wakeup(struct kqueue *kq, int closed)
2327 {
2328         if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) {
2329                 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
2330                 wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, KQ_EVENT,
2331                                       (closed) ? THREAD_INTERRUPTED : THREAD_AWAKENED);
2332         }
2333 }
2334
/*
 * klist_init - initialize a knote list head to the empty list.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}
2340
2341
2342 /*
2343  * Query/Post each knote in the object's list
2344  *
2345  *      The object lock protects the list. It is assumed
2346  *      that the filter/event routine for the object can
2347  *      determine that the object is already locked (via
2348  *      the hint) and not deadlock itself.
2349  *
2350  *      The object lock should also hold off pending
2351  *      detach/drop operations.  But we'll prevent it here
2352  *      too - just in case.
2353  */
2354 void
2355 knote(struct klist *list, long hint)
2356 {
2357         struct knote *kn;
2358
2359         SLIST_FOREACH(kn, list, kn_selnext) {
2360                 struct kqueue *kq = kn->kn_kq;
2361
2362                 kqlock(kq);
2363                 if (kqlock2knoteuse(kq, kn)) {
2364                         int result;
2365
2366                         /* call the event with only a use count */
2367                         result = kn->kn_fop->f_event(kn, hint);
2368
2369                         /* if its not going away and triggered */
2370                         if (knoteuse2kqlock(kq, kn) && result)
2371                                 knote_activate(kn, 1);
2372                         /* lock held again */
2373                 }
2374                 kqunlock(kq);
2375         }
2376 }
2377
2378 /*
2379  * attach a knote to the specified list.  Return true if this is the first entry.
2380  * The list is protected by whatever lock the object it is associated with uses.
2381  */
2382 int
2383 knote_attach(struct klist *list, struct knote *kn)
2384 {
2385         int ret = SLIST_EMPTY(list);
2386         SLIST_INSERT_HEAD(list, kn, kn_selnext);
2387         return ret;
2388 }
2389
2390 /*
2391  * detach a knote from the specified list.  Return true if that was the last entry.
2392  * The list is protected by whatever lock the object it is associated with uses.
2393  */
2394 int
2395 knote_detach(struct klist *list, struct knote *kn)
2396 {
2397         SLIST_REMOVE(list, kn, knote, kn_selnext);
2398         return SLIST_EMPTY(list);
2399 }
2400
2401 /*
2402  * For a given knote, link a provided wait queue directly with the kqueue.
2403  * Wakeups will happen via recursive wait queue support.  But nothing will move
2404  * the knote to the active list at wakeup (nothing calls knote()).  Instead,
2405  * we permanently enqueue them here.
2406  *
2407  * kqueue and knote references are held by caller.
2408  */
2409 int
2410 knote_link_wait_queue(struct knote *kn, struct wait_queue *wq)
2411 {
2412         struct kqueue *kq = kn->kn_kq;
2413         kern_return_t kr;
2414
2415         kr = wait_queue_link(wq, kq->kq_wqs);
2416         if (kr == KERN_SUCCESS) {
2417                 knote_markstayqueued(kn);
2418                 return 0;
2419         } else {
2420                 return ENOMEM;
2421         }
2422 }
2423
2424 /*
2425  * Unlink the provided wait queue from the kqueue associated with a knote.
2426  * Also remove it from the magic list of directly attached knotes.
2427  *
2428  * Note that the unlink may have already happened from the other side, so
2429  * ignore any failures to unlink and just remove it from the kqueue list.
2430  */
2431 void
2432 knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq)
2433 {
2434         struct kqueue *kq = kn->kn_kq;
2435
2436         (void) wait_queue_unlink(wq, kq->kq_wqs);
2437         kqlock(kq);
2438         kn->kn_status &= ~KN_STAYQUEUED;
2439         knote_dequeue(kn);
2440         kqunlock(kq);
2441 }
2442
2443 /*
2444  * remove all knotes referencing a specified fd
2445  *
2446  * Essentially an inlined knote_remove & knote_drop
2447  * when we know for sure that the thing is a file
2448  *
2449  * Entered with the proc_fd lock already held.
2450  * It returns the same way, but may drop it temporarily.
2451  */
2452 void
2453 knote_fdclose(struct proc *p, int fd)
2454 {
2455         struct filedesc *fdp = p->p_fd;
2456         struct klist *list;
2457         struct knote *kn;
2458
2459         list = &fdp->fd_knlist[fd];
2460         while ((kn = SLIST_FIRST(list)) != NULL) {
2461                 struct kqueue *kq = kn->kn_kq;
2462
2463                 if (kq->kq_p != p)
2464                         panic("knote_fdclose: proc mismatch (kq->kq_p=%p != p=%p)", kq->kq_p, p);
2465
2466                 kqlock(kq);
2467                 proc_fdunlock(p);
2468
2469                 /*
2470                  * Convert the lock to a drop ref.
2471                  * If we get it, go ahead and drop it.
2472                  * Otherwise, we waited for it to
2473                  * be dropped by the other guy, so
2474                  * it is safe to move on in the list.
2475                  */
2476                 if (kqlock2knotedrop(kq, kn)) {
2477                         kn->kn_fop->f_detach(kn);
2478                         knote_drop(kn, p);
2479                 }
2480
2481                 proc_fdlock(p);
2482
2483                 /* the fd tables may have changed - start over */
2484                 list = &fdp->fd_knlist[fd];
2485         }
2486 }
2487
2488 /* proc_fdlock held on entry (and exit) */
2489 static int
2490 knote_fdpattach(struct knote *kn, struct filedesc *fdp, __unused struct proc *p)
2491 {
2492         struct klist *list = NULL;
2493
2494         if (! kn->kn_fop->f_isfd) {
2495                 if (fdp->fd_knhashmask == 0)
2496                         fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
2497                             &fdp->fd_knhashmask);
2498                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
2499         } else {
2500                 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
2501                         u_int size = 0;
2502
2503                         /* have to grow the fd_knlist */
2504                         size = fdp->fd_knlistsize;
2505                         while (size <= kn->kn_id)
2506                                 size += KQEXTENT;
2507                         MALLOC(list, struct klist *,
2508                                size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
2509                         if (list == NULL)
2510                                 return (ENOMEM);
2511
2512                         bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
2513                               fdp->fd_knlistsize * sizeof(struct klist *));
2514                         bzero((caddr_t)list +
2515                               fdp->fd_knlistsize * sizeof(struct klist *),
2516                               (size - fdp->fd_knlistsize) * sizeof(struct klist *));
2517                         FREE(fdp->fd_knlist, M_KQUEUE);
2518                         fdp->fd_knlist = list;
2519                         fdp->fd_knlistsize = size;
2520                 }
2521                 list = &fdp->fd_knlist[kn->kn_id];
2522         }
2523         SLIST_INSERT_HEAD(list, kn, kn_link);
2524         return (0);
2525 }
2526
2527
2528
2529 /*
2530  * should be called at spl == 0, since we don't want to hold spl
2531  * while calling fdrop and free.
2532  */
2533 static void
2534 knote_drop(struct knote *kn, __unused struct proc *ctxp)
2535 {
2536         struct kqueue *kq = kn->kn_kq;
2537         struct proc *p = kq->kq_p;
2538         struct filedesc *fdp = p->p_fd;
2539         struct klist *list;
2540         int needswakeup;
2541
2542         proc_fdlock(p);
2543         if (kn->kn_fop->f_isfd)
2544                 list = &fdp->fd_knlist[kn->kn_id];
2545         else
2546                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
2547
2548         SLIST_REMOVE(list, kn, knote, kn_link);
2549         kqlock(kq);
2550         knote_dequeue(kn);
2551         needswakeup = (kn->kn_status & KN_USEWAIT);
2552         kqunlock(kq);
2553         proc_fdunlock(p);
2554
2555         if (needswakeup)
2556                 wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kn->kn_status, THREAD_AWAKENED);
2557
2558         if (kn->kn_fop->f_isfd)
2559                 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
2560
2561         knote_free(kn);
2562 }
2563
2564 /* called with kqueue lock held */
2565 static void
2566 knote_activate(struct knote *kn, int propagate)
2567 {
2568         struct kqueue *kq = kn->kn_kq;
2569
2570         kn->kn_status |= KN_ACTIVE;
2571         knote_enqueue(kn);
2572         kqueue_wakeup(kq, 0);
2573
2574         /* this is a real event: wake up the parent kq, too */
2575         if (propagate)
2576                 KNOTE(&kq->kq_sel.si_note, 0);
2577 }
2578
2579 /* called with kqueue lock held */
2580 static void
2581 knote_deactivate(struct knote *kn)
2582 {
2583         kn->kn_status &= ~KN_ACTIVE;
2584         knote_dequeue(kn);
2585 }
2586
2587 /* called with kqueue lock held */
2588 static void
2589 knote_enqueue(struct knote *kn)
2590 {
2591         if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_STAYQUEUED ||
2592             (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) == 0) {
2593                 struct kqtailq *tq = kn->kn_tq;
2594                 struct kqueue *kq = kn->kn_kq;
2595
2596                 TAILQ_INSERT_TAIL(tq, kn, kn_tqe);
2597                 kn->kn_status |= KN_QUEUED;
2598                 kq->kq_count++;
2599         }
2600 }
2601
2602 /* called with kqueue lock held */
2603 static void
2604 knote_dequeue(struct knote *kn)
2605 {
2606         struct kqueue *kq = kn->kn_kq;
2607
2608         if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_QUEUED) {
2609                 struct kqtailq *tq = kn->kn_tq;
2610
2611                 TAILQ_REMOVE(tq, kn, kn_tqe);
2612                 kn->kn_tq = &kq->kq_head;
2613                 kn->kn_status &= ~KN_QUEUED;
2614                 kq->kq_count--;
2615         }
2616 }
2617
2618 void
2619 knote_init(void)
2620 {
2621         knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote), 8192, "knote zone");
2622
2623         /* allocate kq lock group attribute and group */
2624         kq_lck_grp_attr= lck_grp_attr_alloc_init();
2625
2626         kq_lck_grp = lck_grp_alloc_init("kqueue",  kq_lck_grp_attr);
2627
2628         /* Allocate kq lock attribute */
2629         kq_lck_attr = lck_attr_alloc_init();
2630
2631         /* Initialize the timer filter lock */
2632         lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
2633         lck_mtx_init(&vm_pressure_klist_mutex, kq_lck_grp, kq_lck_attr);
2634 }
2635 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
2636
2637 static struct knote *
2638 knote_alloc(void)
2639 {
2640         return ((struct knote *)zalloc(knote_zone));
2641 }
2642
2643 static void
2644 knote_free(struct knote *kn)
2645 {
2646         zfree(knote_zone, kn);
2647 }
2648
2649 #if SOCKETS
2650 #include <sys/param.h>
2651 #include <sys/socket.h>
2652 #include <sys/protosw.h>
2653 #include <sys/domain.h>
2654 #include <sys/mbuf.h>
2655 #include <sys/kern_event.h>
2656 #include <sys/malloc.h>
2657 #include <sys/sys_domain.h>
2658 #include <sys/syslog.h>
2659
2660
2661 static int kev_attach(struct socket *so, int proto, struct proc *p);
2662 static int kev_detach(struct socket *so);
2663 static int kev_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct proc *p);
2664
2665 struct pr_usrreqs event_usrreqs = {
2666      pru_abort_notsupp, pru_accept_notsupp, kev_attach, pru_bind_notsupp, pru_connect_notsupp,
2667      pru_connect2_notsupp, kev_control, kev_detach, pru_disconnect_notsupp,
2668      pru_listen_notsupp, pru_peeraddr_notsupp, pru_rcvd_notsupp, pru_rcvoob_notsupp,
2669      pru_send_notsupp, pru_sense_null, pru_shutdown_notsupp, pru_sockaddr_notsupp,
2670      pru_sosend_notsupp, soreceive, pru_sopoll_notsupp
2671 };
2672
2673 struct protosw eventsw[] = {
2674      {
2675           .pr_type = SOCK_RAW,
2676           .pr_domain = &systemdomain,
2677           .pr_protocol = SYSPROTO_EVENT,
2678           .pr_flags = PR_ATOMIC,
2679           .pr_usrreqs = &event_usrreqs,
2680      }
2681 };
2682
2683 static
2684 struct kern_event_head kern_event_head;
2685
2686 static u_int32_t static_event_id = 0;
2687 struct domain *sysdom = &systemdomain;
2688 static lck_mtx_t *sys_mtx;
2689
2690 /*
2691  * Install the protosw's for the NKE manager.  Invoked at
2692  *  extension load time
2693  */
2694 int
2695 kern_event_init(void)
2696 {
2697     int retval;
2698
2699     if ((retval = net_add_proto(eventsw, &systemdomain)) != 0) {
2700             log(LOG_WARNING, "Can't install kernel events protocol (%d)\n", retval);
2701             return(retval);
2702         }
2703
2704     /*
2705      * Use the domain mutex for all system event sockets
2706      */
2707     sys_mtx = sysdom->dom_mtx;
2708
2709     return(KERN_SUCCESS);
2710 }
2711
2712 static int
2713 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
2714 {
2715      int error;
2716      struct kern_event_pcb  *ev_pcb;
2717
2718      error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
2719      if (error)
2720           return error;
2721
2722      MALLOC(ev_pcb, struct kern_event_pcb *, sizeof(struct kern_event_pcb), M_PCB, M_WAITOK);
2723      if (ev_pcb == 0)
2724           return ENOBUFS;
2725
2726      ev_pcb->ev_socket = so;
2727      ev_pcb->vendor_code_filter = 0xffffffff;
2728
2729      so->so_pcb = (caddr_t) ev_pcb;
2730      lck_mtx_lock(sys_mtx);
2731      LIST_INSERT_HEAD(&kern_event_head, ev_pcb, ev_link);
2732      lck_mtx_unlock(sys_mtx);
2733
2734      return 0;
2735 }
2736
2737
2738 static int
2739 kev_detach(struct socket *so)
2740 {
2741      struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
2742
2743      if (ev_pcb != 0) {
2744                 LIST_REMOVE(ev_pcb, ev_link);
2745                 FREE(ev_pcb, M_PCB);
2746                 so->so_pcb = 0;
2747                 so->so_flags |= SOF_PCBCLEARING;
2748      }
2749
2750      return 0;
2751 }
2752
2753 /*
2754  * For now, kev_vendor_code and mbuf_tags use the same
2755  * mechanism.
2756  */
2757
2758 errno_t kev_vendor_code_find(
2759         const char      *string,
2760         u_int32_t       *out_vendor_code)
2761 {
2762         if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
2763                 return EINVAL;
2764         }
2765         return net_str_id_find_internal(string, out_vendor_code, NSI_VENDOR_CODE, 1);
2766 }
2767
2768 errno_t  kev_msg_post(struct kev_msg *event_msg)
2769 {
2770         mbuf_tag_id_t   min_vendor, max_vendor;
2771
2772         net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
2773
2774         if (event_msg == NULL)
2775                 return EINVAL;
2776
2777         /* Limit third parties to posting events for registered vendor codes only */
2778         if (event_msg->vendor_code < min_vendor ||
2779                 event_msg->vendor_code > max_vendor)
2780         {
2781                 return EINVAL;
2782         }
2783
2784         return kev_post_msg(event_msg);
2785 }
2786
2787
2788 int  kev_post_msg(struct kev_msg *event_msg)
2789 {
2790      struct mbuf *m, *m2;
2791      struct kern_event_pcb  *ev_pcb;
2792      struct kern_event_msg  *ev;
2793      char              *tmp;
2794      u_int32_t     total_size;
2795      int               i;
2796
2797         /* Verify the message is small enough to fit in one mbuf w/o cluster */
2798         total_size = KEV_MSG_HEADER_SIZE;
2799
2800         for (i = 0; i < 5; i++) {
2801                 if (event_msg->dv[i].data_length == 0)
2802                         break;
2803                 total_size += event_msg->dv[i].data_length;
2804         }
2805
2806         if (total_size > MLEN) {
2807                 return EMSGSIZE;
2808         }
2809
2810      m = m_get(M_DONTWAIT, MT_DATA);
2811      if (m == 0)
2812           return ENOBUFS;
2813
2814      ev = mtod(m, struct kern_event_msg *);
2815      total_size = KEV_MSG_HEADER_SIZE;
2816
2817      tmp = (char *) &ev->event_data[0];
2818      for (i = 0; i < 5; i++) {
2819           if (event_msg->dv[i].data_length == 0)
2820                break;
2821
2822           total_size += event_msg->dv[i].data_length;
2823           bcopy(event_msg->dv[i].data_ptr, tmp,
2824                 event_msg->dv[i].data_length);
2825           tmp += event_msg->dv[i].data_length;
2826      }
2827
2828      ev->id = ++static_event_id;
2829      ev->total_size   = total_size;
2830      ev->vendor_code  = event_msg->vendor_code;
2831      ev->kev_class    = event_msg->kev_class;
2832      ev->kev_subclass = event_msg->kev_subclass;
2833      ev->event_code   = event_msg->event_code;
2834
2835      m->m_len = total_size;
2836      lck_mtx_lock(sys_mtx);
2837      for (ev_pcb = LIST_FIRST(&kern_event_head);
2838           ev_pcb;
2839           ev_pcb = LIST_NEXT(ev_pcb, ev_link)) {
2840
2841           if (ev_pcb->vendor_code_filter != KEV_ANY_VENDOR) {
2842                if (ev_pcb->vendor_code_filter != ev->vendor_code)
2843                     continue;
2844
2845                if (ev_pcb->class_filter != KEV_ANY_CLASS) {
2846                     if (ev_pcb->class_filter != ev->kev_class)
2847                          continue;
2848
2849                     if ((ev_pcb->subclass_filter != KEV_ANY_SUBCLASS) &&
2850                         (ev_pcb->subclass_filter != ev->kev_subclass))
2851                          continue;
2852                }
2853           }
2854
2855           m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
2856           if (m2 == 0) {
2857                m_free(m);
2858                    lck_mtx_unlock(sys_mtx);
2859                return ENOBUFS;
2860           }
2861           /* the socket is already locked because we hold the sys_mtx here */
2862           if (sbappendrecord(&ev_pcb->ev_socket->so_rcv, m2))
2863                   sorwakeup(ev_pcb->ev_socket);
2864      }
2865
2866      m_free(m);
2867      lck_mtx_unlock(sys_mtx);
2868      return 0;
2869 }
2870
2871 static int
2872 kev_control(struct socket *so,
2873                         u_long cmd,
2874                         caddr_t data,
2875                         __unused struct ifnet *ifp,
2876                         __unused struct proc *p)
2877 {
2878         struct kev_request *kev_req = (struct kev_request *) data;
2879         struct kern_event_pcb  *ev_pcb;
2880         struct kev_vendor_code *kev_vendor;
2881         u_int32_t  *id_value = (u_int32_t *) data;
2882
2883
2884         switch (cmd) {
2885
2886                 case SIOCGKEVID:
2887                         *id_value = static_event_id;
2888                         break;
2889
2890                 case SIOCSKEVFILT:
2891                         ev_pcb = (struct kern_event_pcb *) so->so_pcb;
2892                         ev_pcb->vendor_code_filter = kev_req->vendor_code;
2893                         ev_pcb->class_filter     = kev_req->kev_class;
2894                         ev_pcb->subclass_filter  = kev_req->kev_subclass;
2895                         break;
2896
2897                 case SIOCGKEVFILT:
2898                         ev_pcb = (struct kern_event_pcb *) so->so_pcb;
2899                         kev_req->vendor_code = ev_pcb->vendor_code_filter;
2900                         kev_req->kev_class   = ev_pcb->class_filter;
2901                         kev_req->kev_subclass = ev_pcb->subclass_filter;
2902                         break;
2903
2904                 case SIOCGKEVVENDOR:
2905                         kev_vendor = (struct kev_vendor_code*)data;
2906
2907                         /* Make sure string is NULL terminated */
2908                         kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
2909
2910                         return net_str_id_find_internal(kev_vendor->vendor_string,
2911                                         &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
2912
2913                 default:
2914                         return ENOTSUP;
2915         }
2916
2917         return 0;
2918 }
2919
2920 #endif /* SOCKETS */
2921
2922
2923 int
2924 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
2925 {
2926         struct vinfo_stat * st;
2927
2928         /* No need for the funnel as fd is kept alive */
2929
2930         st = &kinfo->kq_stat;
2931
2932         st->vst_size = kq->kq_count;
2933         if (kq->kq_state & KQ_KEV64)
2934                 st->vst_blksize = sizeof(struct kevent64_s);
2935         else
2936                 st->vst_blksize = sizeof(struct kevent);
2937         st->vst_mode = S_IFIFO;
2938         if (kq->kq_state & KQ_SEL)
2939                 kinfo->kq_state |=  PROC_KQUEUE_SELECT;
2940         if (kq->kq_state & KQ_SLEEP)
2941                 kinfo->kq_state |= PROC_KQUEUE_SLEEP;
2942
2943         return(0);
2944 }
2945
2946
2947 void
2948 knote_markstayqueued(struct knote *kn)
2949 {
2950         kqlock(kn->kn_kq);
2951         kn->kn_status |= KN_STAYQUEUED;
2952         knote_enqueue(kn);
2953         kqunlock(kn->kn_kq);
2954 }