bsd/kern/kern_event.c

   1 /*
   2  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  *
  28  */
  29 /*-
  30  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  31  * All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  *
  42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  52  * SUCH DAMAGE.
  53  */
  54 /*
  55  *      @(#)kern_event.c       1.0 (3/31/2000)
  56  */
  57 #include <stdint.h>
  58
  59 #include <sys/param.h>
  60 #include <sys/systm.h>
  61 #include <sys/filedesc.h>
  62 #include <sys/kernel.h>
  63 #include <sys/proc_internal.h>
  64 #include <sys/kauth.h>
  65 #include <sys/malloc.h>
  66 #include <sys/unistd.h>
  67 #include <sys/file_internal.h>
  68 #include <sys/fcntl.h>
  69 #include <sys/select.h>
  70 #include <sys/queue.h>
  71 #include <sys/event.h>
  72 #include <sys/eventvar.h>
  73 #include <sys/protosw.h>
  74 #include <sys/socket.h>
  75 #include <sys/socketvar.h>
  76 #include <sys/stat.h>
  77 #include <sys/sysctl.h>
  78 #include <sys/uio.h>
  79 #include <sys/sysproto.h>
  80 #include <sys/user.h>
  81 #include <sys/vnode_internal.h>
  82 #include <string.h>
  83 #include <sys/proc_info.h>
  84 #include <sys/codesign.h>
  85 #include <sys/pthread_shims.h>
  86
  87 #include <kern/locks.h>
  88 #include <kern/clock.h>
  89 #include <kern/thread_call.h>
  90 #include <kern/sched_prim.h>
  91 #include <kern/waitq.h>
  92 #include <kern/zalloc.h>
  93 #include <kern/kalloc.h>
  94 #include <kern/assert.h>
  95
  96 #include <libkern/libkern.h>
  97 #include "net/net_str_id.h"
  98
  99 #include <mach/task.h>
 100
 101 #if VM_PRESSURE_EVENTS
 102 #include <kern/vm_pressure.h>
 103 #endif
 104
 105 #if CONFIG_MEMORYSTATUS
 106 #include <sys/kern_memorystatus.h>
 107 #endif
 108
 109 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 110
 111 #define KQ_EVENT        NO_EVENT64
 112
 113 static inline void kqlock(struct kqueue *kq);
 114 static inline void kqunlock(struct kqueue *kq);
 115
 116 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
 117 static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn);
 118 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
 119 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn);
 120
 121 static void kqueue_wakeup(struct kqueue *kq, int closed);
 122 static int kqueue_read(struct fileproc *fp, struct uio *uio,
 123     int flags, vfs_context_t ctx);
 124 static int kqueue_write(struct fileproc *fp, struct uio *uio,
 125     int flags, vfs_context_t ctx);
 126 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
 127     vfs_context_t ctx);
 128 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
 129     vfs_context_t ctx);
 130 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
 131 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
 132         vfs_context_t ctx);
 133 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
 134
 135 static const struct fileops kqueueops = {
 136         .fo_type = DTYPE_KQUEUE,
 137         .fo_read = kqueue_read,
 138         .fo_write = kqueue_write,
 139         .fo_ioctl = kqueue_ioctl,
 140         .fo_select = kqueue_select,
 141         .fo_close = kqueue_close,
 142         .fo_kqfilter = kqueue_kqfilter,
 143         .fo_drain = kqueue_drain,
 144 };
 145
 146 static int kevent_internal(struct proc *p, int fd,
 147                            user_addr_t changelist, int nchanges,
 148                            user_addr_t eventlist, int nevents,
 149                            user_addr_t data_out, user_size_t *data_available,
 150                            unsigned int flags, user_addr_t utimeout,
 151                            kqueue_continue_t continuation,
 152                            int32_t *retval);
 153 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
 154                          struct proc *p, unsigned int flags);
 155 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
 156                           struct proc *p, unsigned int flags);
 157 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
 158
 159 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
 160                            void *data);
 161 static void kevent_continue(struct kqueue *kq, void *data, int error);
 162 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
 163 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback,
 164                           void *data, int *countp, struct proc *p);
 165 static int kqueue_begin_processing(struct kqueue *kq);
 166 static void kqueue_end_processing(struct kqueue *kq);
 167 static int knote_process(struct knote *kn, kevent_callback_t callback,
 168                          void *data, struct kqtailq *inprocessp, struct proc *p);
 169 static void knote_put(struct knote *kn);
 170 static int knote_fdpattach(struct knote *kn, struct filedesc *fdp,
 171                            struct proc *p);
 172 static void knote_drop(struct knote *kn, struct proc *p);
 173 static void knote_activate(struct knote *kn, int);
 174 static void knote_deactivate(struct knote *kn);
 175 static void knote_enqueue(struct knote *kn);
 176 static void knote_dequeue(struct knote *kn);
 177 static struct knote *knote_alloc(void);
 178 static void knote_free(struct knote *kn);
 179
 180 static int filt_fileattach(struct knote *kn);
 181 static struct filterops file_filtops = {
 182         .f_isfd = 1,
 183         .f_attach = filt_fileattach,
 184 };
 185
 186 static void filt_kqdetach(struct knote *kn);
 187 static int filt_kqueue(struct knote *kn, long hint);
 188 static struct filterops kqread_filtops = {
 189         .f_isfd = 1,
 190         .f_detach = filt_kqdetach,
 191         .f_event = filt_kqueue,
 192 };
 193
 194 /* placeholder for not-yet-implemented filters */
 195 static int filt_badattach(struct knote *kn);
 196 static struct filterops bad_filtops = {
 197         .f_attach = filt_badattach,
 198 };
 199
 200 static int filt_procattach(struct knote *kn);
 201 static void filt_procdetach(struct knote *kn);
 202 static int filt_proc(struct knote *kn, long hint);
 203 static struct filterops proc_filtops = {
 204         .f_attach = filt_procattach,
 205         .f_detach = filt_procdetach,
 206         .f_event = filt_proc,
 207 };
 208
 209 #if VM_PRESSURE_EVENTS
 210 static int filt_vmattach(struct knote *kn);
 211 static void filt_vmdetach(struct knote *kn);
 212 static int filt_vm(struct knote *kn, long hint);
 213 static struct filterops vm_filtops = {
 214         .f_attach = filt_vmattach,
 215         .f_detach = filt_vmdetach,
 216         .f_event = filt_vm,
 217 };
 218 #endif /* VM_PRESSURE_EVENTS */
 219
 220 #if CONFIG_MEMORYSTATUS
 221 extern struct filterops memorystatus_filtops;
 222 #endif /* CONFIG_MEMORYSTATUS */
 223
 224 extern struct filterops fs_filtops;
 225
 226 extern struct filterops sig_filtops;
 227
 228 /* Timer filter */
 229 static int filt_timerattach(struct knote *kn);
 230 static void filt_timerdetach(struct knote *kn);
 231 static int filt_timer(struct knote *kn, long hint);
 232 static void filt_timertouch(struct knote *kn, struct kevent_internal_s *kev,
 233     long type);
 234 static struct filterops timer_filtops = {
 235         .f_attach = filt_timerattach,
 236         .f_detach = filt_timerdetach,
 237         .f_event = filt_timer,
 238         .f_touch = filt_timertouch,
 239 };
 240
 241 /* Helpers */
 242 static void filt_timerexpire(void *knx, void *param1);
 243 static int filt_timervalidate(struct knote *kn);
 244 static void filt_timerupdate(struct knote *kn);
 245 static void filt_timercancel(struct knote *kn);
 246
 247 #define TIMER_RUNNING           0x1
 248 #define TIMER_CANCELWAIT        0x2
 249
 250 static lck_mtx_t _filt_timerlock;
 251 static void filt_timerlock(void);
 252 static void filt_timerunlock(void);
 253
 254 static zone_t knote_zone;
 255
 256 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
 257
 258 #if 0
 259 extern struct filterops aio_filtops;
 260 #endif
 261
 262 /* Mach portset filter */
 263 extern struct filterops machport_filtops;
 264
 265 /* User filter */
 266 static int filt_userattach(struct knote *kn);
 267 static void filt_userdetach(struct knote *kn);
 268 static int filt_user(struct knote *kn, long hint);
 269 static void filt_usertouch(struct knote *kn, struct kevent_internal_s *kev,
 270     long type);
 271 static struct filterops user_filtops = {
 272         .f_attach = filt_userattach,
 273         .f_detach = filt_userdetach,
 274         .f_event = filt_user,
 275         .f_touch = filt_usertouch,
 276 };
 277
 278 /*
 279  * Table for all system-defined filters.
 280  */
 281 static struct filterops *sysfilt_ops[] = {
 282         &file_filtops,                  /* EVFILT_READ */
 283         &file_filtops,                  /* EVFILT_WRITE */
 284 #if 0
 285         &aio_filtops,                   /* EVFILT_AIO */
 286 #else
 287         &bad_filtops,                   /* EVFILT_AIO */
 288 #endif
 289         &file_filtops,                  /* EVFILT_VNODE */
 290         &proc_filtops,                  /* EVFILT_PROC */
 291         &sig_filtops,                   /* EVFILT_SIGNAL */
 292         &timer_filtops,                 /* EVFILT_TIMER */
 293         &machport_filtops,              /* EVFILT_MACHPORT */
 294         &fs_filtops,                    /* EVFILT_FS */
 295         &user_filtops,                  /* EVFILT_USER */
 296         &bad_filtops,                   /* unused */
 297 #if VM_PRESSURE_EVENTS
 298         &vm_filtops,                    /* EVFILT_VM */
 299 #else
 300         &bad_filtops,                   /* EVFILT_VM */
 301 #endif
 302         &file_filtops,                  /* EVFILT_SOCK */
 303 #if CONFIG_MEMORYSTATUS
 304         &memorystatus_filtops,  /* EVFILT_MEMORYSTATUS */
 305 #else
 306         &bad_filtops,                   /* EVFILT_MEMORYSTATUS */
 307 #endif
 308 };
 309
 310 /*
 311  * kqueue/note lock attributes and implementations
 312  *
 313  *      kqueues have locks, while knotes have use counts
 314  *      Most of the knote state is guarded by the object lock.
 315  *      the knote "inuse" count and status use the kqueue lock.
 316  */
 317 lck_grp_attr_t * kq_lck_grp_attr;
 318 lck_grp_t * kq_lck_grp;
 319 lck_attr_t * kq_lck_attr;
 320
 321 static inline void
 322 kqlock(struct kqueue *kq)
 323 {
 324         lck_spin_lock(&kq->kq_lock);
 325 }
 326
 327 static inline void
 328 kqunlock(struct kqueue *kq)
 329 {
 330         lck_spin_unlock(&kq->kq_lock);
 331 }
 332
 333 /*
 334  * Convert a kq lock to a knote use referece.
 335  *
 336  *      If the knote is being dropped, we can't get
 337  *      a use reference, so just return with it
 338  *      still locked.
 339  *      - kq locked at entry
 340  *      - unlock on exit if we get the use reference
 341  */
 342 static int
 343 kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
 344 {
 345         if (kn->kn_status & KN_DROPPING)
 346                 return (0);
 347         kn->kn_inuse++;
 348         kqunlock(kq);
 349         return (1);
 350 }
 351
 352 /*
 353  * Convert a kq lock to a knote use referece,
 354  * but wait for attach and drop events to complete.
 355  *
 356  *      If the knote is being dropped, we can't get
 357  *      a use reference, so just return with it
 358  *      still locked.
 359  *      - kq locked at entry
 360  *      - kq always unlocked on exit
 361  */
 362 static int
 363 kqlock2knoteusewait(struct kqueue *kq, struct knote *kn)
 364 {
 365         if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
 366                 kn->kn_status |= KN_USEWAIT;
 367                 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
 368                                     CAST_EVENT64_T(&kn->kn_status),
 369                                     THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
 370                 kqunlock(kq);
 371                 thread_block(THREAD_CONTINUE_NULL);
 372                 return (0);
 373         }
 374         kn->kn_inuse++;
 375         kqunlock(kq);
 376         return (1);
 377 }
 378
 379 /*
 380  * Convert from a knote use reference back to kq lock.
 381  *
 382  *      Drop a use reference and wake any waiters if
 383  *      this is the last one.
 384  *
 385  *      The exit return indicates if the knote is
 386  *      still alive - but the kqueue lock is taken
 387  *      unconditionally.
 388  */
 389 static int
 390 knoteuse2kqlock(struct kqueue *kq, struct knote *kn)
 391 {
 392         kqlock(kq);
 393         if (--kn->kn_inuse == 0) {
 394                 if ((kn->kn_status & KN_ATTACHING) != 0) {
 395                         kn->kn_status &= ~KN_ATTACHING;
 396                 }
 397                 if ((kn->kn_status & KN_USEWAIT) != 0) {
 398                         kn->kn_status &= ~KN_USEWAIT;
 399                         waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
 400                                            CAST_EVENT64_T(&kn->kn_status),
 401                                            THREAD_AWAKENED,
 402                                            WAITQ_ALL_PRIORITIES);
 403                 }
 404         }
 405         return ((kn->kn_status & KN_DROPPING) == 0);
 406 }
 407
 408 /*
 409  * Convert a kq lock to a knote drop reference.
 410  *
 411  *      If the knote is in use, wait for the use count
 412  *      to subside.  We first mark our intention to drop
 413  *      it - keeping other users from "piling on."
 414  *      If we are too late, we have to wait for the
 415  *      other drop to complete.
 416  *
 417  *      - kq locked at entry
 418  *      - always unlocked on exit.
 419  *      - caller can't hold any locks that would prevent
 420  *        the other dropper from completing.
 421  */
 422 static int
 423 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
 424 {
 425         int oktodrop;
 426
 427         oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
 428         kn->kn_status &= ~KN_STAYQUEUED;
 429         kn->kn_status |= KN_DROPPING;
 430         if (oktodrop) {
 431                 if (kn->kn_inuse == 0) {
 432                         kqunlock(kq);
 433                         return (oktodrop);
 434                 }
 435         }
 436         kn->kn_status |= KN_USEWAIT;
 437         waitq_assert_wait64((struct waitq *)kq->kq_wqs,
 438                             CAST_EVENT64_T(&kn->kn_status),
 439                             THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
 440         kqunlock(kq);
 441         thread_block(THREAD_CONTINUE_NULL);
 442         return (oktodrop);
 443 }
 444
 445 /*
 446  * Release a knote use count reference.
 447  */
 448 static void
 449 knote_put(struct knote *kn)
 450 {
 451         struct kqueue *kq = kn->kn_kq;
 452
 453         kqlock(kq);
 454         if (--kn->kn_inuse == 0) {
 455                 if ((kn->kn_status & KN_USEWAIT) != 0) {
 456                         kn->kn_status &= ~KN_USEWAIT;
 457                         waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
 458                                            CAST_EVENT64_T(&kn->kn_status),
 459                                            THREAD_AWAKENED,
 460                                            WAITQ_ALL_PRIORITIES);
 461                 }
 462         }
 463         kqunlock(kq);
 464 }
 465
 466 static int
 467 filt_fileattach(struct knote *kn)
 468 {
 469         return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
 470 }
 471
 472 #define f_flag f_fglob->fg_flag
 473 #define f_msgcount f_fglob->fg_msgcount
 474 #define f_cred f_fglob->fg_cred
 475 #define f_ops f_fglob->fg_ops
 476 #define f_offset f_fglob->fg_offset
 477 #define f_data f_fglob->fg_data
 478
 479 static void
 480 filt_kqdetach(struct knote *kn)
 481 {
 482         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 483
 484         kqlock(kq);
 485         KNOTE_DETACH(&kq->kq_sel.si_note, kn);
 486         kqunlock(kq);
 487 }
 488
 489 /*ARGSUSED*/
 490 static int
 491 filt_kqueue(struct knote *kn, __unused long hint)
 492 {
 493         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
 494
 495         kn->kn_data = kq->kq_count;
 496         return (kn->kn_data > 0);
 497 }
 498
 499 static int
 500 filt_procattach(struct knote *kn)
 501 {
 502         struct proc *p;
 503
 504         assert(PID_MAX < NOTE_PDATAMASK);
 505
 506         if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0)
 507                 return (ENOTSUP);
 508
 509         p = proc_find(kn->kn_id);
 510         if (p == NULL) {
 511                 return (ESRCH);
 512         }
 513
 514         const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
 515
 516         if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
 517                 do {
 518                         pid_t selfpid = proc_selfpid();
 519
 520                         if (p->p_ppid == selfpid)
 521                                 break;  /* parent => ok */
 522
 523                         if ((p->p_lflag & P_LTRACED) != 0 &&
 524                             (p->p_oppid == selfpid))
 525                                 break;  /* parent-in-waiting => ok */
 526
 527                         proc_rele(p);
 528                         return (EACCES);
 529                 } while (0);
 530
 531         proc_klist_lock();
 532
 533         kn->kn_flags |= EV_CLEAR;       /* automatically set */
 534         kn->kn_ptr.p_proc = p;          /* store the proc handle */
 535
 536         KNOTE_ATTACH(&p->p_klist, kn);
 537
 538         proc_klist_unlock();
 539
 540         proc_rele(p);
 541
 542         return (0);
 543 }
 544
 545 /*
 546  * The knote may be attached to a different process, which may exit,
 547  * leaving nothing for the knote to be attached to.  In that case,
 548  * the pointer to the process will have already been nulled out.
 549  */
 550 static void
 551 filt_procdetach(struct knote *kn)
 552 {
 553         struct proc *p;
 554
 555         proc_klist_lock();
 556
 557         p = kn->kn_ptr.p_proc;
 558         if (p != PROC_NULL) {
 559                 kn->kn_ptr.p_proc = PROC_NULL;
 560                 KNOTE_DETACH(&p->p_klist, kn);
 561         }
 562
 563         proc_klist_unlock();
 564 }
 565
 566 static int
 567 filt_proc(struct knote *kn, long hint)
 568 {
 569         /*
 570          * Note: a lot of bits in hint may be obtained from the knote
 571          * To free some of those bits, see <rdar://problem/12592988> Freeing up
 572          * bits in hint for filt_proc
 573          */
 574         /* hint is 0 when called from above */
 575         if (hint != 0) {
 576                 u_int event;
 577
 578                 /* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */
 579
 580                 /*
 581                  * mask off extra data
 582                  */
 583                 event = (u_int)hint & NOTE_PCTRLMASK;
 584
 585                 /*
 586                  * termination lifecycle events can happen while a debugger
 587                  * has reparented a process, in which case notifications
 588                  * should be quashed except to the tracing parent. When
 589                  * the debugger reaps the child (either via wait4(2) or
 590                  * process exit), the child will be reparented to the original
 591                  * parent and these knotes re-fired.
 592                  */
 593                 if (event & NOTE_EXIT) {
 594                         if ((kn->kn_ptr.p_proc->p_oppid != 0)
 595                                 && (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
 596                                 /*
 597                                  * This knote is not for the current ptrace(2) parent, ignore.
 598                                  */
 599                                 return 0;
 600                         }
 601                 }
 602
 603                 /*
 604                  * if the user is interested in this event, record it.
 605                  */
 606                 if (kn->kn_sfflags & event)
 607                         kn->kn_fflags |= event;
 608
 609 #pragma clang diagnostic push
 610 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
 611                 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
 612                         kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 613                 }
 614 #pragma clang diagnostic pop
 615
 616
 617                 /*
 618                  * The kernel has a wrapper in place that returns the same data
 619                  * as is collected here, in kn_data.  Any changes to how
 620                  * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
 621                  * should also be reflected in the proc_pidnoteexit() wrapper.
 622                  */
 623                 if (event == NOTE_EXIT) {
 624                         kn->kn_data = 0;
 625                         if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
 626                                 kn->kn_fflags |= NOTE_EXITSTATUS;
 627                                 kn->kn_data |= (hint & NOTE_PDATAMASK);
 628                         }
 629                         if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
 630                                 kn->kn_fflags |= NOTE_EXIT_DETAIL;
 631                                 if ((kn->kn_ptr.p_proc->p_lflag &
 632                                     P_LTERM_DECRYPTFAIL) != 0) {
 633                                         kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
 634                                 }
 635                                 if ((kn->kn_ptr.p_proc->p_lflag &
 636                                     P_LTERM_JETSAM) != 0) {
 637                                         kn->kn_data |= NOTE_EXIT_MEMORY;
 638                                         switch (kn->kn_ptr.p_proc->p_lflag &
 639                                             P_JETSAM_MASK) {
 640                                                 case P_JETSAM_VMPAGESHORTAGE:
 641                                                         kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
 642                                                         break;
 643                                                 case P_JETSAM_VMTHRASHING:
 644                                                         kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
 645                                                         break;
 646                                                 case P_JETSAM_FCTHRASHING:
 647                                                         kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
 648                                                         break;
 649                                                 case P_JETSAM_VNODE:
 650                                                         kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
 651                                                         break;
 652                                                 case P_JETSAM_HIWAT:
 653                                                         kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
 654                                                         break;
 655                                                 case P_JETSAM_PID:
 656                                                         kn->kn_data |= NOTE_EXIT_MEMORY_PID;
 657                                                         break;
 658                                                 case P_JETSAM_IDLEEXIT:
 659                                                         kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
 660                                                         break;
 661                                         }
 662                                 }
 663                                 if ((kn->kn_ptr.p_proc->p_csflags &
 664                                     CS_KILLED) != 0) {
 665                                         kn->kn_data |= NOTE_EXIT_CSERROR;
 666                                 }
 667                         }
 668                 }
 669         }
 670
 671         /* atomic check, no locking need when called from above */
 672         return (kn->kn_fflags != 0);
 673 }
 674
 675 #if VM_PRESSURE_EVENTS
 676 /*
 677  * Virtual memory kevents
 678  *
 679  * author: Matt Jacobson [matthew_jacobson@apple.com]
 680  */
 681
 682 static int
 683 filt_vmattach(struct knote *kn)
 684 {
 685         /*
 686          * The note will be cleared once the information has been flushed to
 687          * the client. If there is still pressure, we will be re-alerted.
 688          */
 689         kn->kn_flags |= EV_CLEAR;
 690         return (vm_knote_register(kn));
 691 }
 692
 693 static void
 694 filt_vmdetach(struct knote *kn)
 695 {
 696         vm_knote_unregister(kn);
 697 }
 698
 699 static int
 700 filt_vm(struct knote *kn, long hint)
 701 {
 702         /* hint == 0 means this is just an alive? check (always true) */
 703         if (hint != 0) {
 704                 const pid_t pid = (pid_t)hint;
 705                 if ((kn->kn_sfflags & NOTE_VM_PRESSURE) &&
 706                     (kn->kn_kq->kq_p->p_pid == pid)) {
 707                         kn->kn_fflags |= NOTE_VM_PRESSURE;
 708                 }
 709         }
 710
 711         return (kn->kn_fflags != 0);
 712 }
 713 #endif /* VM_PRESSURE_EVENTS */
 714
 715 /*
 716  * filt_timervalidate - process data from user
 717  *
 718  *      Converts to either interval or deadline format.
 719  *
 720  *      The saved-data field in the knote contains the
 721  *      time value.  The saved filter-flags indicates
 722  *      the unit of measurement.
 723  *
 724  *      After validation, either the saved-data field
 725  *      contains the interval in absolute time, or ext[0]
 726  *      contains the expected deadline. If that deadline
 727  *      is in the past, ext[0] is 0.
 728  *
 729  *      Returns EINVAL for unrecognized units of time.
 730  *
 731  *      Timer filter lock is held.
 732  *
 733  */
 734 static int
 735 filt_timervalidate(struct knote *kn)
 736 {
 737         uint64_t multiplier;
 738         uint64_t raw = 0;
 739
 740         switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
 741         case NOTE_SECONDS:
 742                 multiplier = NSEC_PER_SEC;
 743                 break;
 744         case NOTE_USECONDS:
 745                 multiplier = NSEC_PER_USEC;
 746                 break;
 747         case NOTE_NSECONDS:
 748                 multiplier = 1;
 749                 break;
 750         case 0: /* milliseconds (default) */
 751                 multiplier = NSEC_PER_SEC / 1000;
 752                 break;
 753         default:
 754                 return (EINVAL);
 755         }
 756
 757         /* transform the slop delta(leeway) in kn_ext[1] if passed to same time scale */
 758         if(kn->kn_sfflags & NOTE_LEEWAY){
 759                 nanoseconds_to_absolutetime((uint64_t)kn->kn_ext[1] * multiplier, &raw);
 760                 kn->kn_ext[1] = raw;
 761         }
 762
 763         nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
 764
 765         kn->kn_ext[0] = 0;
 766         kn->kn_sdata = 0;
 767
 768         if (kn->kn_sfflags & NOTE_ABSOLUTE) {
 769                 clock_sec_t seconds;
 770                 clock_nsec_t nanoseconds;
 771                 uint64_t now;
 772
 773                 clock_get_calendar_nanotime(&seconds, &nanoseconds);
 774                 nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
 775                     nanoseconds, &now);
 776
 777                 if (raw < now) {
 778                         /* time has already passed */
 779                         kn->kn_ext[0] = 0;
 780                 } else {
 781                         raw -= now;
 782                         clock_absolutetime_interval_to_deadline(raw,
 783                             &kn->kn_ext[0]);
 784                 }
 785         } else {
 786                 kn->kn_sdata = raw;
 787         }
 788
 789         return (0);
 790 }
 791
 792 /*
 793  * filt_timerupdate - compute the next deadline
 794  *
 795  *      Repeating timers store their interval in kn_sdata. Absolute
 796  *      timers have already calculated the deadline, stored in ext[0].
 797  *
 798  *      On return, the next deadline (or zero if no deadline is needed)
 799  *      is stored in kn_ext[0].
 800  *
 801  *      Timer filter lock is held.
 802  */
 803 static void
 804 filt_timerupdate(struct knote *kn)
 805 {
 806         /* if there's no interval, deadline is just in kn_ext[0] */
 807         if (kn->kn_sdata == 0)
 808                 return;
 809
 810         /* if timer hasn't fired before, fire in interval nsecs */
 811         if (kn->kn_ext[0] == 0) {
 812                 clock_absolutetime_interval_to_deadline(kn->kn_sdata,
 813                     &kn->kn_ext[0]);
 814         } else {
 815                 /*
 816                  * If timer has fired before, schedule the next pop
 817                  * relative to the last intended deadline.
 818                  *
 819                  * We could check for whether the deadline has expired,
 820                  * but the thread call layer can handle that.
 821                  */
 822                 kn->kn_ext[0] += kn->kn_sdata;
 823         }
 824 }
 825
 826 /*
 827  * filt_timerexpire - the timer callout routine
 828  *
 829  * Just propagate the timer event into the knote
 830  * filter routine (by going through the knote
 831  * synchronization point).  Pass a hint to
 832  * indicate this is a real event, not just a
 833  * query from above.
 834  */
 835 static void
 836 filt_timerexpire(void *knx, __unused void *spare)
 837 {
 838         struct klist timer_list;
 839         struct knote *kn = knx;
 840
 841         filt_timerlock();
 842
 843         kn->kn_hookid &= ~TIMER_RUNNING;
 844
 845         /* no "object" for timers, so fake a list */
 846         SLIST_INIT(&timer_list);
 847         SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
 848         KNOTE(&timer_list, 1);
 849
 850         /* if someone is waiting for timer to pop */
 851         if (kn->kn_hookid & TIMER_CANCELWAIT) {
 852                 struct kqueue *kq = kn->kn_kq;
 853                 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
 854                                    CAST_EVENT64_T(&kn->kn_hook),
 855                                    THREAD_AWAKENED,
 856                                    WAITQ_ALL_PRIORITIES);
 857         }
 858
 859         filt_timerunlock();
 860 }
 861
 862 /*
 863  * Cancel a running timer (or wait for the pop).
 864  * Timer filter lock is held.
 865  */
 866 static void
 867 filt_timercancel(struct knote *kn)
 868 {
 869         struct kqueue *kq = kn->kn_kq;
 870         thread_call_t callout = kn->kn_hook;
 871         boolean_t cancelled;
 872
 873         if (kn->kn_hookid & TIMER_RUNNING) {
 874                 /* cancel the callout if we can */
 875                 cancelled = thread_call_cancel(callout);
 876                 if (cancelled) {
 877                         kn->kn_hookid &= ~TIMER_RUNNING;
 878                 } else {
 879                         /* we have to wait for the expire routine.  */
 880                         kn->kn_hookid |= TIMER_CANCELWAIT;
 881                         waitq_assert_wait64((struct waitq *)kq->kq_wqs,
 882                                             CAST_EVENT64_T(&kn->kn_hook),
 883                                             THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
 884                         filt_timerunlock();
 885                         thread_block(THREAD_CONTINUE_NULL);
 886                         filt_timerlock();
 887                         assert((kn->kn_hookid & TIMER_RUNNING) == 0);
 888                 }
 889         }
 890 }
 891
 892 /*
 893  * Allocate a thread call for the knote's lifetime, and kick off the timer.
 894  */
 895 static int
 896 filt_timerattach(struct knote *kn)
 897 {
 898         thread_call_t callout;
 899         int error;
 900
 901         callout = thread_call_allocate(filt_timerexpire, kn);
 902         if (NULL == callout)
 903                 return (ENOMEM);
 904
 905         filt_timerlock();
 906         error = filt_timervalidate(kn);
 907         if (error != 0) {
 908                 filt_timerunlock();
 909                 return (error);
 910         }
 911
 912         kn->kn_hook = (void*)callout;
 913         kn->kn_hookid = 0;
 914
 915         /* absolute=EV_ONESHOT */
 916         if (kn->kn_sfflags & NOTE_ABSOLUTE)
 917                 kn->kn_flags |= EV_ONESHOT;
 918
 919         filt_timerupdate(kn);
 920         if (kn->kn_ext[0]) {
 921                 kn->kn_flags |= EV_CLEAR;
 922                 unsigned int timer_flags = 0;
 923                 if (kn->kn_sfflags & NOTE_CRITICAL)
 924                         timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
 925                 else if (kn->kn_sfflags & NOTE_BACKGROUND)
 926                         timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
 927                 else
 928                         timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
 929
 930                 if (kn->kn_sfflags & NOTE_LEEWAY)
 931                         timer_flags |= THREAD_CALL_DELAY_LEEWAY;
 932
 933                 thread_call_enter_delayed_with_leeway(callout, NULL,
 934                                 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
 935
 936                 kn->kn_hookid |= TIMER_RUNNING;
 937         } else {
 938                 /* fake immediate */
 939                 kn->kn_data = 1;
 940         }
 941
 942         filt_timerunlock();
 943         return (0);
 944 }
 945
 946 /*
 947  * Shut down the timer if it's running, and free the callout.
 948  */
 949 static void
 950 filt_timerdetach(struct knote *kn)
 951 {
 952         thread_call_t callout;
 953
 954         filt_timerlock();
 955
 956         callout = (thread_call_t)kn->kn_hook;
 957         filt_timercancel(kn);
 958
 959         filt_timerunlock();
 960
 961         thread_call_free(callout);
 962 }
 963
 964
 965
 966 static int
 967 filt_timer(struct knote *kn, long hint)
 968 {
 969         int result;
 970
 971         if (hint) {
 972                 /* real timer pop -- timer lock held by filt_timerexpire */
 973                 kn->kn_data++;
 974
 975                 if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
 976                                 ((kn->kn_flags & EV_ONESHOT) == 0)) {
 977
 978                         /* evaluate next time to fire */
 979                         filt_timerupdate(kn);
 980
 981                         if (kn->kn_ext[0]) {
 982                                 unsigned int timer_flags = 0;
 983
 984                                 /* keep the callout and re-arm */
 985                                 if (kn->kn_sfflags & NOTE_CRITICAL)
 986                                         timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
 987                                 else if (kn->kn_sfflags & NOTE_BACKGROUND)
 988                                         timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
 989                                 else
 990                                         timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
 991
 992                                 if (kn->kn_sfflags & NOTE_LEEWAY)
 993                                         timer_flags |= THREAD_CALL_DELAY_LEEWAY;
 994
 995                                 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
 996                                                 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
 997
 998                                 kn->kn_hookid |= TIMER_RUNNING;
 999                         }
1000                 }
1001
1002                 return (1);
1003         }
1004
1005         /* user-query */
1006         filt_timerlock();
1007
1008         result = (kn->kn_data != 0);
1009
1010         filt_timerunlock();
1011
1012         return (result);
1013 }
1014
1015
1016 /*
1017  * filt_timertouch - update knote with new user input
1018  *
1019  * Cancel and restart the timer based on new user data. When
1020  * the user picks up a knote, clear the count of how many timer
1021  * pops have gone off (in kn_data).
1022  */
1023 static void
1024 filt_timertouch(struct knote *kn, struct kevent_internal_s *kev, long type)
1025 {
1026         int error;
1027         filt_timerlock();
1028
1029         switch (type) {
1030         case EVENT_REGISTER:
1031                 /* cancel current call */
1032                 filt_timercancel(kn);
1033
1034                 /* recalculate deadline */
1035                 kn->kn_sdata = kev->data;
1036                 kn->kn_sfflags = kev->fflags;
1037                 kn->kn_ext[0] = kev->ext[0];
1038                 kn->kn_ext[1] = kev->ext[1];
1039
1040                 error = filt_timervalidate(kn);
1041                 if (error) {
1042                         /* no way to report error, so mark it in the knote */
1043                         kn->kn_flags |= EV_ERROR;
1044                         kn->kn_data = error;
1045                         break;
1046                 }
1047
1048                 /* start timer if necessary */
1049                 filt_timerupdate(kn);
1050
1051                 if (kn->kn_ext[0]) {
1052                         unsigned int timer_flags = 0;
1053                         if (kn->kn_sfflags & NOTE_CRITICAL)
1054                                 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1055                         else if (kn->kn_sfflags & NOTE_BACKGROUND)
1056                                 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1057                         else
1058                                 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1059
1060                         if (kn->kn_sfflags & NOTE_LEEWAY)
1061                                 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1062
1063                         thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1064                                         kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1065
1066                         kn->kn_hookid |= TIMER_RUNNING;
1067                 } else {
1068                         /* pretend the timer has fired */
1069                         kn->kn_data = 1;
1070                 }
1071
1072                 break;
1073
1074         case EVENT_PROCESS:
1075                 /* reset the timer pop count in kn_data */
1076                 *kev = kn->kn_kevent;
1077                 kev->ext[0] = 0;
1078                 kn->kn_data = 0;
1079                 if (kn->kn_flags & EV_CLEAR)
1080                         kn->kn_fflags = 0;
1081                 break;
1082         default:
1083                 panic("%s: - invalid type (%ld)", __func__, type);
1084                 break;
1085         }
1086
1087         filt_timerunlock();
1088 }
1089
1090 static void
1091 filt_timerlock(void)
1092 {
1093         lck_mtx_lock(&_filt_timerlock);
1094 }
1095
1096 static void
1097 filt_timerunlock(void)
1098 {
1099         lck_mtx_unlock(&_filt_timerlock);
1100 }
1101
1102 static int
1103 filt_userattach(struct knote *kn)
1104 {
1105         /* EVFILT_USER knotes are not attached to anything in the kernel */
1106         kn->kn_hook = NULL;
1107         if (kn->kn_fflags & NOTE_TRIGGER) {
1108                 kn->kn_hookid = 1;
1109         } else {
1110                 kn->kn_hookid = 0;
1111         }
1112         return (0);
1113 }
1114
1115 static void
1116 filt_userdetach(__unused struct knote *kn)
1117 {
1118         /* EVFILT_USER knotes are not attached to anything in the kernel */
1119 }
1120
1121 static int
1122 filt_user(struct knote *kn, __unused long hint)
1123 {
1124         return (kn->kn_hookid);
1125 }
1126
1127 static void
1128 filt_usertouch(struct knote *kn, struct kevent_internal_s *kev, long type)
1129 {
1130         uint32_t ffctrl;
1131         switch (type) {
1132         case EVENT_REGISTER:
1133                 if (kev->fflags & NOTE_TRIGGER) {
1134                         kn->kn_hookid = 1;
1135                 }
1136
1137                 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1138                 kev->fflags &= NOTE_FFLAGSMASK;
1139                 switch (ffctrl) {
1140                 case NOTE_FFNOP:
1141                         break;
1142                 case NOTE_FFAND:
1143                         OSBitAndAtomic(kev->fflags, &kn->kn_sfflags);
1144                         break;
1145                 case NOTE_FFOR:
1146                         OSBitOrAtomic(kev->fflags, &kn->kn_sfflags);
1147                         break;
1148                 case NOTE_FFCOPY:
1149                         kn->kn_sfflags = kev->fflags;
1150                         break;
1151                 }
1152                 kn->kn_sdata = kev->data;
1153                 break;
1154         case EVENT_PROCESS:
1155                 *kev = kn->kn_kevent;
1156                 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1157                 kev->data = kn->kn_sdata;
1158                 if (kn->kn_flags & EV_CLEAR) {
1159                         kn->kn_hookid = 0;
1160                         kn->kn_data = 0;
1161                         kn->kn_fflags = 0;
1162                 }
1163                 break;
1164         default:
1165                 panic("%s: - invalid type (%ld)", __func__, type);
1166                 break;
1167         }
1168 }
1169
1170 /*
1171  * JMM - placeholder for not-yet-implemented filters
1172  */
1173 static int
1174 filt_badattach(__unused struct knote *kn)
1175 {
1176         return (ENOTSUP);
1177 }
1178
1179 struct kqueue *
1180 kqueue_alloc(struct proc *p)
1181 {
1182         struct filedesc *fdp = p->p_fd;
1183         struct kqueue *kq;
1184
1185         MALLOC_ZONE(kq, struct kqueue *, sizeof (struct kqueue), M_KQUEUE,
1186             M_WAITOK);
1187         if (kq != NULL) {
1188                 struct waitq_set *wqs;
1189
1190                 wqs = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST | SYNC_POLICY_DISABLE_IRQ);
1191                 if (wqs != NULL) {
1192                         bzero(kq, sizeof (struct kqueue));
1193                         lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1194                         TAILQ_INIT(&kq->kq_head);
1195                         kq->kq_wqs = wqs;
1196                         kq->kq_p = p;
1197                 } else {
1198                         FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
1199                         kq = NULL;
1200                 }
1201         }
1202
1203         if (fdp->fd_knlistsize < 0) {
1204                 proc_fdlock(p);
1205                 if (fdp->fd_knlistsize < 0)
1206                         fdp->fd_knlistsize = 0; /* this process has had a kq */
1207                 proc_fdunlock(p);
1208         }
1209
1210         return (kq);
1211 }
1212
1213 /*
1214  * kqueue_dealloc - detach all knotes from a kqueue and free it
1215  *
1216  *      We walk each list looking for knotes referencing this
1217  *      this kqueue.  If we find one, we try to drop it.  But
1218  *      if we fail to get a drop reference, that will wait
1219  *      until it is dropped.  So, we can just restart again
1220  *      safe in the assumption that the list will eventually
1221  *      not contain any more references to this kqueue (either
1222  *      we dropped them all, or someone else did).
1223  *
1224  *      Assumes no new events are being added to the kqueue.
1225  *      Nothing locked on entry or exit.
1226  */
1227 void
1228 kqueue_dealloc(struct kqueue *kq)
1229 {
1230         struct proc *p;
1231         struct filedesc *fdp;
1232         struct knote *kn;
1233         int i;
1234
1235         if (kq == NULL)
1236                 return;
1237
1238         p = kq->kq_p;
1239         fdp = p->p_fd;
1240
1241         proc_fdlock(p);
1242         for (i = 0; i < fdp->fd_knlistsize; i++) {
1243                 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1244                 while (kn != NULL) {
1245                         if (kq == kn->kn_kq) {
1246                                 kqlock(kq);
1247                                 proc_fdunlock(p);
1248                                 /* drop it ourselves or wait */
1249                                 if (kqlock2knotedrop(kq, kn)) {
1250                                         kn->kn_fop->f_detach(kn);
1251                                         knote_drop(kn, p);
1252                                 }
1253                                 proc_fdlock(p);
1254                                 /* start over at beginning of list */
1255                                 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1256                                 continue;
1257                         }
1258                         kn = SLIST_NEXT(kn, kn_link);
1259                 }
1260         }
1261         if (fdp->fd_knhashmask != 0) {
1262                 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1263                         kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1264                         while (kn != NULL) {
1265                                 if (kq == kn->kn_kq) {
1266                                         kqlock(kq);
1267                                         proc_fdunlock(p);
1268                                         /* drop it ourselves or wait */
1269                                         if (kqlock2knotedrop(kq, kn)) {
1270                                                 kn->kn_fop->f_detach(kn);
1271                                                 knote_drop(kn, p);
1272                                         }
1273                                         proc_fdlock(p);
1274                                         /* start over at beginning of list */
1275                                         kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1276                                         continue;
1277                                 }
1278                                 kn = SLIST_NEXT(kn, kn_link);
1279                         }
1280                 }
1281         }
1282         proc_fdunlock(p);
1283
1284         /*
1285          * waitq_set_free() clears all preposts and also remove the KQ's
1286          * waitq set from any select sets to which it may belong.
1287          */
1288         waitq_set_free(kq->kq_wqs);
1289         kq->kq_wqs = NULL;
1290         lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1291         FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
1292 }
1293
1294 int
1295 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
1296 {
1297         struct kqueue *kq;
1298         struct fileproc *fp;
1299         int fd, error;
1300
1301         error = falloc_withalloc(p,
1302             &fp, &fd, vfs_context_current(), fp_zalloc, cra);
1303         if (error) {
1304                 return (error);
1305         }
1306
1307         kq = kqueue_alloc(p);
1308         if (kq == NULL) {
1309                 fp_free(p, fd, fp);
1310                 return (ENOMEM);
1311         }
1312
1313         fp->f_flag = FREAD | FWRITE;
1314         fp->f_ops = &kqueueops;
1315         fp->f_data = kq;
1316
1317         proc_fdlock(p);
1318         *fdflags(p, fd) |= UF_EXCLOSE;
1319         procfdtbl_releasefd(p, fd, NULL);
1320         fp_drop(p, fd, fp, 1);
1321         proc_fdunlock(p);
1322
1323         *retval = fd;
1324         return (error);
1325 }
1326
1327 int
1328 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1329 {
1330         return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
1331 }
1332
1333 static int
1334 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
1335     unsigned int flags)
1336 {
1337         int advance;
1338         int error;
1339
1340         if (flags & KEVENT_FLAG_LEGACY32) {
1341                 bzero(kevp, sizeof (*kevp));
1342
1343                 if (IS_64BIT_PROCESS(p)) {
1344                         struct user64_kevent kev64;
1345
1346                         advance = sizeof (kev64);
1347                         error = copyin(*addrp, (caddr_t)&kev64, advance);
1348                         if (error)
1349                                 return (error);
1350                         kevp->ident = kev64.ident;
1351                         kevp->filter = kev64.filter;
1352                         kevp->flags = kev64.flags;
1353                         kevp->udata = kev64.udata;
1354                         kevp->fflags = kev64.fflags;
1355                         kevp->data = kev64.data;
1356                 } else {
1357                         struct user32_kevent kev32;
1358
1359                         advance = sizeof (kev32);
1360                         error = copyin(*addrp, (caddr_t)&kev32, advance);
1361                         if (error)
1362                                 return (error);
1363                         kevp->ident = (uintptr_t)kev32.ident;
1364                         kevp->filter = kev32.filter;
1365                         kevp->flags = kev32.flags;
1366                         kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1367                         kevp->fflags = kev32.fflags;
1368                         kevp->data = (intptr_t)kev32.data;
1369                 }
1370         } else if (flags & KEVENT_FLAG_LEGACY64) {
1371                 struct kevent64_s kev64;
1372
1373                 bzero(kevp, sizeof (*kevp));
1374
1375                 advance = sizeof (struct kevent64_s);
1376                 error = copyin(*addrp, (caddr_t)&kev64, advance);
1377                 if (error)
1378                         return(error);
1379                 kevp->ident = kev64.ident;
1380                 kevp->filter = kev64.filter;
1381                 kevp->flags = kev64.flags;
1382                 kevp->udata = kev64.udata;
1383                 kevp->fflags = kev64.fflags;
1384                 kevp->data = kev64.data;
1385                 kevp->ext[0] = kev64.ext[0];
1386                 kevp->ext[1] = kev64.ext[1];
1387
1388         } else {
1389                 struct kevent_qos_s kevqos;
1390
1391                 bzero(kevp, sizeof (*kevp));
1392
1393                 advance = sizeof (struct kevent_qos_s);
1394                 error = copyin(*addrp, (caddr_t)&kevqos, advance);
1395                 if (error)
1396                         return error;
1397                 kevp->ident = kevqos.ident;
1398                 kevp->filter = kevqos.filter;
1399                 kevp->flags = kevqos.flags;
1400                 kevp->udata = kevqos.udata;
1401                 kevp->fflags = kevqos.fflags;
1402                 kevp->data = kevqos.data;
1403                 kevp->ext[0] = kevqos.ext[0];
1404                 kevp->ext[1] = kevqos.ext[1];
1405         }
1406         if (!error)
1407                 *addrp += advance;
1408         return (error);
1409 }
1410
1411 static int
1412 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
1413     unsigned int flags)
1414 {
1415         user_addr_t addr = *addrp;
1416         int advance;
1417         int error;
1418
1419         if (flags & KEVENT_FLAG_LEGACY32) {
1420                 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
1421
1422                 if (IS_64BIT_PROCESS(p)) {
1423                         struct user64_kevent kev64;
1424
1425                         /*
1426                          * deal with the special case of a user-supplied
1427                          * value of (uintptr_t)-1.
1428                          */
1429                         kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1430                                 (uint64_t)-1LL : (uint64_t)kevp->ident;
1431
1432                         kev64.filter = kevp->filter;
1433                         kev64.flags = kevp->flags;
1434                         kev64.fflags = kevp->fflags;
1435                         kev64.data = (int64_t) kevp->data;
1436                         kev64.udata = kevp->udata;
1437                         advance = sizeof (kev64);
1438                         error = copyout((caddr_t)&kev64, addr, advance);
1439                 } else {
1440                         struct user32_kevent kev32;
1441
1442                         kev32.ident = (uint32_t)kevp->ident;
1443                         kev32.filter = kevp->filter;
1444                         kev32.flags = kevp->flags;
1445                         kev32.fflags = kevp->fflags;
1446                         kev32.data = (int32_t)kevp->data;
1447                         kev32.udata = kevp->udata;
1448                         advance = sizeof (kev32);
1449                         error = copyout((caddr_t)&kev32, addr, advance);
1450                 }
1451         } else if (flags & KEVENT_FLAG_LEGACY64) {
1452                 struct kevent64_s kev64;
1453
1454                 advance = sizeof (struct kevent64_s);
1455                 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1456                         addr -= advance;
1457                 }
1458                 kev64.ident = kevp->ident;
1459                 kev64.filter = kevp->filter;
1460                 kev64.flags = kevp->flags;
1461                 kev64.fflags = kevp->fflags;
1462                 kev64.data = (int64_t) kevp->data;
1463                 kev64.udata = kevp->udata;
1464                 kev64.ext[0] = kevp->ext[0];
1465                 kev64.ext[1] = kevp->ext[1];
1466                 error = copyout((caddr_t)&kev64, addr, advance);
1467         } else {
1468                 struct kevent_qos_s kevqos;
1469
1470                 bzero(&kevqos, sizeof (struct kevent_qos_s));
1471                 advance = sizeof (struct kevent_qos_s);
1472                 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1473                         addr -= advance;
1474                 }
1475                 kevqos.ident = kevp->ident;
1476                 kevqos.filter = kevp->filter;
1477                 kevqos.flags = kevp->flags;
1478                 kevqos.fflags = kevp->fflags;
1479                 kevqos.data = (int64_t) kevp->data;
1480                 kevqos.udata = kevp->udata;
1481                 kevqos.ext[0] = kevp->ext[0];
1482                 kevqos.ext[1] = kevp->ext[1];
1483                 error = copyout((caddr_t)&kevqos, addr, advance);
1484         }
1485         if (!error) {
1486                 if (flags & KEVENT_FLAG_STACK_EVENTS)
1487                         *addrp = addr;
1488                 else
1489                         *addrp = addr + advance;
1490         }
1491         return (error);
1492 }
1493
1494 /*
1495  * kevent_continue - continue a kevent syscall after blocking
1496  *
1497  *      assume we inherit a use count on the kq fileglob.
1498  */
1499
1500 static void
1501 kevent_continue(__unused struct kqueue *kq, void *data, int error)
1502 {
1503         struct _kevent *cont_args;
1504         struct fileproc *fp;
1505         int32_t *retval;
1506         int noutputs;
1507         int fd;
1508         struct proc *p = current_proc();
1509
1510         cont_args = (struct _kevent *)data;
1511         noutputs = cont_args->eventout;
1512         retval = cont_args->retval;
1513         fd = cont_args->fd;
1514         fp = cont_args->fp;
1515
1516         if (fp != NULL)
1517                 fp_drop(p, fd, fp, 0);
1518
1519         /* don't restart after signals... */
1520         if (error == ERESTART)
1521                 error = EINTR;
1522         else if (error == EWOULDBLOCK)
1523                 error = 0;
1524         if (error == 0)
1525                 *retval = noutputs;
1526         unix_syscall_return(error);
1527 }
1528
1529 /*
1530  * kevent - [syscall] register and wait for kernel events
1531  *
1532  */
1533 int
1534 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
1535 {
1536         unsigned int flags = KEVENT_FLAG_LEGACY32;
1537
1538         return kevent_internal(p,
1539                                uap->fd,
1540                                uap->changelist, uap->nchanges,
1541                                uap->eventlist, uap->nevents,
1542                                0ULL, 0ULL,
1543                                flags,
1544                                uap->timeout,
1545                                kevent_continue,
1546                                retval);
1547 }
1548
1549 int
1550 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
1551 {
1552         unsigned int flags;
1553
1554         /* restrict to user flags and set legacy64 */
1555         flags = uap->flags & KEVENT_FLAG_USER;
1556         flags |= KEVENT_FLAG_LEGACY64;
1557
1558         return kevent_internal(p,
1559                                uap->fd,
1560                                uap->changelist, uap->nchanges,
1561                                uap->eventlist, uap->nevents,
1562                                0ULL, 0ULL,
1563                                flags,
1564                                uap->timeout,
1565                                kevent_continue,
1566                                retval);
1567 }
1568
1569 int
1570 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
1571 {
1572         user_size_t usize = 0;
1573         user_size_t ssize;
1574         int error;
1575
1576         /* restrict to user flags */
1577         uap->flags &= KEVENT_FLAG_USER;
1578
1579         if (uap->data_available) {
1580                 if (!IS_64BIT_PROCESS(p)) {
1581                         uint32_t csize;
1582
1583                         error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize));
1584                         if (error)
1585                                 return error;
1586                         usize = csize;
1587                 } else {
1588                         uint64_t csize;
1589                         error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize));
1590                         if (error)
1591                                 return error;
1592                         usize = csize;
1593                 }
1594         }
1595         ssize = usize;
1596
1597         error = kevent_internal(p,
1598                                 uap->fd,
1599                                 uap->changelist, uap->nchanges,
1600                                 uap->eventlist, uap->nevents,
1601                                 uap->data_out, &usize,
1602                                 uap->flags,
1603                                 0ULL,
1604                                 kevent_continue,
1605                                 retval);
1606
1607         if (error == 0 && uap->data_available && usize != ssize) {
1608                 if (!IS_64BIT_PROCESS(p)) {
1609                         uint32_t csize = (uint32_t)usize;
1610
1611                         error = copyout((caddr_t)&csize, uap->data_available, sizeof(csize));
1612                 } else {
1613                         error = copyout((caddr_t)&usize, uap->data_available, sizeof(usize));
1614                 }
1615         }
1616         return error;
1617 }
1618
1619 int
1620 kevent_qos_internal(struct proc *p, int fd,
1621                     user_addr_t changelist, int nchanges,
1622                     user_addr_t eventlist, int nevents,
1623                     user_addr_t data_out, user_size_t *data_available,
1624                     unsigned int flags,
1625                     int32_t *retval)
1626 {
1627         return kevent_internal(p,
1628                                fd,
1629                                changelist, nchanges,
1630                                eventlist, nevents,
1631                                data_out, data_available,
1632                                flags,
1633                                0ULL,
1634                                NULL,
1635                                retval);
1636 }
1637
1638 static int
1639 kevent_internal(struct proc *p,
1640                 int fd,
1641                 user_addr_t changelist, int nchanges,
1642                 user_addr_t ueventlist, int nevents,
1643                 user_addr_t data_out, user_size_t *data_available,
1644                 unsigned int flags,
1645                 user_addr_t utimeout,
1646                 kqueue_continue_t continuation,
1647                 int32_t *retval)
1648 {
1649         struct _kevent *cont_args;
1650         uthread_t ut;
1651         struct kqueue *kq;
1652         struct fileproc *fp = NULL;
1653         struct kevent_internal_s kev;
1654         int error, noutputs;
1655         struct timeval atv;
1656
1657 #if 1
1658         /* temporarily ignore these fields */
1659         (void)data_out;
1660         (void)data_available;
1661 #endif
1662
1663         /* prepare to deal with stack-wise allocation of out events */
1664         if (flags & KEVENT_FLAG_STACK_EVENTS) {
1665                 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
1666                              (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
1667                                                     sizeof(struct user32_kevent)) :
1668                              ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
1669                                                                sizeof(struct kevent_qos_s)));
1670                 ueventlist += nevents * scale;
1671         }
1672
1673         /* convert timeout to absolute - if we have one (and not immediate) */
1674         if (flags & KEVENT_FLAG_IMMEDIATE) {
1675                 getmicrouptime(&atv);
1676         } else if (utimeout != USER_ADDR_NULL) {
1677                 struct timeval rtv;
1678                 if (IS_64BIT_PROCESS(p)) {
1679                         struct user64_timespec ts;
1680                         error = copyin(utimeout, &ts, sizeof(ts));
1681                         if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
1682                                 error = EINVAL;
1683                         else
1684                                 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1685                 } else {
1686                         struct user32_timespec ts;
1687                         error = copyin(utimeout, &ts, sizeof(ts));
1688                         TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1689                 }
1690                 if (error)
1691                         return (error);
1692                 if (itimerfix(&rtv))
1693                         return (EINVAL);
1694                 getmicrouptime(&atv);
1695                 timevaladd(&atv, &rtv);
1696         } else {
1697                 /* wait forever value */
1698                 atv.tv_sec = 0;
1699                 atv.tv_usec = 0;
1700         }
1701
1702         if (flags & KEVENT_FLAG_WORKQ) {
1703                 /*
1704                  * use the private kq associated with the proc workq.
1705                  * Just being a thread within the process (and not
1706                  * being the exit/exec thread) is enough to hold a
1707                  * reference on this special kq.
1708                  */
1709                 kq = p->p_wqkqueue;
1710                 if (kq == NULL) {
1711                         struct kqueue *alloc_kq = kqueue_alloc(p);
1712                         if (alloc_kq == NULL)
1713                                 return ENOMEM;
1714
1715                         proc_fdlock(p);
1716                         if (p->p_wqkqueue == NULL) {
1717                                 /*
1718                                  * The kq is marked as special -
1719                                  * with unique interactions with
1720                                  * the workq for this process.
1721                                  */
1722                                 alloc_kq->kq_state |= KQ_WORKQ;
1723                                 kq = p->p_wqkqueue = alloc_kq;
1724                                 proc_fdunlock(p);
1725                         } else {
1726                                 proc_fdunlock(p);
1727                                 kq = p->p_wqkqueue;
1728                                 kqueue_dealloc(alloc_kq);
1729                         }
1730                 }
1731         } else {
1732                 /* get a usecount for the kq itself */
1733                 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
1734                         return (error);
1735         }
1736
1737         /* each kq should only be used for events of one type */
1738         kqlock(kq);
1739         if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
1740                 if (flags & KEVENT_FLAG_LEGACY32) {
1741                         if ((kq->kq_state & KQ_KEV32) == 0) {
1742                                 error = EINVAL;
1743                                 kqunlock(kq);
1744                                 goto errorout;
1745                         }
1746                 } else if (kq->kq_state & KQ_KEV32) {
1747                         error = EINVAL;
1748                         kqunlock(kq);
1749                         goto errorout;
1750                 }
1751         } else if (flags & KEVENT_FLAG_LEGACY32) {
1752                 kq->kq_state |= KQ_KEV32;
1753         } else {
1754                 /* JMM - set KQ_KEVQOS when we are ready for exclusive */
1755                 kq->kq_state |= KQ_KEV64;
1756         }
1757         kqunlock(kq);
1758
1759         /* register all the change requests the user provided... */
1760         noutputs = 0;
1761         while (nchanges > 0 && error == 0) {
1762                 error = kevent_copyin(&changelist, &kev, p, flags);
1763                 if (error)
1764                         break;
1765
1766                 kev.flags &= ~EV_SYSFLAGS;
1767                 error = kevent_register(kq, &kev, p);
1768                 if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) {
1769                         kev.flags = EV_ERROR;
1770                         kev.data = error;
1771                         error = kevent_copyout(&kev, &ueventlist, p, flags);
1772                         if (error == 0) {
1773                                 nevents--;
1774                                 noutputs++;
1775                         }
1776                 }
1777                 nchanges--;
1778         }
1779
1780         /* short-circuit the scan if we only want error events */
1781         if (flags & KEVENT_FLAG_ERROR_EVENTS)
1782                 nevents = 0;
1783
1784         if (nevents > 0 && noutputs == 0 && error == 0) {
1785
1786                 /* store the continuation/completion data in the uthread */
1787                 ut = (uthread_t)get_bsdthread_info(current_thread());
1788                 cont_args = &ut->uu_kevent.ss_kevent;
1789                 cont_args->fp = fp;
1790                 cont_args->fd = fd;
1791                 cont_args->retval = retval;
1792                 cont_args->eventlist = ueventlist;
1793                 cont_args->eventcount = nevents;
1794                 cont_args->eventout = noutputs;
1795                 cont_args->eventflags = flags;
1796
1797                 error = kqueue_scan(kq, kevent_callback,
1798                                     continuation, cont_args,
1799                                     &atv, p);
1800
1801                 noutputs = cont_args->eventout;
1802         }
1803
1804         /* don't restart after signals... */
1805         if (error == ERESTART)
1806                 error = EINTR;
1807         else if (error == EWOULDBLOCK)
1808                 error = 0;
1809         if (error == 0)
1810                 *retval = noutputs;
1811 errorout:
1812         if (fp != NULL)
1813                 fp_drop(p, fd, fp, 0);
1814         return (error);
1815 }
1816
1817
1818 /*
1819  * kevent_callback - callback for each individual event
1820  *
1821  * called with nothing locked
1822  * caller holds a reference on the kqueue
1823  */
1824 static int
1825 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
1826     void *data)
1827 {
1828         struct _kevent *cont_args;
1829         int error;
1830
1831         cont_args = (struct _kevent *)data;
1832         assert(cont_args->eventout < cont_args->eventcount);
1833
1834         /*
1835          * Copy out the appropriate amount of event data for this user.
1836          */
1837         error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
1838                                cont_args->eventflags);
1839
1840         /*
1841          * If there isn't space for additional events, return
1842          * a harmless error to stop the processing here
1843          */
1844         if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
1845                 error = EWOULDBLOCK;
1846         return (error);
1847 }
1848
1849 /*
1850  * kevent_description - format a description of a kevent for diagnostic output
1851  *
1852  * called with a 256-byte string buffer
1853  */
1854
1855 char *
1856 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
1857 {
1858         snprintf(s, n,
1859             "kevent="
1860             "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
1861             kevp->ident,
1862             kevp->filter,
1863             kevp->flags,
1864             kevp->udata,
1865             kevp->fflags,
1866             kevp->data,
1867             kevp->ext[0],
1868             kevp->ext[1] );
1869
1870         return (s);
1871 }
1872
1873 /*
1874  * kevent_register - add a new event to a kqueue
1875  *
1876  *      Creates a mapping between the event source and
1877  *      the kqueue via a knote data structure.
1878  *
1879  *      Because many/most the event sources are file
1880  *      descriptor related, the knote is linked off
1881  *      the filedescriptor table for quick access.
1882  *
1883  *      called with nothing locked
1884  *      caller holds a reference on the kqueue
1885  */
1886
1887 int
1888 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
1889     __unused struct proc *ctxp)
1890 {
1891         struct proc *p = kq->kq_p;
1892         struct filedesc *fdp = p->p_fd;
1893         struct filterops *fops;
1894         struct fileproc *fp = NULL;
1895         struct knote *kn = NULL;
1896         struct klist *list;
1897         int error = 0;
1898
1899         if (kev->filter < 0) {
1900                 if (kev->filter + EVFILT_SYSCOUNT < 0)
1901                         return (EINVAL);
1902                 fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
1903         } else {
1904                 return (EINVAL);
1905         }
1906
1907 restart:
1908         /* this iocount needs to be dropped if it is not registered */
1909         list = NULL;
1910         proc_fdlock(p);
1911
1912         /*
1913          * determine where to look for the knote
1914          */
1915         if (fops->f_isfd) {
1916                 if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
1917                         proc_fdunlock(p);
1918                         return (error);
1919                 }
1920                 /* fd-based knotes are linked off the fd table */
1921                 if (kev->ident < (u_int)fdp->fd_knlistsize) {
1922                         list = &fdp->fd_knlist[kev->ident];
1923                 }
1924         } else if (fdp->fd_knhashmask != 0) {
1925                 /* hash non-fd knotes here too */
1926                 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1927         }
1928
1929         /*
1930          * scan the selected list looking for a match
1931          */
1932         if (list != NULL) {
1933                 SLIST_FOREACH(kn, list, kn_link) {
1934                         if (kq == kn->kn_kq &&
1935                             kev->ident == kn->kn_id &&
1936                             kev->filter == kn->kn_filter) {
1937                                 if (kev->flags & EV_UDATA_SPECIFIC) {
1938                                         if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
1939                                             kev->udata == kn->kn_udata) {
1940                                                 break; /* matching udata-specific knote */
1941                                         }
1942                                 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
1943                                         break; /* matching non-udata-specific knote */
1944                                 }
1945                         }
1946                 }
1947         }
1948
1949         /*
1950          * kn now contains the matching knote, or NULL if no match
1951          */
1952         if (kn == NULL) {
1953                 if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) {
1954                         kn = knote_alloc();
1955                         if (kn == NULL) {
1956                                 proc_fdunlock(p);
1957                                 error = ENOMEM;
1958                                 goto done;
1959                         }
1960                         kn->kn_fp = fp;
1961                         kn->kn_kq = kq;
1962                         kn->kn_tq = &kq->kq_head;
1963                         kn->kn_fop = fops;
1964                         kn->kn_sfflags = kev->fflags;
1965                         kn->kn_sdata = kev->data;
1966                         kev->fflags = 0;
1967                         kev->data = 0;
1968                         kn->kn_kevent = *kev;
1969                         kn->kn_inuse = 1;  /* for f_attach() */
1970                         kn->kn_status = KN_ATTACHING;
1971
1972                         /* before anyone can find it */
1973                         if (kev->flags & EV_DISABLE)
1974                                 kn->kn_status |= KN_DISABLED;
1975
1976                         error = knote_fdpattach(kn, fdp, p);
1977                         proc_fdunlock(p);
1978
1979                         if (error) {
1980                                 knote_free(kn);
1981                                 goto done;
1982                         }
1983
1984                         /*
1985                          * apply reference count to knote structure, and
1986                          * do not release it at the end of this routine.
1987                          */
1988                         fp = NULL;
1989
1990                         error = fops->f_attach(kn);
1991
1992                         kqlock(kq);
1993
1994                         if (error != 0) {
1995                                 /*
1996                                  * Failed to attach correctly, so drop.
1997                                  * All other possible users/droppers
1998                                  * have deferred to us.
1999                                  */
2000                                 kn->kn_status |= KN_DROPPING;
2001                                 kqunlock(kq);
2002                                 knote_drop(kn, p);
2003                                 goto done;
2004                         } else if (kn->kn_status & KN_DROPPING) {
2005                                 /*
2006                                  * Attach succeeded, but someone else
2007                                  * deferred their drop - now we have
2008                                  * to do it for them (after detaching).
2009                                  */
2010                                 kqunlock(kq);
2011                                 kn->kn_fop->f_detach(kn);
2012                                 knote_drop(kn, p);
2013                                 goto done;
2014                         }
2015                         kn->kn_status &= ~KN_ATTACHING;
2016                         kqunlock(kq);
2017                 } else {
2018                         proc_fdunlock(p);
2019                         error = ENOENT;
2020                         goto done;
2021                 }
2022         } else {
2023                 /* existing knote - get kqueue lock */
2024                 kqlock(kq);
2025                 proc_fdunlock(p);
2026
2027                 if (kev->flags & EV_DELETE) {
2028                         if ((kev->flags & EV_ENABLE) == 0 &&
2029                             (kev->flags & EV_DISPATCH2) == EV_DISPATCH2 &&
2030                             (kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2031                                 /* mark for deferred drop */
2032                                 kn->kn_status |= KN_DEFERDROP;
2033                                 kqunlock(kq);
2034                                 error = EINPROGRESS;
2035                         } else {
2036                                 knote_dequeue(kn);
2037                                 kn->kn_status |= KN_DISABLED;
2038                                 if (kqlock2knotedrop(kq, kn)) {
2039                                         kn->kn_fop->f_detach(kn);
2040                                         knote_drop(kn, p);
2041                                 } else {
2042                                         /* pretend we didn't find it */
2043                                         error = ENOENT;
2044                                 }
2045                         }
2046                         goto done;
2047                 }
2048
2049                 /* update status flags for existing knote */
2050                 if (kev->flags & EV_DISABLE) {
2051                         knote_dequeue(kn);
2052                         kn->kn_status |= KN_DISABLED;
2053
2054                 } else if ((kev->flags & EV_ENABLE) &&
2055                            (kn->kn_status & KN_DISABLED)) {
2056                         kn->kn_status &= ~KN_DISABLED;
2057
2058                         /* handle deferred drop */
2059                         if (kn->kn_status & KN_DEFERDROP) {
2060                                 kn->kn_status &= ~KN_DEFERDROP;
2061                                 kn->kn_flags |= (EV_DELETE | EV_ONESHOT);
2062                                 knote_activate(kn, 0);
2063                                 kqunlock(kq);
2064                                 goto done;
2065                         }
2066
2067                         if (kn->kn_status & KN_ACTIVE) {
2068                                 /* force re-activate if previously active */
2069                                 knote_activate(kn, 1);
2070                         }
2071                 }
2072
2073                 /*
2074                  * The user may change some filter values after the
2075                  * initial EV_ADD, but doing so will not reset any
2076                  * filter which have already been triggered.
2077                  */
2078                 kn->kn_kevent.udata = kev->udata;
2079                 if (fops->f_isfd || fops->f_touch == NULL) {
2080                         kn->kn_sfflags = kev->fflags;
2081                         kn->kn_sdata = kev->data;
2082                 }
2083
2084                 /*
2085                  * If somebody is in the middle of dropping this
2086                  * knote - go find/insert a new one.  But we have
2087                  * wait for this one to go away first. Attaches
2088                  * running in parallel may also drop/modify the
2089                  * knote.  Wait for those to complete as well and
2090                  * then start over if we encounter one.
2091                  */
2092                 if (!kqlock2knoteusewait(kq, kn)) {
2093                         /* kqueue, proc_fdlock both unlocked */
2094                         goto restart;
2095                 }
2096
2097                 /*
2098                  * Call touch routine to notify filter of changes
2099                  * in filter values.
2100                  */
2101                 if (!fops->f_isfd && fops->f_touch != NULL)
2102                         fops->f_touch(kn, kev, EVENT_REGISTER);
2103         }
2104         /* still have use ref on knote */
2105
2106         /*
2107          * Invoke the filter routine to see if it should be enqueued now.
2108          */
2109 #if 0
2110         if (kn->kn_fop->f_event(kn, 0)) {
2111 #else
2112         /*
2113          * JMM - temporary workaround until rdar://problem/19986199
2114          * This potentially results in extra wakeups for KN_STAYQUEUED event types,
2115          * but waking up only truly active ones (yet trying below to determine
2116          * active status, by invoking the filter routine, is having side-effects).
2117          */
2118         if ((kn->kn_status & KN_STAYQUEUED) || kn->kn_fop->f_event(kn, 0)) {
2119 #endif
2120                 if (knoteuse2kqlock(kq, kn))
2121                         knote_activate(kn, (kn->kn_status & KN_STAYQUEUED));
2122                 kqunlock(kq);
2123         } else {
2124                 knote_put(kn);
2125         }
2126
2127 done:
2128         if (fp != NULL)
2129                 fp_drop(p, kev->ident, fp, 0);
2130         return (error);
2131 }
2132
2133
2134 /*
2135  * knote_process - process a triggered event
2136  *
2137  *      Validate that it is really still a triggered event
2138  *      by calling the filter routines (if necessary).  Hold
2139  *      a use reference on the knote to avoid it being detached.
2140  *      If it is still considered triggered, invoke the callback
2141  *      routine provided and move it to the provided inprocess
2142  *      queue.
2143  *
2144  *      caller holds a reference on the kqueue.
2145  *      kqueue locked on entry and exit - but may be dropped
2146  */
2147 static int
2148 knote_process(struct knote *kn,
2149     kevent_callback_t callback,
2150     void *data,
2151     struct kqtailq *inprocessp,
2152     struct proc *p)
2153 {
2154         struct kqueue *kq = kn->kn_kq;
2155         struct kevent_internal_s kev;
2156         int touch;
2157         int result;
2158         int error;
2159
2160         /*
2161          * Determine the kevent state we want to return.
2162          *
2163          * Some event states need to be revalidated before returning
2164          * them, others we take the snapshot at the time the event
2165          * was enqueued.
2166          *
2167          * Events with non-NULL f_touch operations must be touched.
2168          * Triggered events must fill in kev for the callback.
2169          *
2170          * Convert our lock to a use-count and call the event's
2171          * filter routine(s) to update.
2172          */
2173         if ((kn->kn_status & KN_DISABLED) != 0) {
2174                 result = 0;
2175                 touch = 0;
2176         } else {
2177                 int revalidate;
2178
2179                 result = 1;
2180                 revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 ||
2181                     (kn->kn_flags & EV_ONESHOT) == 0);
2182                 touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);
2183
2184                 if (revalidate || touch) {
2185                         if (revalidate)
2186                                 knote_deactivate(kn);
2187
2188                         /* call the filter/touch routines with just a ref */
2189                         if (kqlock2knoteuse(kq, kn)) {
2190                                 /* if we have to revalidate, call the filter */
2191                                 if (revalidate) {
2192                                         result = kn->kn_fop->f_event(kn, 0);
2193                                 }
2194
2195                                 /*
2196                                  * capture the kevent data - using touch if
2197                                  * specified
2198                                  */
2199                                 if (result && touch) {
2200                                         kn->kn_fop->f_touch(kn, &kev,
2201                                             EVENT_PROCESS);
2202                                 }
2203                                 if (result && (kn->kn_status & KN_TOUCH))
2204                                         kn->kn_fop->f_touch(kn, &kev,
2205                                             EVENT_PROCESS);
2206
2207                                 /*
2208                                  * convert back to a kqlock - bail if the knote
2209                                  * went away
2210                                  */
2211                                 if (!knoteuse2kqlock(kq, kn)) {
2212                                         return (EJUSTRETURN);
2213                                 } else if (result) {
2214                                         /*
2215                                          * if revalidated as alive, make sure
2216                                          * it's active
2217                                          */
2218                                         knote_activate(kn, 0);
2219
2220                                         /*
2221                                          * capture all events that occurred
2222                                          * during filter
2223                                          */
2224                                         if (!touch) {
2225                                                 kev = kn->kn_kevent;
2226                                         }
2227
2228                                 } else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2229                                         /*
2230                                          * was already dequeued, so just bail on
2231                                          * this one
2232                                          */
2233                                         return (EJUSTRETURN);
2234                                 }
2235                         } else {
2236                                 return (EJUSTRETURN);
2237                         }
2238                 } else {
2239                         kev = kn->kn_kevent;
2240                 }
2241         }
2242
2243         /* move knote onto inprocess queue */
2244         assert(kn->kn_tq == &kq->kq_head);
2245         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2246         kn->kn_tq = inprocessp;
2247         TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe);
2248
2249         /*
2250          * Determine how to dispatch the knote for future event handling.
2251          * not-fired: just return (do not callout).
2252          * One-shot: If dispatch2, enter deferred-delete mode (unless this is
2253          *           is the deferred delete event delivery itself).  Otherwise,
2254          *           deactivate and drop it.
2255          * Clear: deactivate and clear the state.
2256          * Dispatch: don't clear state, just deactivate it and mark it disabled.
2257          * All others: just leave where they are.
2258          */
2259
2260         if (result == 0) {
2261                 return (EJUSTRETURN);
2262         } else if ((kn->kn_flags & EV_ONESHOT) != 0) {
2263                 knote_deactivate(kn);
2264                 if ((kn->kn_flags & (EV_DISPATCH2|EV_DELETE)) == EV_DISPATCH2) {
2265                         /* defer dropping non-delete oneshot dispatch2 events */
2266                         kn->kn_status |= (KN_DISABLED | KN_DEFERDROP);
2267                         kqunlock(kq);
2268                 } else if (kqlock2knotedrop(kq, kn)) {
2269                         kn->kn_fop->f_detach(kn);
2270                         knote_drop(kn, p);
2271                 }
2272         } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
2273                 if ((kn->kn_flags & EV_DISPATCH) != 0) {
2274                         /* deactivate and disable all dispatch knotes */
2275                         knote_deactivate(kn);
2276                         kn->kn_status |= KN_DISABLED;
2277                 } else if (!touch || kn->kn_fflags == 0) {
2278                         /* only deactivate if nothing since the touch */
2279                         knote_deactivate(kn);
2280                 }
2281                 if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
2282                         /* manually clear non-touch knotes */
2283                         kn->kn_data = 0;
2284                         kn->kn_fflags = 0;
2285                 }
2286                 kqunlock(kq);
2287         } else {
2288                 /*
2289                  * leave on inprocess queue.  We'll
2290                  * move all the remaining ones back
2291                  * the kq queue and wakeup any
2292                  * waiters when we are done.
2293                  */
2294                 kqunlock(kq);
2295         }
2296
2297         /* callback to handle each event as we find it */
2298         error = (callback)(kq, &kev, data);
2299
2300         kqlock(kq);
2301         return (error);
2302 }
2303
2304 /*
2305  * Return 0 to indicate that processing should proceed,
2306  * -1 if there is nothing to process.
2307  *
2308  * Called with kqueue locked and returns the same way,
2309  * but may drop lock temporarily.
2310  */
2311 static int
2312 kqueue_begin_processing(struct kqueue *kq)
2313 {
2314         for (;;) {
2315                 if (kq->kq_count == 0) {
2316                         return (-1);
2317                 }
2318
2319                 /* if someone else is processing the queue, wait */
2320                 if (kq->kq_nprocess != 0) {
2321                         waitq_assert_wait64((struct waitq *)kq->kq_wqs,
2322                                             CAST_EVENT64_T(&kq->kq_nprocess),
2323                                             THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
2324                         kq->kq_state |= KQ_PROCWAIT;
2325                         kqunlock(kq);
2326                         thread_block(THREAD_CONTINUE_NULL);
2327                         kqlock(kq);
2328                 } else {
2329                         kq->kq_nprocess = 1;
2330                         return (0);
2331                 }
2332         }
2333 }
2334
2335 /*
2336  * Called with kqueue lock held.
2337  */
2338 static void
2339 kqueue_end_processing(struct kqueue *kq)
2340 {
2341         kq->kq_nprocess = 0;
2342         if (kq->kq_state & KQ_PROCWAIT) {
2343                 kq->kq_state &= ~KQ_PROCWAIT;
2344                 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
2345                                    CAST_EVENT64_T(&kq->kq_nprocess),
2346                                    THREAD_AWAKENED,
2347                                    WAITQ_ALL_PRIORITIES);
2348         }
2349 }
2350
2351 /*
2352  * kqueue_process - process the triggered events in a kqueue
2353  *
2354  *      Walk the queued knotes and validate that they are
2355  *      really still triggered events by calling the filter
2356  *      routines (if necessary).  Hold a use reference on
2357  *      the knote to avoid it being detached. For each event
2358  *      that is still considered triggered, invoke the
2359  *      callback routine provided.
2360  *
2361  *      caller holds a reference on the kqueue.
2362  *      kqueue locked on entry and exit - but may be dropped
2363  *      kqueue list locked (held for duration of call)
2364  */
2365
2366 static int
2367 kqueue_process(struct kqueue *kq,
2368     kevent_callback_t callback,
2369     void *data,
2370     int *countp,
2371     struct proc *p)
2372 {
2373         struct kqtailq inprocess;
2374         struct knote *kn;
2375         int nevents;
2376         int error;
2377
2378         TAILQ_INIT(&inprocess);
2379
2380         if (kqueue_begin_processing(kq) == -1) {
2381                 *countp = 0;
2382                 /* Nothing to process */
2383                 return (0);
2384         }
2385
2386         /*
2387          * Clear any pre-posted status from previous runs, so we
2388          * only detect events that occur during this run.
2389          */
2390         waitq_set_clear_preposts(kq->kq_wqs);
2391
2392         /*
2393          * loop through the enqueued knotes, processing each one and
2394          * revalidating those that need it. As they are processed,
2395          * they get moved to the inprocess queue (so the loop can end).
2396          */
2397         error = 0;
2398         nevents = 0;
2399
2400         while (error == 0 &&
2401             (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
2402                 error = knote_process(kn, callback, data, &inprocess, p);
2403                 if (error == EJUSTRETURN)
2404                         error = 0;
2405                 else
2406                         nevents++;
2407         }
2408
2409         /*
2410          * With the kqueue still locked, move any knotes
2411          * remaining on the inprocess queue back to the
2412          * kq's queue and wake up any waiters.
2413          */
2414         while ((kn = TAILQ_FIRST(&inprocess)) != NULL) {
2415                 assert(kn->kn_tq == &inprocess);
2416                 TAILQ_REMOVE(&inprocess, kn, kn_tqe);
2417                 kn->kn_tq = &kq->kq_head;
2418                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2419         }
2420
2421         kqueue_end_processing(kq);
2422
2423         *countp = nevents;
2424         return (error);
2425 }
2426
2427
2428 static void
2429 kqueue_scan_continue(void *data, wait_result_t wait_result)
2430 {
2431         thread_t self = current_thread();
2432         uthread_t ut = (uthread_t)get_bsdthread_info(self);
2433         struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
2434         struct kqueue *kq = (struct kqueue *)data;
2435         int error;
2436         int count;
2437
2438         /* convert the (previous) wait_result to a proper error */
2439         switch (wait_result) {
2440         case THREAD_AWAKENED:
2441                 kqlock(kq);
2442                 error = kqueue_process(kq, cont_args->call, cont_args, &count,
2443                     current_proc());
2444                 if (error == 0 && count == 0) {
2445                         waitq_assert_wait64((struct waitq *)kq->kq_wqs,
2446                                             KQ_EVENT, THREAD_ABORTSAFE,
2447                                             cont_args->deadline);
2448                         kq->kq_state |= KQ_SLEEP;
2449                         kqunlock(kq);
2450                         thread_block_parameter(kqueue_scan_continue, kq);
2451                         /* NOTREACHED */
2452                 }
2453                 kqunlock(kq);
2454                 break;
2455         case THREAD_TIMED_OUT:
2456                 error = EWOULDBLOCK;
2457                 break;
2458         case THREAD_INTERRUPTED:
2459                 error = EINTR;
2460                 break;
2461         default:
2462                 panic("%s: - invalid wait_result (%d)", __func__,
2463                     wait_result);
2464                 error = 0;
2465         }
2466
2467         /* call the continuation with the results */
2468         assert(cont_args->cont != NULL);
2469         (cont_args->cont)(kq, cont_args->data, error);
2470 }
2471
2472
2473 /*
2474  * kqueue_scan - scan and wait for events in a kqueue
2475  *
2476  *      Process the triggered events in a kqueue.
2477  *
2478  *      If there are no events triggered arrange to
2479  *      wait for them. If the caller provided a
2480  *      continuation routine, then kevent_scan will
2481  *      also.
2482  *
2483  *      The callback routine must be valid.
2484  *      The caller must hold a use-count reference on the kq.
2485  */
2486
2487 int
2488 kqueue_scan(struct kqueue *kq,
2489             kevent_callback_t callback,
2490             kqueue_continue_t continuation,
2491             void *data,
2492             struct timeval *atvp,
2493             struct proc *p)
2494 {
2495         thread_continue_t cont = THREAD_CONTINUE_NULL;
2496         uint64_t deadline;
2497         int error;
2498         int first;
2499
2500         assert(callback != NULL);
2501
2502         first = 1;
2503         for (;;) {
2504                 wait_result_t wait_result;
2505                 int count;
2506
2507                 /*
2508                  * Make a pass through the kq to find events already
2509                  * triggered.
2510                  */
2511                 kqlock(kq);
2512                 error = kqueue_process(kq, callback, data, &count, p);
2513                 if (error || count)
2514                         break; /* lock still held */
2515
2516                 /* looks like we have to consider blocking */
2517                 if (first) {
2518                         first = 0;
2519                         /* convert the timeout to a deadline once */
2520                         if (atvp->tv_sec || atvp->tv_usec) {
2521                                 uint64_t now;
2522
2523                                 clock_get_uptime(&now);
2524                                 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
2525                                                             atvp->tv_usec * (long)NSEC_PER_USEC,
2526                                                             &deadline);
2527                                 if (now >= deadline) {
2528                                         /* non-blocking call */
2529                                         error = EWOULDBLOCK;
2530                                         break; /* lock still held */
2531                                 }
2532                                 deadline -= now;
2533                                 clock_absolutetime_interval_to_deadline(deadline, &deadline);
2534                         } else {
2535                                 deadline = 0;   /* block forever */
2536                         }
2537
2538                         if (continuation) {
2539                                 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
2540                                 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
2541
2542                                 cont_args->call = callback;
2543                                 cont_args->cont = continuation;
2544                                 cont_args->deadline = deadline;
2545                                 cont_args->data = data;
2546                                 cont = kqueue_scan_continue;
2547                         }
2548                 }
2549
2550                 /* go ahead and wait */
2551                 waitq_assert_wait64_leeway((struct waitq *)kq->kq_wqs,
2552                                            KQ_EVENT, THREAD_ABORTSAFE,
2553                                            TIMEOUT_URGENCY_USER_NORMAL,
2554                                            deadline, TIMEOUT_NO_LEEWAY);
2555                 kq->kq_state |= KQ_SLEEP;
2556                 kqunlock(kq);
2557                 wait_result = thread_block_parameter(cont, kq);
2558                 /* NOTREACHED if (continuation != NULL) */
2559
2560                 switch (wait_result) {
2561                 case THREAD_AWAKENED:
2562                         continue;
2563                 case THREAD_TIMED_OUT:
2564                         return (EWOULDBLOCK);
2565                 case THREAD_INTERRUPTED:
2566                         return (EINTR);
2567                 default:
2568                         panic("%s: - bad wait_result (%d)", __func__,
2569                             wait_result);
2570                         error = 0;
2571                 }
2572         }
2573         kqunlock(kq);
2574         return (error);
2575 }
2576
2577
2578 /*
2579  * XXX
2580  * This could be expanded to call kqueue_scan, if desired.
2581  */
2582 /*ARGSUSED*/
2583 static int
2584 kqueue_read(__unused struct fileproc *fp,
2585     __unused struct uio *uio,
2586     __unused int flags,
2587     __unused vfs_context_t ctx)
2588 {
2589         return (ENXIO);
2590 }
2591
2592 /*ARGSUSED*/
2593 static int
2594 kqueue_write(__unused struct fileproc *fp,
2595     __unused struct uio *uio,
2596     __unused int flags,
2597     __unused vfs_context_t ctx)
2598 {
2599         return (ENXIO);
2600 }
2601
2602 /*ARGSUSED*/
2603 static int
2604 kqueue_ioctl(__unused struct fileproc *fp,
2605     __unused u_long com,
2606     __unused caddr_t data,
2607     __unused vfs_context_t ctx)
2608 {
2609         return (ENOTTY);
2610 }
2611
2612 /*ARGSUSED*/
2613 static int
2614 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
2615     __unused vfs_context_t ctx)
2616 {
2617         struct kqueue *kq = (struct kqueue *)fp->f_data;
2618         struct knote *kn;
2619         struct kqtailq inprocessq;
2620         int retnum = 0;
2621
2622         if (which != FREAD)
2623                 return (0);
2624
2625         TAILQ_INIT(&inprocessq);
2626
2627         kqlock(kq);
2628         /*
2629          * If this is the first pass, link the wait queue associated with the
2630          * the kqueue onto the wait queue set for the select().  Normally we
2631          * use selrecord() for this, but it uses the wait queue within the
2632          * selinfo structure and we need to use the main one for the kqueue to
2633          * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
2634          * (The select() call will unlink them when it ends).
2635          */
2636         if (wq_link_id != NULL) {
2637                 thread_t cur_act = current_thread();
2638                 struct uthread * ut = get_bsdthread_info(cur_act);
2639
2640                 kq->kq_state |= KQ_SEL;
2641                 waitq_link((struct waitq *)kq->kq_wqs, ut->uu_wqset,
2642                            WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
2643
2644                 /* always consume the reserved link object */
2645                 waitq_link_release(*(uint64_t *)wq_link_id);
2646                 *(uint64_t *)wq_link_id = 0;
2647
2648                 /*
2649                  * selprocess() is expecting that we send it back the waitq
2650                  * that was just added to the thread's waitq set. In order
2651                  * to not change the selrecord() API (which is exported to
2652                  * kexts), we pass this value back through the
2653                  * void *wq_link_id pointer we were passed. We need to use
2654                  * memcpy here because the pointer may not be properly aligned
2655                  * on 32-bit systems.
2656                  */
2657                 memcpy(wq_link_id, (void *)&(kq->kq_wqs), sizeof(void *));
2658         }
2659
2660         if (kqueue_begin_processing(kq) == -1) {
2661                 kqunlock(kq);
2662                 return (0);
2663         }
2664
2665         if (kq->kq_count != 0) {
2666                 /*
2667                  * there is something queued - but it might be a
2668                  * KN_STAYQUEUED knote, which may or may not have
2669                  * any events pending.  So, we have to walk the
2670                  * list of knotes to see, and peek at the stay-
2671                  * queued ones to be really sure.
2672                  */
2673                 while ((kn = (struct knote *)TAILQ_FIRST(&kq->kq_head)) != NULL) {
2674                         if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2675                                 retnum = 1;
2676                                 goto out;
2677                         }
2678
2679                         TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2680                         TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);
2681
2682                         if (kqlock2knoteuse(kq, kn)) {
2683                                 unsigned peek;
2684
2685                                 peek = kn->kn_fop->f_peek(kn);
2686                                 if (knoteuse2kqlock(kq, kn)) {
2687                                         if (peek > 0) {
2688                                                 retnum = 1;
2689                                                 goto out;
2690                                         }
2691                                 } else {
2692                                         retnum = 0;
2693                                 }
2694                         }
2695                 }
2696         }
2697
2698 out:
2699         /* Return knotes to active queue */
2700         while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
2701                 TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
2702                 kn->kn_tq = &kq->kq_head;
2703                 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2704         }
2705
2706         kqueue_end_processing(kq);
2707         kqunlock(kq);
2708         return (retnum);
2709 }
2710
2711 /*
2712  * kqueue_close -
2713  */
2714 /*ARGSUSED*/
2715 static int
2716 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
2717 {
2718         struct kqueue *kq = (struct kqueue *)fg->fg_data;
2719
2720         kqueue_dealloc(kq);
2721         fg->fg_data = NULL;
2722         return (0);
2723 }
2724
2725 /*ARGSUSED*/
2726 /*
2727  * The callers has taken a use-count reference on this kqueue and will donate it
2728  * to the kqueue we are being added to.  This keeps the kqueue from closing until
2729  * that relationship is torn down.
2730  */
2731 static int
2732 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
2733 {
2734         struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
2735         struct kqueue *parentkq = kn->kn_kq;
2736
2737         if (parentkq == kq ||
2738             kn->kn_filter != EVFILT_READ)
2739                 return (1);
2740
2741         /*
2742          * We have to avoid creating a cycle when nesting kqueues
2743          * inside another.  Rather than trying to walk the whole
2744          * potential DAG of nested kqueues, we just use a simple
2745          * ceiling protocol.  When a kqueue is inserted into another,
2746          * we check that the (future) parent is not already nested
2747          * into another kqueue at a lower level than the potenial
2748          * child (because it could indicate a cycle).  If that test
2749          * passes, we just mark the nesting levels accordingly.
2750          */
2751
2752         kqlock(parentkq);
2753         if (parentkq->kq_level > 0 &&
2754             parentkq->kq_level < kq->kq_level)
2755         {
2756                 kqunlock(parentkq);
2757                 return (1);
2758         } else {
2759                 /* set parent level appropriately */
2760                 if (parentkq->kq_level == 0)
2761                         parentkq->kq_level = 2;
2762                 if (parentkq->kq_level < kq->kq_level + 1)
2763                         parentkq->kq_level = kq->kq_level + 1;
2764                 kqunlock(parentkq);
2765
2766                 kn->kn_fop = &kqread_filtops;
2767                 kqlock(kq);
2768                 KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
2769                 /* indicate nesting in child, if needed */
2770                 if (kq->kq_level == 0)
2771                         kq->kq_level = 1;
2772                 kqunlock(kq);
2773                 return (0);
2774         }
2775 }
2776
2777 /*
2778  * kqueue_drain - called when kq is closed
2779  */
2780 /*ARGSUSED*/
2781 static int
2782 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
2783 {
2784         struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
2785         kqlock(kq);
2786         kqueue_wakeup(kq, 1);
2787         kqunlock(kq);
2788         return (0);
2789 }
2790
2791 /*ARGSUSED*/
2792 int
2793 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
2794 {
2795         kqlock(kq);
2796         if (isstat64 != 0) {
2797                 struct stat64 *sb64 = (struct stat64 *)ub;
2798
2799                 bzero((void *)sb64, sizeof(*sb64));
2800                 sb64->st_size = kq->kq_count;
2801                 if (kq->kq_state & KQ_KEV_QOS)
2802                         sb64->st_blksize = sizeof(struct kevent_qos_s);
2803                 else if (kq->kq_state & KQ_KEV64)
2804                         sb64->st_blksize = sizeof(struct kevent64_s);
2805                 else if (IS_64BIT_PROCESS(p))
2806                         sb64->st_blksize = sizeof(struct user64_kevent);
2807                 else
2808                         sb64->st_blksize = sizeof(struct user32_kevent);
2809                 sb64->st_mode = S_IFIFO;
2810         } else {
2811                 struct stat *sb = (struct stat *)ub;
2812
2813                 bzero((void *)sb, sizeof(*sb));
2814                 sb->st_size = kq->kq_count;
2815                 if (kq->kq_state & KQ_KEV_QOS)
2816                         sb->st_blksize = sizeof(struct kevent_qos_s);
2817                 else if (kq->kq_state & KQ_KEV64)
2818                         sb->st_blksize = sizeof(struct kevent64_s);
2819                 else if (IS_64BIT_PROCESS(p))
2820                         sb->st_blksize = sizeof(struct user64_kevent);
2821                 else
2822                         sb->st_blksize = sizeof(struct user32_kevent);
2823                 sb->st_mode = S_IFIFO;
2824         }
2825         kqunlock(kq);
2826         return (0);
2827 }
2828
2829 /*
2830  * Called with the kqueue locked
2831  */
2832 static void
2833 kqueue_wakeup(struct kqueue *kq, int closed)
2834 {
2835         wait_result_t res = THREAD_NOT_WAITING;
2836
2837         if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) {
2838                 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
2839                 res = waitq_wakeup64_all((struct waitq *)kq->kq_wqs, KQ_EVENT,
2840                                          (closed) ? THREAD_INTERRUPTED : THREAD_AWAKENED,
2841                                          WAITQ_ALL_PRIORITIES);
2842         }
2843
2844         /* request additional workq threads if appropriate */
2845         if (res == THREAD_NOT_WAITING && (kq->kq_state & KQ_WORKQ) &&
2846             pthread_functions != NULL && pthread_functions->workq_reqthreads != NULL) {
2847                 /*
2848                  * The special workq kq should be accumulating the counts of
2849                  * queued sources on a pthread_priority_t basis and we should
2850                  * be providing that here.  For now, just hard-code a single
2851                  * entry request at a fixed (default) QOS.
2852                  */
2853                 struct workq_reqthreads_req_s request = {
2854                                       .priority = 0x020004ff,  /* legacy event manager */
2855                                                           .count = kq->kq_count };
2856                 thread_t wqthread;
2857
2858                 wqthread = (*pthread_functions->workq_reqthreads)(kq->kq_p, 1, &request);
2859                 assert(wqthread == THREAD_NULL);
2860         }
2861 }
2862
2863 void
2864 klist_init(struct klist *list)
2865 {
2866         SLIST_INIT(list);
2867 }
2868
2869
2870 /*
2871  * Query/Post each knote in the object's list
2872  *
2873  *      The object lock protects the list. It is assumed
2874  *      that the filter/event routine for the object can
2875  *      determine that the object is already locked (via
2876  *      the hint) and not deadlock itself.
2877  *
2878  *      The object lock should also hold off pending
2879  *      detach/drop operations.  But we'll prevent it here
2880  *      too - just in case.
2881  */
2882 void
2883 knote(struct klist *list, long hint)
2884 {
2885         struct knote *kn;
2886
2887         SLIST_FOREACH(kn, list, kn_selnext) {
2888                 struct kqueue *kq = kn->kn_kq;
2889
2890                 kqlock(kq);
2891                 if (kqlock2knoteuse(kq, kn)) {
2892                         int result;
2893
2894                         /* call the event with only a use count */
2895                         result = kn->kn_fop->f_event(kn, hint);
2896
2897                         /* if its not going away and triggered */
2898                         if (knoteuse2kqlock(kq, kn) && result)
2899                                 knote_activate(kn, 0);
2900                         /* lock held again */
2901                 }
2902                 kqunlock(kq);
2903         }
2904 }
2905
2906 /*
2907  * attach a knote to the specified list.  Return true if this is the first entry.
2908  * The list is protected by whatever lock the object it is associated with uses.
2909  */
2910 int
2911 knote_attach(struct klist *list, struct knote *kn)
2912 {
2913         int ret = SLIST_EMPTY(list);
2914         SLIST_INSERT_HEAD(list, kn, kn_selnext);
2915         return (ret);
2916 }
2917
2918 /*
2919  * detach a knote from the specified list.  Return true if that was the last entry.
2920  * The list is protected by whatever lock the object it is associated with uses.
2921  */
2922 int
2923 knote_detach(struct klist *list, struct knote *kn)
2924 {
2925         SLIST_REMOVE(list, kn, knote, kn_selnext);
2926         return (SLIST_EMPTY(list));
2927 }
2928
2929 /*
2930  * For a given knote, link a provided wait queue directly with the kqueue.
2931  * Wakeups will happen via recursive wait queue support.  But nothing will move
2932  * the knote to the active list at wakeup (nothing calls knote()).  Instead,
2933  * we permanently enqueue them here.
2934  *
2935  * kqueue and knote references are held by caller.
2936  *
2937  * caller provides the wait queue link structure.
2938  */
2939 int
2940 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
2941 {
2942         struct kqueue *kq = kn->kn_kq;
2943         kern_return_t kr;
2944
2945         kr = waitq_link(wq, kq->kq_wqs, WAITQ_SHOULD_LOCK, reserved_link);
2946         if (kr == KERN_SUCCESS) {
2947                 knote_markstayqueued(kn);
2948                 return (0);
2949         } else {
2950                 return (EINVAL);
2951         }
2952 }
2953
2954 /*
2955  * Unlink the provided wait queue from the kqueue associated with a knote.
2956  * Also remove it from the magic list of directly attached knotes.
2957  *
2958  * Note that the unlink may have already happened from the other side, so
2959  * ignore any failures to unlink and just remove it from the kqueue list.
2960  *
2961  * On success, caller is responsible for the link structure
2962  */
2963 int
2964 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
2965 {
2966         struct kqueue *kq = kn->kn_kq;
2967         kern_return_t kr;
2968
2969         kr = waitq_unlink(wq, kq->kq_wqs);
2970         knote_clearstayqueued(kn);
2971         return ((kr != KERN_SUCCESS) ? EINVAL : 0);
2972 }
2973
2974 /*
2975  * remove all knotes referencing a specified fd
2976  *
2977  * Essentially an inlined knote_remove & knote_drop
2978  * when we know for sure that the thing is a file
2979  *
2980  * Entered with the proc_fd lock already held.
2981  * It returns the same way, but may drop it temporarily.
2982  */
2983 void
2984 knote_fdclose(struct proc *p, int fd)
2985 {
2986         struct filedesc *fdp = p->p_fd;
2987         struct klist *list;
2988         struct knote *kn;
2989
2990         list = &fdp->fd_knlist[fd];
2991         while ((kn = SLIST_FIRST(list)) != NULL) {
2992                 struct kqueue *kq = kn->kn_kq;
2993
2994                 if (kq->kq_p != p)
2995                         panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
2996                             __func__, kq->kq_p, p);
2997
2998                 kqlock(kq);
2999                 proc_fdunlock(p);
3000
3001                 /*
3002                  * Convert the lock to a drop ref.
3003                  * If we get it, go ahead and drop it.
3004                  * Otherwise, we waited for it to
3005                  * be dropped by the other guy, so
3006                  * it is safe to move on in the list.
3007                  */
3008                 if (kqlock2knotedrop(kq, kn)) {
3009                         kn->kn_fop->f_detach(kn);
3010                         knote_drop(kn, p);
3011                 }
3012
3013                 proc_fdlock(p);
3014
3015                 /* the fd tables may have changed - start over */
3016                 list = &fdp->fd_knlist[fd];
3017         }
3018 }
3019
3020 /* proc_fdlock held on entry (and exit) */
3021 static int
3022 knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p)
3023 {
3024         struct klist *list = NULL;
3025
3026         if (! kn->kn_fop->f_isfd) {
3027                 if (fdp->fd_knhashmask == 0)
3028                         fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
3029                             &fdp->fd_knhashmask);
3030                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
3031         } else {
3032                 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
3033                         u_int size = 0;
3034
3035                         if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
3036                             || kn->kn_id >= (uint64_t)maxfiles)
3037                                 return (EINVAL);
3038
3039                         /* have to grow the fd_knlist */
3040                         size = fdp->fd_knlistsize;
3041                         while (size <= kn->kn_id)
3042                                 size += KQEXTENT;
3043
3044                         if (size >= (UINT_MAX/sizeof(struct klist *)))
3045                                 return (EINVAL);
3046
3047                         MALLOC(list, struct klist *,
3048                             size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
3049                         if (list == NULL)
3050                                 return (ENOMEM);
3051
3052                         bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
3053                             fdp->fd_knlistsize * sizeof(struct klist *));
3054                         bzero((caddr_t)list +
3055                             fdp->fd_knlistsize * sizeof(struct klist *),
3056                             (size - fdp->fd_knlistsize) * sizeof(struct klist *));
3057                         FREE(fdp->fd_knlist, M_KQUEUE);
3058                         fdp->fd_knlist = list;
3059                         fdp->fd_knlistsize = size;
3060                 }
3061                 list = &fdp->fd_knlist[kn->kn_id];
3062         }
3063         SLIST_INSERT_HEAD(list, kn, kn_link);
3064         return (0);
3065 }
3066
3067
3068
3069 /*
3070  * should be called at spl == 0, since we don't want to hold spl
3071  * while calling fdrop and free.
3072  */
3073 static void
3074 knote_drop(struct knote *kn, __unused struct proc *ctxp)
3075 {
3076         struct kqueue *kq = kn->kn_kq;
3077         struct proc *p = kq->kq_p;
3078         struct filedesc *fdp = p->p_fd;
3079         struct klist *list;
3080         int needswakeup;
3081
3082         proc_fdlock(p);
3083         if (kn->kn_fop->f_isfd)
3084                 list = &fdp->fd_knlist[kn->kn_id];
3085         else
3086                 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
3087
3088         SLIST_REMOVE(list, kn, knote, kn_link);
3089         kqlock(kq);
3090         knote_dequeue(kn);
3091         needswakeup = (kn->kn_status & KN_USEWAIT);
3092         kqunlock(kq);
3093         proc_fdunlock(p);
3094
3095         if (needswakeup)
3096                 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
3097                                    CAST_EVENT64_T(&kn->kn_status),
3098                                    THREAD_AWAKENED,
3099                                    WAITQ_ALL_PRIORITIES);
3100
3101         if (kn->kn_fop->f_isfd)
3102                 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
3103
3104         knote_free(kn);
3105 }
3106
3107 /* called with kqueue lock held */
3108 static void
3109 knote_activate(struct knote *kn, int force)
3110 {
3111         struct kqueue *kq = kn->kn_kq;
3112
3113         if (!force && (kn->kn_status & KN_ACTIVE))
3114                 return;
3115
3116         kn->kn_status |= KN_ACTIVE;
3117         knote_enqueue(kn);
3118         kqueue_wakeup(kq, 0);
3119
3120         /* wake up the parent kq, too */
3121         KNOTE(&kq->kq_sel.si_note, 0);
3122 }
3123
3124 /* called with kqueue lock held */
3125 static void
3126 knote_deactivate(struct knote *kn)
3127 {
3128         kn->kn_status &= ~KN_ACTIVE;
3129         knote_dequeue(kn);
3130 }
3131
3132 /* called with kqueue lock held */
3133 static void
3134 knote_enqueue(struct knote *kn)
3135 {
3136         if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_STAYQUEUED ||
3137             (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) == 0) {
3138                 struct kqtailq *tq = kn->kn_tq;
3139                 struct kqueue *kq = kn->kn_kq;
3140
3141                 TAILQ_INSERT_TAIL(tq, kn, kn_tqe);
3142                 kn->kn_status |= KN_QUEUED;
3143                 kq->kq_count++;
3144         }
3145 }
3146
3147 /* called with kqueue lock held */
3148 static void
3149 knote_dequeue(struct knote *kn)
3150 {
3151         struct kqueue *kq = kn->kn_kq;
3152
3153         if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_QUEUED) {
3154                 struct kqtailq *tq = kn->kn_tq;
3155
3156                 TAILQ_REMOVE(tq, kn, kn_tqe);
3157                 kn->kn_tq = &kq->kq_head;
3158                 kn->kn_status &= ~KN_QUEUED;
3159                 kq->kq_count--;
3160         }
3161 }
3162
3163 void
3164 knote_init(void)
3165 {
3166         knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
3167             8192, "knote zone");
3168
3169         /* allocate kq lock group attribute and group */
3170         kq_lck_grp_attr = lck_grp_attr_alloc_init();
3171
3172         kq_lck_grp = lck_grp_alloc_init("kqueue",  kq_lck_grp_attr);
3173
3174         /* Allocate kq lock attribute */
3175         kq_lck_attr = lck_attr_alloc_init();
3176
3177         /* Initialize the timer filter lock */
3178         lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
3179
3180 #if VM_PRESSURE_EVENTS
3181         /* Initialize the vm pressure list lock */
3182         vm_pressure_init(kq_lck_grp, kq_lck_attr);
3183 #endif
3184
3185 #if CONFIG_MEMORYSTATUS
3186         /* Initialize the memorystatus list lock */
3187         memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
3188 #endif
3189 }
3190 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
3191
3192 static struct knote *
3193 knote_alloc(void)
3194 {
3195         return ((struct knote *)zalloc(knote_zone));
3196 }
3197
3198 static void
3199 knote_free(struct knote *kn)
3200 {
3201         zfree(knote_zone, kn);
3202 }
3203
3204 #if SOCKETS
3205 #include <sys/param.h>
3206 #include <sys/socket.h>
3207 #include <sys/protosw.h>
3208 #include <sys/domain.h>
3209 #include <sys/mbuf.h>
3210 #include <sys/kern_event.h>
3211 #include <sys/malloc.h>
3212 #include <sys/sys_domain.h>
3213 #include <sys/syslog.h>
3214
3215 #ifndef ROUNDUP64
3216 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
3217 #endif
3218
3219 #ifndef ADVANCE64
3220 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
3221 #endif
3222
3223 static lck_grp_attr_t *kev_lck_grp_attr;
3224 static lck_attr_t *kev_lck_attr;
3225 static lck_grp_t *kev_lck_grp;
3226 static decl_lck_rw_data(,kev_lck_data);
3227 static lck_rw_t *kev_rwlock = &kev_lck_data;
3228
3229 static int kev_attach(struct socket *so, int proto, struct proc *p);
3230 static int kev_detach(struct socket *so);
3231 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
3232     struct ifnet *ifp, struct proc *p);
3233 static lck_mtx_t * event_getlock(struct socket *, int);
3234 static int event_lock(struct socket *, int, void *);
3235 static int event_unlock(struct socket *, int, void *);
3236
3237 static int event_sofreelastref(struct socket *);
3238 static void kev_delete(struct kern_event_pcb *);
3239
3240 static struct pr_usrreqs event_usrreqs = {
3241         .pru_attach =           kev_attach,
3242         .pru_control =          kev_control,
3243         .pru_detach =           kev_detach,
3244         .pru_soreceive =        soreceive,
3245 };
3246
3247 static struct protosw eventsw[] = {
3248 {
3249         .pr_type =              SOCK_RAW,
3250         .pr_protocol =          SYSPROTO_EVENT,
3251         .pr_flags =             PR_ATOMIC,
3252         .pr_usrreqs =           &event_usrreqs,
3253         .pr_lock =              event_lock,
3254         .pr_unlock =            event_unlock,
3255         .pr_getlock =           event_getlock,
3256 }
3257 };
3258
3259 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
3260 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
3261
3262 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
3263         CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
3264
3265 struct kevtstat kevtstat;
3266 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
3267     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
3268     kevt_getstat, "S,kevtstat", "");
3269
3270 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
3271         CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
3272         kevt_pcblist, "S,xkevtpcb", "");
3273
3274 static lck_mtx_t *
3275 event_getlock(struct socket *so, int locktype)
3276 {
3277 #pragma unused(locktype)
3278         struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
3279
3280         if (so->so_pcb != NULL)  {
3281                 if (so->so_usecount < 0)
3282                         panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
3283                             so, so->so_usecount, solockhistory_nr(so));
3284                         /* NOTREACHED */
3285         } else {
3286                 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
3287                     so, solockhistory_nr(so));
3288                 /* NOTREACHED */
3289         }
3290         return (&ev_pcb->evp_mtx);
3291 }
3292
3293 static int
3294 event_lock(struct socket *so, int refcount, void *lr)
3295 {
3296         void *lr_saved;
3297
3298         if (lr == NULL)
3299                 lr_saved = __builtin_return_address(0);
3300         else
3301                 lr_saved = lr;
3302
3303         if (so->so_pcb != NULL) {
3304                 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
3305         } else  {
3306                 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3307                     so, lr_saved, solockhistory_nr(so));
3308                 /* NOTREACHED */
3309         }
3310
3311         if (so->so_usecount < 0) {
3312                 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
3313                     so, so->so_pcb, lr_saved, so->so_usecount,
3314                     solockhistory_nr(so));
3315                 /* NOTREACHED */
3316         }
3317
3318         if (refcount)
3319                 so->so_usecount++;
3320
3321         so->lock_lr[so->next_lock_lr] = lr_saved;
3322         so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3323         return (0);
3324 }
3325
3326 static int
3327 event_unlock(struct socket *so, int refcount, void *lr)
3328 {
3329         void *lr_saved;
3330         lck_mtx_t *mutex_held;
3331
3332         if (lr == NULL)
3333                 lr_saved = __builtin_return_address(0);
3334         else
3335                 lr_saved = lr;
3336
3337         if (refcount)
3338                 so->so_usecount--;
3339
3340         if (so->so_usecount < 0) {
3341                 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
3342                     so, so->so_usecount, solockhistory_nr(so));
3343                 /* NOTREACHED */
3344         }
3345         if (so->so_pcb == NULL) {
3346                 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
3347                     so, so->so_usecount, (void *)lr_saved,
3348                     solockhistory_nr(so));
3349                 /* NOTREACHED */
3350         }
3351         mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
3352
3353         lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3354         so->unlock_lr[so->next_unlock_lr] = lr_saved;
3355         so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3356
3357         if (so->so_usecount == 0) {
3358                 VERIFY(so->so_flags & SOF_PCBCLEARING);
3359                 event_sofreelastref(so);
3360         } else {
3361                 lck_mtx_unlock(mutex_held);
3362         }
3363
3364         return (0);
3365 }
3366
3367 static int
3368 event_sofreelastref(struct socket *so)
3369 {
3370         struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
3371
3372         lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
3373
3374         so->so_pcb = NULL;
3375
3376         /*
3377          * Disable upcall in the event another thread is in kev_post_msg()
3378          * appending record to the receive socket buffer, since sbwakeup()
3379          * may release the socket lock otherwise.
3380          */
3381         so->so_rcv.sb_flags &= ~SB_UPCALL;
3382         so->so_snd.sb_flags &= ~SB_UPCALL;
3383         so->so_event = sonullevent;
3384         lck_mtx_unlock(&(ev_pcb->evp_mtx));
3385
3386         lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
3387         lck_rw_lock_exclusive(kev_rwlock);
3388         LIST_REMOVE(ev_pcb, evp_link);
3389         kevtstat.kes_pcbcount--;
3390         kevtstat.kes_gencnt++;
3391         lck_rw_done(kev_rwlock);
3392         kev_delete(ev_pcb);
3393
3394         sofreelastref(so, 1);
3395         return (0);
3396 }
3397
3398 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
3399
3400 static
3401 struct kern_event_head kern_event_head;
3402
3403 static u_int32_t static_event_id = 0;
3404
3405 #define EVPCB_ZONE_MAX          65536
3406 #define EVPCB_ZONE_NAME         "kerneventpcb"
3407 static struct zone *ev_pcb_zone;
3408
3409 /*
3410  * Install the protosw's for the NKE manager.  Invoked at extension load time
3411  */
3412 void
3413 kern_event_init(struct domain *dp)
3414 {
3415         struct protosw *pr;
3416         int i;
3417
3418         VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
3419         VERIFY(dp == systemdomain);
3420
3421         kev_lck_grp_attr = lck_grp_attr_alloc_init();
3422         if (kev_lck_grp_attr == NULL) {
3423                 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
3424                 /* NOTREACHED */
3425         }
3426
3427         kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
3428             kev_lck_grp_attr);
3429         if (kev_lck_grp == NULL) {
3430                 panic("%s: lck_grp_alloc_init failed\n", __func__);
3431                 /* NOTREACHED */
3432         }
3433
3434         kev_lck_attr = lck_attr_alloc_init();
3435         if (kev_lck_attr == NULL) {
3436                 panic("%s: lck_attr_alloc_init failed\n", __func__);
3437                 /* NOTREACHED */
3438         }
3439
3440         lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
3441         if (kev_rwlock == NULL) {
3442                 panic("%s: lck_mtx_alloc_init failed\n", __func__);
3443                 /* NOTREACHED */
3444         }
3445
3446         for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
3447                 net_add_proto(pr, dp, 1);
3448
3449         ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
3450             EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
3451         if (ev_pcb_zone == NULL) {
3452                 panic("%s: failed allocating ev_pcb_zone", __func__);
3453                 /* NOTREACHED */
3454         }
3455         zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
3456         zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
3457 }
3458
3459 static int
3460 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
3461 {
3462         int error = 0;
3463         struct kern_event_pcb *ev_pcb;
3464
3465         error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
3466         if (error != 0)
3467                 return (error);
3468
3469         if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
3470                 return (ENOBUFS);
3471         }
3472         bzero(ev_pcb, sizeof(struct kern_event_pcb));
3473         lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
3474
3475         ev_pcb->evp_socket = so;
3476         ev_pcb->evp_vendor_code_filter = 0xffffffff;
3477
3478         so->so_pcb = (caddr_t) ev_pcb;
3479         lck_rw_lock_exclusive(kev_rwlock);
3480         LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
3481         kevtstat.kes_pcbcount++;
3482         kevtstat.kes_gencnt++;
3483         lck_rw_done(kev_rwlock);
3484
3485         return (error);
3486 }
3487
3488 static void
3489 kev_delete(struct kern_event_pcb *ev_pcb)
3490 {
3491         VERIFY(ev_pcb != NULL);
3492         lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
3493         zfree(ev_pcb_zone, ev_pcb);
3494 }
3495
3496 static int
3497 kev_detach(struct socket *so)
3498 {
3499         struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3500
3501         if (ev_pcb != NULL) {
3502                 soisdisconnected(so);
3503                 so->so_flags |= SOF_PCBCLEARING;
3504         }
3505
3506         return (0);
3507 }
3508
3509 /*
3510  * For now, kev_vendor_code and mbuf_tags use the same
3511  * mechanism.
3512  */
3513 errno_t kev_vendor_code_find(
3514         const char      *string,
3515         u_int32_t       *out_vendor_code)
3516 {
3517         if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
3518                 return (EINVAL);
3519         }
3520         return (net_str_id_find_internal(string, out_vendor_code,
3521             NSI_VENDOR_CODE, 1));
3522 }
3523
3524 errno_t
3525 kev_msg_post(struct kev_msg *event_msg)
3526 {
3527         mbuf_tag_id_t min_vendor, max_vendor;
3528
3529         net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
3530
3531         if (event_msg == NULL)
3532                 return (EINVAL);
3533
3534         /*
3535          * Limit third parties to posting events for registered vendor codes
3536          * only
3537          */
3538         if (event_msg->vendor_code < min_vendor ||
3539             event_msg->vendor_code > max_vendor) {
3540                 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
3541                 return (EINVAL);
3542         }
3543         return (kev_post_msg(event_msg));
3544 }
3545
3546 int
3547 kev_post_msg(struct kev_msg *event_msg)
3548 {
3549         struct mbuf *m, *m2;
3550         struct kern_event_pcb *ev_pcb;
3551         struct kern_event_msg *ev;
3552         char *tmp;
3553         u_int32_t total_size;
3554         int i;
3555
3556         /* Verify the message is small enough to fit in one mbuf w/o cluster */
3557         total_size = KEV_MSG_HEADER_SIZE;
3558
3559         for (i = 0; i < 5; i++) {
3560                 if (event_msg->dv[i].data_length == 0)
3561                         break;
3562                 total_size += event_msg->dv[i].data_length;
3563         }
3564
3565         if (total_size > MLEN) {
3566                 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
3567                 return (EMSGSIZE);
3568         }
3569
3570         m = m_get(M_DONTWAIT, MT_DATA);
3571         if (m == 0) {
3572                 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3573                 return (ENOMEM);
3574         }
3575         ev = mtod(m, struct kern_event_msg *);
3576         total_size = KEV_MSG_HEADER_SIZE;
3577
3578         tmp = (char *) &ev->event_data[0];
3579         for (i = 0; i < 5; i++) {
3580                 if (event_msg->dv[i].data_length == 0)
3581                         break;
3582
3583                 total_size += event_msg->dv[i].data_length;
3584                 bcopy(event_msg->dv[i].data_ptr, tmp,
3585                     event_msg->dv[i].data_length);
3586                 tmp += event_msg->dv[i].data_length;
3587         }
3588
3589         ev->id = ++static_event_id;
3590         ev->total_size   = total_size;
3591         ev->vendor_code  = event_msg->vendor_code;
3592         ev->kev_class    = event_msg->kev_class;
3593         ev->kev_subclass = event_msg->kev_subclass;
3594         ev->event_code   = event_msg->event_code;
3595
3596         m->m_len = total_size;
3597         lck_rw_lock_shared(kev_rwlock);
3598         for (ev_pcb = LIST_FIRST(&kern_event_head);
3599             ev_pcb;
3600             ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3601                 lck_mtx_lock(&ev_pcb->evp_mtx);
3602                 if (ev_pcb->evp_socket->so_pcb == NULL) {
3603                         lck_mtx_unlock(&ev_pcb->evp_mtx);
3604                         continue;
3605                 }
3606                 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
3607                         if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
3608                                 lck_mtx_unlock(&ev_pcb->evp_mtx);
3609                                 continue;
3610                         }
3611
3612                         if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
3613                                 if (ev_pcb->evp_class_filter != ev->kev_class) {
3614                                         lck_mtx_unlock(&ev_pcb->evp_mtx);
3615                                         continue;
3616                                 }
3617
3618                                 if ((ev_pcb->evp_subclass_filter !=
3619                                     KEV_ANY_SUBCLASS) &&
3620                                     (ev_pcb->evp_subclass_filter !=
3621                                     ev->kev_subclass)) {
3622                                         lck_mtx_unlock(&ev_pcb->evp_mtx);
3623                                         continue;
3624                                 }
3625                         }
3626                 }
3627
3628                 m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
3629                 if (m2 == 0) {
3630                         OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3631                         m_free(m);
3632                         lck_mtx_unlock(&ev_pcb->evp_mtx);
3633                         lck_rw_done(kev_rwlock);
3634                         return (ENOMEM);
3635                 }
3636                 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
3637                         /*
3638                          * We use "m" for the socket stats as it would be
3639                          * unsafe to use "m2"
3640                          */
3641                         so_inc_recv_data_stat(ev_pcb->evp_socket,
3642                             1, m->m_len, SO_TC_BE);
3643
3644                         sorwakeup(ev_pcb->evp_socket);
3645                         OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
3646                 } else {
3647                         OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
3648                 }
3649                 lck_mtx_unlock(&ev_pcb->evp_mtx);
3650         }
3651         m_free(m);
3652         lck_rw_done(kev_rwlock);
3653
3654         return (0);
3655 }
3656
3657 static int
3658 kev_control(struct socket *so,
3659     u_long cmd,
3660     caddr_t data,
3661     __unused struct ifnet *ifp,
3662     __unused struct proc *p)
3663 {
3664         struct kev_request *kev_req = (struct kev_request *) data;
3665         struct kern_event_pcb  *ev_pcb;
3666         struct kev_vendor_code *kev_vendor;
3667         u_int32_t  *id_value = (u_int32_t *) data;
3668
3669         switch (cmd) {
3670                 case SIOCGKEVID:
3671                         *id_value = static_event_id;
3672                         break;
3673                 case SIOCSKEVFILT:
3674                         ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3675                         ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
3676                         ev_pcb->evp_class_filter = kev_req->kev_class;
3677                         ev_pcb->evp_subclass_filter  = kev_req->kev_subclass;
3678                         break;
3679                 case SIOCGKEVFILT:
3680                         ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3681                         kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
3682                         kev_req->kev_class   = ev_pcb->evp_class_filter;
3683                         kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
3684                         break;
3685                 case SIOCGKEVVENDOR:
3686                         kev_vendor = (struct kev_vendor_code *)data;
3687                         /* Make sure string is NULL terminated */
3688                         kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
3689                         return (net_str_id_find_internal(kev_vendor->vendor_string,
3690                             &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
3691                 default:
3692                         return (ENOTSUP);
3693         }
3694
3695         return (0);
3696 }
3697
3698 int
3699 kevt_getstat SYSCTL_HANDLER_ARGS
3700 {
3701 #pragma unused(oidp, arg1, arg2)
3702         int error = 0;
3703
3704         lck_rw_lock_shared(kev_rwlock);
3705
3706         if (req->newptr != USER_ADDR_NULL) {
3707                 error = EPERM;
3708                 goto done;
3709         }
3710         if (req->oldptr == USER_ADDR_NULL) {
3711                 req->oldidx = sizeof(struct kevtstat);
3712                 goto done;
3713         }
3714
3715         error = SYSCTL_OUT(req, &kevtstat,
3716             MIN(sizeof(struct kevtstat), req->oldlen));
3717 done:
3718         lck_rw_done(kev_rwlock);
3719
3720         return (error);
3721 }
3722
3723 __private_extern__ int
3724 kevt_pcblist SYSCTL_HANDLER_ARGS
3725 {
3726 #pragma unused(oidp, arg1, arg2)
3727         int error = 0;
3728         int n, i;
3729         struct xsystmgen xsg;
3730         void *buf = NULL;
3731         size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
3732                 ROUNDUP64(sizeof (struct xsocket_n)) +
3733                 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
3734                 ROUNDUP64(sizeof (struct xsockstat_n));
3735         struct kern_event_pcb  *ev_pcb;
3736
3737         buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
3738         if (buf == NULL)
3739                 return (ENOMEM);
3740
3741         lck_rw_lock_shared(kev_rwlock);
3742
3743         n = kevtstat.kes_pcbcount;
3744
3745         if (req->oldptr == USER_ADDR_NULL) {
3746                 req->oldidx = (n + n/8) * item_size;
3747                 goto done;
3748         }
3749         if (req->newptr != USER_ADDR_NULL) {
3750                 error = EPERM;
3751                 goto done;
3752         }
3753         bzero(&xsg, sizeof (xsg));
3754         xsg.xg_len = sizeof (xsg);
3755         xsg.xg_count = n;
3756         xsg.xg_gen = kevtstat.kes_gencnt;
3757         xsg.xg_sogen = so_gencnt;
3758         error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3759         if (error) {
3760                 goto done;
3761         }
3762         /*
3763          * We are done if there is no pcb
3764          */
3765         if (n == 0) {
3766                 goto done;
3767         }
3768
3769         i = 0;
3770         for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
3771             i < n && ev_pcb != NULL;
3772             i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3773                 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
3774                 struct xsocket_n *xso = (struct xsocket_n *)
3775                         ADVANCE64(xk, sizeof (*xk));
3776                 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
3777                         ADVANCE64(xso, sizeof (*xso));
3778                 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
3779                         ADVANCE64(xsbrcv, sizeof (*xsbrcv));
3780                 struct xsockstat_n *xsostats = (struct xsockstat_n *)
3781                         ADVANCE64(xsbsnd, sizeof (*xsbsnd));
3782
3783                 bzero(buf, item_size);
3784
3785                 lck_mtx_lock(&ev_pcb->evp_mtx);
3786
3787                 xk->kep_len = sizeof(struct xkevtpcb);
3788                 xk->kep_kind = XSO_EVT;
3789                 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
3790                 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
3791                 xk->kep_class_filter = ev_pcb->evp_class_filter;
3792                 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
3793
3794                 sotoxsocket_n(ev_pcb->evp_socket, xso);
3795                 sbtoxsockbuf_n(ev_pcb->evp_socket ?
3796                         &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
3797                 sbtoxsockbuf_n(ev_pcb->evp_socket ?
3798                         &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
3799                 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
3800
3801                 lck_mtx_unlock(&ev_pcb->evp_mtx);
3802
3803                 error = SYSCTL_OUT(req, buf, item_size);
3804         }
3805
3806         if (error == 0) {
3807                 /*
3808                  * Give the user an updated idea of our state.
3809                  * If the generation differs from what we told
3810                  * her before, she knows that something happened
3811                  * while we were processing this request, and it
3812                  * might be necessary to retry.
3813                  */
3814                 bzero(&xsg, sizeof (xsg));
3815                 xsg.xg_len = sizeof (xsg);
3816                 xsg.xg_count = n;
3817                 xsg.xg_gen = kevtstat.kes_gencnt;
3818                 xsg.xg_sogen = so_gencnt;
3819                 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3820                 if (error) {
3821                         goto done;
3822                 }
3823         }
3824
3825 done:
3826         lck_rw_done(kev_rwlock);
3827
3828         return (error);
3829 }
3830
3831 #endif /* SOCKETS */
3832
3833
3834 int
3835 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
3836 {
3837         struct vinfo_stat * st;
3838
3839         st = &kinfo->kq_stat;
3840
3841         st->vst_size = kq->kq_count;
3842         if (kq->kq_state & KQ_KEV_QOS)
3843                 st->vst_blksize = sizeof(struct kevent_qos_s);
3844         else if (kq->kq_state & KQ_KEV64)
3845                 st->vst_blksize = sizeof(struct kevent64_s);
3846         else
3847                 st->vst_blksize = sizeof(struct kevent);
3848         st->vst_mode = S_IFIFO;
3849
3850         /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
3851 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS)
3852         kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
3853
3854         return (0);
3855 }
3856
3857
3858 void
3859 knote_markstayqueued(struct knote *kn)
3860 {
3861         kqlock(kn->kn_kq);
3862         kn->kn_status |= KN_STAYQUEUED;
3863         knote_enqueue(kn);
3864         kqunlock(kn->kn_kq);
3865 }
3866
3867 void
3868 knote_clearstayqueued(struct knote *kn)
3869 {
3870         kqlock(kn->kn_kq);
3871         kn->kn_status &= ~KN_STAYQUEUED;
3872         knote_dequeue(kn);
3873         kqunlock(kn->kn_kq);
3874 }
3875
3876 static unsigned long
3877 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
3878                 unsigned long buflen, unsigned long nknotes)
3879 {
3880         struct kevent_qos_s kevqos;
3881         struct kevent_internal_s *kevp;
3882         for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
3883                 if (kq == kn->kn_kq) {
3884                         if (nknotes < buflen) {
3885                                 struct kevent_extinfo *info = &buf[nknotes];
3886
3887                                 kqlock(kq);
3888                                 bzero(&kevqos, sizeof(kevqos));
3889                                 kevp = &(kn->kn_kevent);
3890
3891                                 kevqos.ident = kevp->ident;
3892                                 kevqos.filter = kevp->filter;
3893                                 kevqos.flags = kevp->flags;
3894                                 kevqos.fflags = kevp->fflags;
3895                                 kevqos.data = (int64_t) kevp->data;
3896                                 kevqos.udata = kevp->udata;
3897                                 kevqos.ext[0] = kevp->ext[0];
3898                                 kevqos.ext[1] = kevp->ext[1];
3899
3900                                 memcpy(&info->kqext_kev, &kevqos, sizeof(info->kqext_kev));
3901                                 info->kqext_sdata = kn->kn_sdata;
3902
3903                                 /* status flags exported to userspace/libproc */
3904 #define KQEXT_STATUS_MASK (KN_ACTIVE|KN_QUEUED|KN_DISABLED|KN_STAYQUEUED)
3905                                 info->kqext_status = kn->kn_status & KQEXT_STATUS_MASK;
3906                                 info->kqext_sfflags = kn->kn_sfflags;
3907
3908                                 kqunlock(kq);
3909                         }
3910
3911                         /* we return total number of knotes, which may be more than requested */
3912                         nknotes++;
3913                 }
3914         }
3915
3916         return nknotes;
3917 }
3918
3919 int
3920 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
3921                 uint32_t bufsize, int32_t *retval)
3922 {
3923         struct knote *kn;
3924         int i;
3925         int err = 0;
3926         struct filedesc *fdp = p->p_fd;
3927         unsigned long nknotes = 0;
3928         unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
3929         struct kevent_extinfo *kqext = NULL;
3930
3931         kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
3932         if (kqext == NULL) {
3933                 err = ENOMEM;
3934                 goto out;
3935         }
3936         bzero(kqext, buflen * sizeof(struct kevent_extinfo));
3937
3938         proc_fdlock(p);
3939
3940         for (i = 0; i < fdp->fd_knlistsize; i++) {
3941                 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3942                 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
3943         }
3944
3945         if (fdp->fd_knhashmask != 0) {
3946                 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
3947                         kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3948                         nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
3949                 }
3950         }
3951
3952         proc_fdunlock(p);
3953
3954         assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
3955         err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
3956
3957  out:
3958         if (kqext) {
3959                 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
3960                 kqext = NULL;
3961         }
3962
3963         if (!err)
3964                 *retval = nknotes;
3965         return err;
3966 }