/*
 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_event.c       1.0 (3/31/2000)
 */
#include <machine/atomic.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h> // SYS_* constants
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode_internal.h>
#include <sys/proc_info.h>
#include <sys/codesign.h>
#include <sys/pthread_shims.h>
#include <sys/kdebug.h>
#include <sys/reason.h>
#include <os/reason_private.h>
#include <pexpert/pexpert.h>

#include <kern/locks.h>
#include <kern/clock.h>
#include <kern/cpu_data.h>
#include <kern/policy_internal.h>
#include <kern/thread_call.h>
#include <kern/sched_prim.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/ast.h>
#include <kern/thread.h>
#include <kern/kcdata.h>

#include <pthread/priority_private.h>
#include <pthread/workqueue_syscalls.h>
#include <pthread/workqueue_internal.h>
#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>

#include "net/net_str_id.h"

#include <mach/task.h>
#include <libkern/section_keywords.h>

#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif
extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h   */
extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

#define KQ_EVENT        NO_EVENT64
static int kqueue_read(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_write(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
    vfs_context_t ctx);
static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    vfs_context_t ctx);
static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    struct kevent_internal_s *kev, vfs_context_t ctx);
static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
static const struct fileops kqueueops = {
    .fo_type     = DTYPE_KQUEUE,
    .fo_read     = kqueue_read,
    .fo_write    = kqueue_write,
    .fo_ioctl    = kqueue_ioctl,
    .fo_select   = kqueue_select,
    .fo_close    = kqueue_close,
    .fo_kqfilter = kqueue_kqfilter,
    .fo_drain    = kqueue_drain,
};
static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq);
static int kevent_internal(struct proc *p,
    kqueue_id_t id, kqueue_id_t *id_out,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, uint64_t data_available,
    unsigned int flags, user_addr_t utimeout,
    kqueue_continue_t continuation,
    int32_t *retval);
static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
    struct proc *p, unsigned int flags);
static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
    struct proc *p, unsigned int flags);
char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);

static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev);
static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
    struct knote_lock_ctx *knlc, thread_continue_t cont,
    struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_cleanup(struct knote *kn);
static inline void kqueue_release_last(struct proc *p, kqueue_t kqu);
static void kqueue_interrupt(struct kqueue *kq);
static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
    void *data);
static void kevent_continue(struct kqueue *kq, void *data, int error);
static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
    struct filt_process_s *process_data, int *countp);
static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);

static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags);

static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos);
static void kqworkq_unbind(proc_t p, struct kqrequest *kqr);
static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread);
static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);

static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index);
static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql);
static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread);
static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl);
enum {
    KQWL_UTQ_NONE,
    /*
     * The wakeup qos is the qos of QUEUED knotes.
     *
     * This QoS is accounted for with the events override in the
     * kqr_override_index field. It is raised each time a new knote is queued at
     * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
     * knote buckets and is recomputed after each event delivery.
     */
    KQWL_UTQ_UPDATE_WAKEUP_QOS,
    KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
    KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
    KQWL_UTQ_UNBINDING, /* attempt to rebind */
    /*
     * The wakeup override is for suppressed knotes that have fired again at
     * a higher QoS than the one for which they are suppressed already.
     * This override is cleared when the knote suppressed list becomes empty.
     */
    KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
    KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
    /*
     * The QoS is the maximum QoS of an event enqueued on this workloop in
     * userland. It is copied from the only EVFILT_WORKLOOP knote with
     * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
     * such knote, this QoS is 0.
     */
    KQWL_UTQ_SET_QOS_INDEX,
    KQWL_UTQ_REDRIVE_EVENTS,
};
static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);

static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
    struct filt_process_s *process_data);

static int kq_add_knote(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, struct proc *p);
static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);

static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc);
static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);

static void knote_activate(struct knote *kn);
static void knote_deactivate(struct knote *kn);

static void knote_enable(struct knote *kn);
static void knote_disable(struct knote *kn);

static int knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);

static void knote_suppress(struct knote *kn);
static void knote_unsuppress(struct knote *kn);
static void knote_wakeup(struct knote *kn);

static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn,
    int result, thread_qos_t *qos_out);
static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index);
static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
static void knote_reset_priority(struct knote *kn, pthread_priority_t pp);
static kq_index_t knote_get_qos_override_index(struct knote *kn);
static void knote_set_qos_overcommit(struct knote *kn);
static zone_t knote_zone;
static zone_t kqfile_zone;
static zone_t kqworkq_zone;
static zone_t kqworkloop_zone;
#if DEVELOPMENT || DEBUG
#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
#define KEVENT_PANIC_BOOT_ARG_INITIALIZED        (1U << 31)

#define KEVENT_PANIC_DEFAULT_VALUE (0)

static uint32_t
kevent_debug_flags(void)
{
    static uint32_t flags = KEVENT_PANIC_DEFAULT_VALUE;

    if ((flags & KEVENT_PANIC_BOOT_ARG_INITIALIZED) == 0) {
        uint32_t value = 0;
        if (!PE_parse_boot_argn("kevent_debug", &value, sizeof(value))) {
            value = KEVENT_PANIC_DEFAULT_VALUE;
        }
        value |= KEVENT_PANIC_BOOT_ARG_INITIALIZED;
        os_atomic_store(&flags, value, relaxed);
    }
    return flags;
}
#endif

#define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
/* placeholder for not-yet-implemented filters */
static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
static int filt_badevent(struct knote *kn, long hint);
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
    .f_attach = filt_badattach,
};

#if CONFIG_MEMORYSTATUS
extern const struct filterops memorystatus_filtops;
#endif /* CONFIG_MEMORYSTATUS */
extern const struct filterops fs_filtops;
extern const struct filterops sig_filtops;
extern const struct filterops machport_filtops;
extern const struct filterops pipe_rfiltops;
extern const struct filterops pipe_wfiltops;
extern const struct filterops ptsd_kqops;
extern const struct filterops ptmx_kqops;
extern const struct filterops soread_filtops;
extern const struct filterops sowrite_filtops;
extern const struct filterops sock_filtops;
extern const struct filterops soexcept_filtops;
extern const struct filterops spec_filtops;
extern const struct filterops bpfread_filtops;
extern const struct filterops necp_fd_rfiltops;
extern const struct filterops fsevent_filtops;
extern const struct filterops vnode_filtops;
extern const struct filterops tty_filtops;

const static struct filterops file_filtops;
const static struct filterops kqread_filtops;
const static struct filterops proc_filtops;
const static struct filterops timer_filtops;
const static struct filterops user_filtops;
const static struct filterops workloop_filtops;
/*
 * Rules for adding new filters to the system:
 *
 * Public filters:
 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
 *   in the exported section of the header
 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
 *   of the Public Filters section in the array.
 *
 * Private filters:
 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
 *   in the XNU_KERNEL_PRIVATE section of the header
 * - Update the EVFILTID_MAX value to reflect the new addition
 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
 *   the Private filters section of the array.
 */
SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
    /* Public Filters */
    [~EVFILT_READ]          = &file_filtops,
    [~EVFILT_WRITE]         = &file_filtops,
    [~EVFILT_AIO]           = &bad_filtops,
    [~EVFILT_VNODE]         = &file_filtops,
    [~EVFILT_PROC]          = &proc_filtops,
    [~EVFILT_SIGNAL]        = &sig_filtops,
    [~EVFILT_TIMER]         = &timer_filtops,
    [~EVFILT_MACHPORT]      = &machport_filtops,
    [~EVFILT_FS]            = &fs_filtops,
    [~EVFILT_USER]          = &user_filtops,
    [~EVFILT_VM]            = &bad_filtops,
    [~EVFILT_SOCK]          = &file_filtops,
#if CONFIG_MEMORYSTATUS
    [~EVFILT_MEMORYSTATUS]  = &memorystatus_filtops,
#else
    [~EVFILT_MEMORYSTATUS]  = &bad_filtops,
#endif
    [~EVFILT_EXCEPT]        = &file_filtops,
    [~EVFILT_WORKLOOP]      = &workloop_filtops,

    /* Private filters */
    [EVFILTID_KQREAD]       = &kqread_filtops,
    [EVFILTID_PIPE_R]       = &pipe_rfiltops,
    [EVFILTID_PIPE_W]       = &pipe_wfiltops,
    [EVFILTID_PTSD]         = &ptsd_kqops,
    [EVFILTID_SOREAD]       = &soread_filtops,
    [EVFILTID_SOWRITE]      = &sowrite_filtops,
    [EVFILTID_SCK]          = &sock_filtops,
    [EVFILTID_SOEXCEPT]     = &soexcept_filtops,
    [EVFILTID_SPEC]         = &spec_filtops,
    [EVFILTID_BPFREAD]      = &bpfread_filtops,
    [EVFILTID_NECP_FD]      = &necp_fd_rfiltops,
    [EVFILTID_FSEVENT]      = &fsevent_filtops,
    [EVFILTID_VN]           = &vnode_filtops,
    [EVFILTID_TTY]          = &tty_filtops,
    [EVFILTID_PTMX]         = &ptmx_kqops,
};
/* waitq prepost callback */
void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);

static inline struct kqworkloop *
kqr_kqworkloop(struct kqrequest *kqr)
{
    if (kqr->kqr_state & KQR_WORKLOOP) {
        return __container_of(kqr, struct kqworkloop, kqwl_request);
    }
    return NULL;
}

static inline kqueue_t
kqr_kqueue(proc_t p, struct kqrequest *kqr)
{
    kqueue_t kqu;
    if (kqr->kqr_state & KQR_WORKLOOP) {
        kqu.kqwl = kqr_kqworkloop(kqr);
    } else {
        kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
        assert(kqr >= kqu.kqwq->kqwq_request &&
            kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
    }
    return kqu;
}

static inline boolean_t
is_workqueue_thread(thread_t thread)
{
    return thread_get_tag(thread) & THREAD_TAG_WORKQUEUE;
}
/*
 * kqueue/note lock implementations
 *
 *	The kqueue lock guards the kq state, the state of its queues,
 *	and the kqueue-aware status and locks of individual knotes.
 *
 *	The kqueue workq lock is used to protect state guarding the
 *	interaction of the kqueue with the workq. This state cannot
 *	be guarded by the kq lock - as it needs to be taken when we
 *	already have the waitq set lock held (during the waitq hook
 *	callback). It might be better to use the waitq lock itself
 *	for this, but the IRQ requirements make that difficult.
 *
 *	Knote flags, filter flags, and associated data are protected
 *	by the underlying object lock - and are only ever looked at
 *	by calling the filter to get a [consistent] snapshot of that
 *	data.
 */
static lck_grp_attr_t *kq_lck_grp_attr;
static lck_grp_t *kq_lck_grp;
static lck_attr_t *kq_lck_attr;
static inline void
kqlock(kqueue_t kqu)
{
    lck_spin_lock(&kqu.kq->kq_lock);
}

static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
    LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}

static inline void
kqunlock(kqueue_t kqu)
{
    lck_spin_unlock(&kqu.kq->kq_lock);
}

static inline void
kq_req_lock(kqueue_t kqu)
{
    assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
    lck_spin_lock(&kqu.kq->kq_reqlock);
}

static inline void
kq_req_unlock(kqueue_t kqu)
{
    assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
    lck_spin_unlock(&kqu.kq->kq_reqlock);
}

static inline void
kq_req_held(__assert_only kqueue_t kqu)
{
    assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
    LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED);
}

static inline void
knhash_lock(proc_t p)
{
    lck_mtx_lock(&p->p_fd->fd_knhashlock);
}

static inline void
knhash_unlock(proc_t p)
{
    lck_mtx_unlock(&p->p_fd->fd_knhashlock);
}
#pragma mark knote locks

/*
 * Enum used by the knote_lock_* functions.
 *
 * KNOTE_KQ_LOCK_ALWAYS
 *   The function will always return with the kq lock held.
 *
 * KNOTE_KQ_LOCK_ON_SUCCESS
 *   The function will return with the kq lock held if it was successful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_LOCK_ON_FAILURE
 *   The function will return with the kq lock held if it was unsuccessful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_UNLOCK
 *   The function returns with the kq unlocked.
 */
#define KNOTE_KQ_LOCK_ALWAYS      0x0
#define KNOTE_KQ_LOCK_ON_SUCCESS  0x1
#define KNOTE_KQ_LOCK_ON_FAILURE  0x2
#define KNOTE_KQ_UNLOCK           0x3
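/*
 * Illustrative sketch (not from the original source): a caller that only
 * wants to keep the kq lock when the knote lock was acquired might do
 * something like the following, where `kq`, `kn` and `knlc` are assumed to
 * be set up by the caller.
 *
 *	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
 *		// the knote was dropped while waiting; the kq lock is NOT held
 *		return;
 *	}
 *	// kq lock held here, knote lock owned by this thread
 *	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
 */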
#if DEBUG || DEVELOPMENT
__attribute__((noinline, not_tail_called, disable_tail_calls))
void
knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
{
    /* evil hackery to make sure no one forgets to unlock */
    assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
}
#endif
static struct knote_lock_ctx *
knote_lock_ctx_find(struct kqueue *kq, struct knote *kn)
{
    struct knote_lock_ctx *ctx;
    LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) {
        if (ctx->knlc_knote == kn) {
            return ctx;
        }
    }
    panic("knote lock context not found: %p", kn);
    __builtin_trap();
}
/* slowpath of knote_lock() */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
    kqlock_held(kq);

    struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn);
    thread_t owner_thread = owner_lc->knlc_thread;

#if DEBUG || DEVELOPMENT
    knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif

    thread_reference(owner_thread);
    TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe);
    assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT);
    kqunlock(kq);

    if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) {
        if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
            kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
            kqlock(kq);
        }
#if DEBUG || DEVELOPMENT
        assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
        knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
        return false;
    }
#if DEBUG || DEVELOPMENT
    assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif
    if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
        kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
        kqlock(kq);
    }
    return true;
}
/*
 * Attempts to take the "knote" lock.
 *
 * Called with the kqueue lock held.
 *
 * Returns true if the knote lock is acquired, false if it has been dropped
 */
static bool __result_use_check
knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
    int kqlocking)
{
    kqlock_held(kq);

#if DEBUG || DEVELOPMENT
    assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
    knlc->knlc_knote = kn;
    knlc->knlc_thread = current_thread();
    TAILQ_INIT(&knlc->knlc_head);

    if (__improbable(kn->kn_status & KN_LOCKED)) {
        return knote_lock_slow(kq, kn, knlc, kqlocking);
    }

    /*
     * When the knote will be dropped, the knote lock is taken before
     * KN_DROPPING is set, and then the knote will be removed from any
     * hash table that references it before the lock is canceled.
     */
    assert((kn->kn_status & KN_DROPPING) == 0);
    LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le);
    kn->kn_status |= KN_LOCKED;
#if DEBUG || DEVELOPMENT
    knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

    if (kqlocking == KNOTE_KQ_UNLOCK ||
        kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
        kqunlock(kq);
    }
    return true;
}
/*
 * Unlocks a knote successfully locked with knote_lock().
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* flags
 */
static void
knote_unlock(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int flags)
{
    kqlock_held(kq);

    assert(knlc->knlc_knote == kn);
    assert(kn->kn_status & KN_LOCKED);
#if DEBUG || DEVELOPMENT
    assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif

    struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head);

    LIST_REMOVE(knlc, knlc_le);

    if (next_owner_lc) {
        assert(next_owner_lc->knlc_knote == kn);
        TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe);

        assert(TAILQ_EMPTY(&next_owner_lc->knlc_head));
        TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe);
        LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le);
#if DEBUG || DEVELOPMENT
        next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
    } else {
        kn->kn_status &= ~KN_LOCKED;
        if (kn->kn_inuse == 0) {
            /*
             * No f_event() in flight anymore, we can leave QoS "Merge" mode
             *
             * See knote_should_apply_qos_override()
             */
            kn->kn_status &= ~KN_MERGE_QOS;
        }
    }

    if (flags & KNOTE_KQ_UNLOCK) {
        kqunlock(kq);
    }
    if (next_owner_lc) {
        thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread);
    }
#if DEBUG || DEVELOPMENT
    knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
/*
 * Aborts all waiters for a knote lock, and unlocks the knote.
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* flags
 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
    kqlock_held(kq);

    assert(knlc->knlc_knote == kn);
    assert(kn->kn_status & KN_LOCKED);
    assert(kn->kn_status & KN_DROPPING);

    LIST_REMOVE(knlc, knlc_le);
    kn->kn_status &= ~KN_LOCKED;

    if (kqlocking == KNOTE_KQ_UNLOCK ||
        kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
        kqunlock(kq);
    }
    if (!TAILQ_EMPTY(&knlc->knlc_head)) {
        thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART);
    }
#if DEBUG || DEVELOPMENT
    knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
/*
 * Call the f_event hook of a given filter.
 *
 * Takes a use count to protect against concurrent drops.
 */
static void
knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
{
    int result, dropping = 0;

    kqlock_held(kq);

    if (kn->kn_status & (KN_DROPPING | KN_VANISHED)) {
        return;
    }

    kn->kn_inuse++;
    kqunlock(kq);
    result = filter_call(knote_fops(kn), f_event(kn, hint));
    kqlock(kq);

    dropping = (kn->kn_status & KN_DROPPING);

    if (!dropping && (result & FILTER_ACTIVE)) {
        if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
            knote_adjust_qos(kq, kn, result);
        }
        knote_activate(kn);
    }

    if (--kn->kn_inuse == 0) {
        if ((kn->kn_status & KN_LOCKED) == 0) {
            /*
             * We're the last f_event() call and there's no other f_* call in
             * flight, we can leave QoS "Merge" mode.
             *
             * See knote_should_apply_qos_override()
             */
            kn->kn_status &= ~KN_MERGE_QOS;
        }
        if (dropping) {
            waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
                CAST_EVENT64_T(&kn->kn_inuse),
                THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
        }
    }
}
/*
 * Called by knote_drop() to wait for the last f_event() caller to be done.
 *
 *	- kq locked at entry
 *	- kq unlocked at exit
 */
static void
knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
{
    wait_result_t wr = THREAD_NOT_WAITING;

    kqlock_held(kq);

    assert(kn->kn_status & KN_DROPPING);

    if (kn->kn_inuse) {
        wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
            CAST_EVENT64_T(&kn->kn_inuse),
            THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
    }
    kqunlock(kq);
    if (wr == THREAD_WAITING) {
        thread_block(THREAD_CONTINUE_NULL);
    }
}
#pragma mark file_filtops

static int
filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
{
    return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current());
}

SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
    .f_isfd = 1,
    .f_attach = filt_fileattach,
};
#pragma mark kqread_filtops

#define f_flag f_fglob->fg_flag
#define f_ops f_fglob->fg_ops
#define f_data f_fglob->fg_data
#define f_lflags f_fglob->fg_lflags

static void
filt_kqdetach(struct knote *kn)
{
    struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
    struct kqueue *kq = &kqf->kqf_kqueue;

    kqlock(kq);
    KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
    kqunlock(kq);
}

static int
filt_kqueue(struct knote *kn, __unused long hint)
{
    struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

    return kq->kq_count > 0;
}

static int
filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
{
#pragma unused(kev)
    struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
    int res;

    kqlock(kq);
    kn->kn_data = kq->kq_count;
    res = (kn->kn_data > 0);
    kqunlock(kq);

    return res;
}

static int
filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
    struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
    int res;

    kqlock(kq);
    kn->kn_data = kq->kq_count;
    res = (kn->kn_data > 0);
    if (res) {
        *kev = kn->kn_kevent;
        if (kn->kn_flags & EV_CLEAR) {
            kn->kn_data = 0;
        }
    }
    kqunlock(kq);

    return res;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
    .f_isfd = 1,
    .f_detach = filt_kqdetach,
    .f_event = filt_kqueue,
    .f_touch = filt_kqtouch,
    .f_process = filt_kqprocess,
};
#pragma mark proc_filtops

static int
filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
    struct proc *p;

    assert(PID_MAX < NOTE_PDATAMASK);

    if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
        knote_set_error(kn, ENOTSUP);
        return 0;
    }

    p = proc_find(kn->kn_id);
    if (p == NULL) {
        knote_set_error(kn, ESRCH);
        return 0;
    }

    const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

    if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
        do {
            pid_t selfpid = proc_selfpid();

            if (p->p_ppid == selfpid) {
                break;  /* parent => ok */
            }
            if ((p->p_lflag & P_LTRACED) != 0 &&
                (p->p_oppid == selfpid)) {
                break;  /* parent-in-waiting => ok */
            }
            proc_rele(p);
            knote_set_error(kn, EACCES);
            return 0;
        } while (0);
    }

    proc_klist_lock();

    kn->kn_ptr.p_proc = p;          /* store the proc handle */

    KNOTE_ATTACH(&p->p_klist, kn);

    proc_klist_unlock();

    proc_rele(p);

    /*
     * only captures edge-triggered events after this point
     * so it can't already be fired.
     */
    return 0;
}
/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. In that case,
 * the pointer to the process will have already been nulled out.
 */
static void
filt_procdetach(struct knote *kn)
{
    struct proc *p;

    proc_klist_lock();

    p = kn->kn_ptr.p_proc;
    if (p != PROC_NULL) {
        kn->kn_ptr.p_proc = PROC_NULL;
        KNOTE_DETACH(&p->p_klist, kn);
    }

    proc_klist_unlock();
}
static int
filt_proc(struct knote *kn, long hint)
{
    u_int event;

    /* ALWAYS CALLED WITH proc_klist_lock */

    /*
     * Note: a lot of bits in hint may be obtained from the knote
     * To free some of those bits, see <rdar://problem/12592988> Freeing up
     * bits in hint for filt_proc
     *
     * mask off extra data
     */
    event = (u_int)hint & NOTE_PCTRLMASK;

    /*
     * termination lifecycle events can happen while a debugger
     * has reparented a process, in which case notifications
     * should be quashed except to the tracing parent. When
     * the debugger reaps the child (either via wait4(2) or
     * process exit), the child will be reparented to the original
     * parent and these knotes re-fired.
     */
    if (event & NOTE_EXIT) {
        if ((kn->kn_ptr.p_proc->p_oppid != 0)
            && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
            /*
             * This knote is not for the current ptrace(2) parent, ignore.
             */
            return 0;
        }
    }

    /*
     * if the user is interested in this event, record it.
     */
    if (kn->kn_sfflags & event) {
        kn->kn_fflags |= event;
    }

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
    if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
        kn->kn_flags |= (EV_EOF | EV_ONESHOT);
    }
#pragma clang diagnostic pop

    /*
     * The kernel has a wrapper in place that returns the same data
     * as is collected here, in kn_data. Any changes to how
     * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
     * should also be reflected in the proc_pidnoteexit() wrapper.
     */
    if (event == NOTE_EXIT) {
        kn->kn_data = 0;
        if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
            kn->kn_fflags |= NOTE_EXITSTATUS;
            kn->kn_data |= (hint & NOTE_PDATAMASK);
        }
        if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
            kn->kn_fflags |= NOTE_EXIT_DETAIL;
            if ((kn->kn_ptr.p_proc->p_lflag &
                P_LTERM_DECRYPTFAIL) != 0) {
                kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
            }
            if ((kn->kn_ptr.p_proc->p_lflag &
                P_LTERM_JETSAM) != 0) {
                kn->kn_data |= NOTE_EXIT_MEMORY;
                switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
                case P_JETSAM_VMPAGESHORTAGE:
                    kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
                    break;
                case P_JETSAM_VMTHRASHING:
                    kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
                    break;
                case P_JETSAM_FCTHRASHING:
                    kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
                    break;
                case P_JETSAM_VNODE:
                    kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
                    break;
                case P_JETSAM_HIWAT:
                    kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
                    break;
                case P_JETSAM_PID:
                    kn->kn_data |= NOTE_EXIT_MEMORY_PID;
                    break;
                case P_JETSAM_IDLEEXIT:
                    kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
                    break;
                }
            }
            if ((kn->kn_ptr.p_proc->p_csflags &
                CS_KILLED) != 0) {
                kn->kn_data |= NOTE_EXIT_CSERROR;
            }
        }
    }

    /* if we have any matching state, activate the knote */
    return kn->kn_fflags != 0;
}
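/*
 * Illustrative sketch (not part of the kernel source): a userspace watcher
 * for the exit of `pid` that also wants the exit status would register
 * roughly as follows; on delivery, fflags contains NOTE_EXIT|NOTE_EXITSTATUS
 * and the low bits of data carry the exit status. As filt_procattach()
 * enforces, this combination is only allowed for the parent (or
 * parent-in-waiting) of the target process.
 *
 *	struct kevent ev;
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD | EV_CLEAR,
 *	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */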
static int
filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
{
    int res;

    proc_klist_lock();

    /* accept new filter flags and mask off output events no longer interesting */
    kn->kn_sfflags = kev->fflags;

    /* restrict the current results to the (smaller?) set of new interest */
    /*
     * For compatibility with previous implementations, we leave kn_fflags
     * as they were before.
     */
    //kn->kn_fflags &= kn->kn_sfflags;

    res = (kn->kn_fflags != 0);

    proc_klist_unlock();

    return res;
}
static int
filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
    int res;

    proc_klist_lock();
    res = (kn->kn_fflags != 0);
    if (res) {
        *kev = kn->kn_kevent;
        kn->kn_flags |= EV_CLEAR; /* automatically set */
        kn->kn_fflags = 0;
        kn->kn_data = 0;
    }
    proc_klist_unlock();

    return res;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
    .f_attach  = filt_procattach,
    .f_detach  = filt_procdetach,
    .f_event   = filt_proc,
    .f_touch   = filt_proctouch,
    .f_process = filt_procprocess,
};
#pragma mark timer_filtops

struct filt_timer_params {
    uint64_t deadline; /* deadline in abs/cont time
                        *  (or 0 if NOTE_ABSOLUTE and deadline is in past) */
    uint64_t leeway;   /* leeway in abstime, or 0 if none */
    uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};

/*
 * Values stored in the knote at rest (using Mach absolute time units)
 *
 * kn->kn_hook          where the thread_call object is stored
 * kn->kn_ext[0]        next deadline or 0 if immediate expiration
 * kn->kn_ext[1]        leeway value
 * kn->kn_sdata         interval timer: the interval
 *                      absolute/deadline timer: 0
 * kn->kn_hookid        timer state
 *
 * TIMER_IDLE:
 *   The timer has either never been scheduled or been cancelled.
 *   It is safe to schedule a new one in this state.
 *
 * TIMER_ARMED:
 *   The timer has been scheduled
 *
 * TIMER_FIRED:
 *   The timer has fired and an event needs to be delivered.
 *   When in this state, the callout may still be running.
 *
 * TIMER_IMMEDIATE:
 *   The timer has fired at registration time, and the callout was never
 *   dispatched.
 */
#define TIMER_IDLE       0x0
#define TIMER_ARMED      0x1
#define TIMER_FIRED      0x2
#define TIMER_IMMEDIATE  0x3
static void
filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
{
    kn->kn_ext[0] = params->deadline;
    kn->kn_ext[1] = params->leeway;
    kn->kn_sdata  = params->interval;
}
/*
 * filt_timervalidate - process data from user
 *
 * Sets up the deadline, interval, and leeway from the provided user data
 *
 * Input:
 *      kn_sdata        timer deadline or interval time
 *      kn_sfflags      style of timer, unit of measurement
 *
 * Output:
 *      struct filter_timer_params to apply to the filter with
 *      filt_timer_set_params when changes are ready to be committed.
 *
 * Returns:
 *      EINVAL          Invalid user data parameters
 *      ERANGE          Various overflows with the parameters
 *
 * Called with timer filter lock held.
 */
static int
filt_timervalidate(const struct kevent_internal_s *kev,
    struct filt_timer_params *params)
{
    /*
     * There are 5 knobs that need to be chosen for a timer registration:
     *
     * A) Units of time (what is the time duration of the specified number)
     *      Absolute and interval take:
     *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
     *      Defaults to milliseconds if not specified
     *
     * B) Clock epoch (what is the zero point of the specified number)
     *      For interval, there is none
     *      For absolute, defaults to the gettimeofday/calendar epoch
     *      With NOTE_MACHTIME, uses mach_absolute_time()
     *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
     *
     * C) The knote's behavior on delivery
     *      Interval timer causes the knote to arm for the next interval unless one-shot is set
     *      Absolute is a forced one-shot timer which deletes on delivery
     *      TODO: Add a way for absolute to be not forced one-shot
     *
     * D) Whether the time duration is relative to now or absolute
     *      Interval fires at now + duration when it is set up
     *      Absolute fires at now + difference between now walltime and passed in walltime
     *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
     *
     * E) Whether the timer continues to tick across sleep
     *      By default all three do not.
     *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
     *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
     *              expires when mach_continuous_time() is > the passed in value.
     */

    uint64_t multiplier;

    boolean_t use_abstime = FALSE;

    switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
    case NOTE_SECONDS:
        multiplier = NSEC_PER_SEC;
        break;
    case NOTE_USECONDS:
        multiplier = NSEC_PER_USEC;
        break;
    case NOTE_NSECONDS:
        multiplier = 1;
        break;
    case NOTE_MACHTIME:
        multiplier = 0;
        use_abstime = TRUE;
        break;
    case 0: /* milliseconds (default) */
        multiplier = NSEC_PER_SEC / 1000;
        break;
    default:
        return EINVAL;
    }

    /* transform the leeway in kn_ext[1] to same time scale */
    if (kev->fflags & NOTE_LEEWAY) {
        uint64_t leeway_abs;

        if (use_abstime) {
            leeway_abs = (uint64_t)kev->ext[1];
        } else {
            uint64_t leeway_ns;
            if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
                return ERANGE;
            }

            nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
        }

        params->leeway = leeway_abs;
    } else {
        params->leeway = 0;
    }

    if (kev->fflags & NOTE_ABSOLUTE) {
        uint64_t deadline_abs;

        if (use_abstime) {
            deadline_abs = (uint64_t)kev->data;
        } else {
            uint64_t calendar_deadline_ns;

            if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
                return ERANGE;
            }

            /* calendar_deadline_ns is in nanoseconds since the epoch */

            clock_sec_t seconds;
            clock_nsec_t nanoseconds;

            /*
             * Note that the conversion through wall-time is only done once.
             *
             * If the relationship between MAT and gettimeofday changes,
             * the underlying timer does not update.
             *
             * TODO: build a wall-time denominated timer_call queue
             * and a flag to request DTRTing with wall-time timers
             */
            clock_get_calendar_nanotime(&seconds, &nanoseconds);

            uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

            /* if deadline is in the future */
            if (calendar_now_ns < calendar_deadline_ns) {
                uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
                uint64_t interval_abs;

                nanoseconds_to_absolutetime(interval_ns, &interval_abs);

                /*
                 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
                 * causes the timer to keep ticking across sleep, but
                 * it does not change the calendar timebase.
                 */

                if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
                    clock_continuoustime_interval_to_deadline(interval_abs,
                        &deadline_abs);
                } else {
                    clock_absolutetime_interval_to_deadline(interval_abs,
                        &deadline_abs);
                }
            } else {
                deadline_abs = 0; /* cause immediate expiration */
            }
        }

        params->deadline = deadline_abs;
        params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
    } else if (kev->data < 0) {
        /*
         * Negative interval timers fire immediately, once.
         *
         * Ideally a negative interval would be an error, but certain clients
         * pass negative values on accident, and expect an event back.
         *
         * In the old implementation the timer would repeat with no delay
         * N times until mach_absolute_time() + (N * interval) underflowed,
         * then it would wait ~forever by accidentally arming a timer for the far future.
         *
         * We now skip the power-wasting hot spin phase and go straight to the idle phase.
         */

        params->deadline = 0; /* expire immediately */
        params->interval = 0; /* non-repeating */
    } else {
        uint64_t interval_abs = 0;

        if (use_abstime) {
            interval_abs = (uint64_t)kev->data;
        } else {
            uint64_t interval_ns;
            if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
                return ERANGE;
            }

            nanoseconds_to_absolutetime(interval_ns, &interval_abs);
        }

        uint64_t deadline = 0;

        if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
            clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
        } else {
            clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
        }

        params->deadline = deadline;
        params->interval = interval_abs;
    }

    return 0;
}
/*
 * filt_timerexpire - the timer callout routine
 */
static void
filt_timerexpire(void *knx, __unused void *spare)
{
    struct knote *kn = knx;
    int v;

    if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED,
        &v, relaxed)) {
        // our f_event always would say FILTER_ACTIVE,
        // so be leaner and just do it.
        struct kqueue *kq = knote_get_kq(kn);
        kqlock(kq);
        knote_activate(kn);
        kqunlock(kq);
    } else {
        /*
         * From TIMER_ARMED, the only allowed transitions are:
         * - to TIMER_FIRED through the timer callout just above
         * - to TIMER_IDLE due to filt_timercancel() which will wait for the
         *   timer callout (and any possible invocation of filt_timerexpire) to
         *   have finished before the state is changed again.
         */
        assert(v == TIMER_IDLE);
    }
}
static void
filt_timercancel(struct knote *kn)
{
    if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) {
        /* cancel the thread call and wait for any filt_timerexpire in flight */
        thread_call_cancel_wait((thread_call_t)kn->kn_hook);
    }
}
/*
 * Does this deadline need a timer armed for it, or has it expired?
 */
static bool
filt_timer_is_ready(struct knote *kn)
{
    uint64_t now, deadline = kn->kn_ext[0];

    if (deadline == 0) {
        return true;
    }

    if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
        now = mach_continuous_time();
    } else {
        now = mach_absolute_time();
    }
    return deadline <= now;
}
/*
 * It is the responsibility of the caller to make sure the timer call
 * has completed or been cancelled properly prior to arming it.
 */
static void
filt_timerarm(struct knote *kn)
{
    uint64_t deadline = kn->kn_ext[0];
    uint64_t leeway = kn->kn_ext[1];

    int filter_flags = kn->kn_sfflags;
    unsigned int timer_flags = 0;

    assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE);

    if (filter_flags & NOTE_CRITICAL) {
        timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
    } else if (filter_flags & NOTE_BACKGROUND) {
        timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
    } else {
        timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
    }

    if (filter_flags & NOTE_LEEWAY) {
        timer_flags |= THREAD_CALL_DELAY_LEEWAY;
    }

    if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
        timer_flags |= THREAD_CALL_CONTINUOUS;
    }

    os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed);
    thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL,
        deadline, leeway, timer_flags);
}
/*
 * Allocate a thread call for the knote's lifetime, and kick off the timer.
 */
static int
filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
{
    thread_call_t callout;
    struct filt_timer_params params;
    int error;

    if ((error = filt_timervalidate(kev, &params)) != 0) {
        knote_set_error(kn, error);
        return 0;
    }

    callout = thread_call_allocate_with_options(filt_timerexpire,
        (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
        THREAD_CALL_OPTIONS_ONCE);

    if (NULL == callout) {
        knote_set_error(kn, ENOMEM);
        return 0;
    }

    filt_timer_set_params(kn, &params);
    kn->kn_hook = callout;
    kn->kn_flags |= EV_CLEAR;
    os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);

    /* NOTE_ABSOLUTE implies EV_ONESHOT */
    if (kn->kn_sfflags & NOTE_ABSOLUTE) {
        kn->kn_flags |= EV_ONESHOT;
    }

    if (filt_timer_is_ready(kn)) {
        os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
        return FILTER_ACTIVE;
    } else {
        filt_timerarm(kn);
        return 0;
    }
}
/*
 * Shut down the timer if it's running, and free the callout.
 */
static void
filt_timerdetach(struct knote *kn)
{
    __assert_only boolean_t freed;

    /*
     * Unconditionally cancel to make sure there can't be any filt_timerexpire()
     * running anymore.
     */
    thread_call_cancel_wait((thread_call_t)kn->kn_hook);
    freed = thread_call_free((thread_call_t)kn->kn_hook);
    assert(freed);
}
/*
 * filt_timertouch - update timer knote with new user input
 *
 * Cancel and restart the timer based on new user data. When
 * the user picks up a knote, clear the count of how many timer
 * pops have gone off (in kn_data).
 */
static int
filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
{
    struct filt_timer_params params;
    uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
    int error;

    if (changed_flags & NOTE_ABSOLUTE) {
        kev->flags |= EV_ERROR;
        kev->data = EINVAL;
        return 0;
    }

    if ((error = filt_timervalidate(kev, &params)) != 0) {
        kev->flags |= EV_ERROR;
        kev->data = error;
        return 0;
    }

    /* capture the new values used to compute deadline */
    filt_timercancel(kn);
    filt_timer_set_params(kn, &params);
    kn->kn_sfflags = kev->fflags;

    if (filt_timer_is_ready(kn)) {
        os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
        return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
    } else {
        filt_timerarm(kn);
        return FILTER_UPDATE_REQ_QOS;
    }
}
/*
 * filt_timerprocess - query state of knote and snapshot event data
 *
 * Determine if the timer has fired in the past, snapshot the state
 * of the kevent for returning to user-space, and clear pending event
 * counters for the next time.
 */
static int
filt_timerprocess(
    struct knote *kn,
    __unused struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
    /*
     * filt_timerprocess is serialized with any filter routine except for
     * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
     * transition, and on success, activates the knote.
     *
     * Hence, we don't need atomic modifications of the state, only to peek at
     * whether we see any of the "FIRED" state, and if we do, it is safe to
     * do simple state machine transitions.
     */
    switch (os_atomic_load(&kn->kn_hookid, relaxed)) {
    case TIMER_IDLE:
    case TIMER_ARMED:
        /*
         * This can happen if a touch resets a timer that had fired
         * without being processed
         */
        return 0;
    }

    os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);

    /*
     * Copy out the interesting kevent state,
     * but don't leak out the raw time calculations.
     *
     * TODO: potential enhancements - tell the user about:
     *      - deadline to which this timer thought it was expiring
     *      - return kn_sfflags in the fflags field so the client can know
     *        under what flags the timer fired
     */
    *kev = kn->kn_kevent;
    kev->ext[0] = 0;
    /* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

    if (kn->kn_sdata == 0) {
        kev->data = 1;
    } else {
        /*
         * This is a 'repeating' timer, so we have to emit
         * how many intervals expired between the arm
         * and the process.
         *
         * A very strange style of interface, because
         * this could easily be done in the client...
         */

        uint64_t now;

        if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
            now = mach_continuous_time();
        } else {
            now = mach_absolute_time();
        }

        uint64_t first_deadline = kn->kn_ext[0];
        uint64_t interval_abs   = kn->kn_sdata;
        uint64_t orig_arm_time  = first_deadline - interval_abs;

        assert(now > orig_arm_time);
        assert(now > first_deadline);

        uint64_t elapsed = now - orig_arm_time;

        uint64_t num_fired = elapsed / interval_abs;

        /*
         * To reach this code, we must have seen the timer pop
         * and be in repeating mode, so therefore it must have been
         * more than 'interval' time since the attach or last
         * successful touch.
         */
        assert(num_fired > 0);

        /* report how many intervals have elapsed to the user */
        kev->data = (int64_t)num_fired;

        /* We only need to re-arm the timer if it's not about to be destroyed */
        if ((kn->kn_flags & EV_ONESHOT) == 0) {
            /* fire at the end of the next interval */
            uint64_t new_deadline = first_deadline + num_fired * interval_abs;

            assert(new_deadline > now);

            kn->kn_ext[0] = new_deadline;

            /*
             * This can't shortcut setting up the thread call, because
             * knote_process deactivates EV_CLEAR knotes unconditionally.
             */
            filt_timerarm(kn);
        }
    }

    return FILTER_ACTIVE;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
    .f_extended_codes = true,
    .f_attach   = filt_timerattach,
    .f_detach   = filt_timerdetach,
    .f_event    = filt_badevent,
    .f_touch    = filt_timertouch,
    .f_process  = filt_timerprocess,
};
#pragma mark user_filtops

static int
filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
    if (kn->kn_sfflags & NOTE_TRIGGER) {
        kn->kn_hookid = FILTER_ACTIVE;
    } else {
        kn->kn_hookid = 0;
    }
    return kn->kn_hookid;
}
static void
filt_userdetach(__unused struct knote *kn)
{
    /* EVFILT_USER knotes are not attached to anything in the kernel */
}
static int
filt_usertouch(struct knote *kn, struct kevent_internal_s *kev)
{
    uint32_t ffctrl;
    int fflags;

    ffctrl = kev->fflags & NOTE_FFCTRLMASK;
    fflags = kev->fflags & NOTE_FFLAGSMASK;
    switch (ffctrl) {
    case NOTE_FFNOP:
        break;
    case NOTE_FFAND:
        kn->kn_sfflags &= fflags;
        break;
    case NOTE_FFOR:
        kn->kn_sfflags |= fflags;
        break;
    case NOTE_FFCOPY:
        kn->kn_sfflags = fflags;
        break;
    }
    kn->kn_sdata = kev->data;

    if (kev->fflags & NOTE_TRIGGER) {
        kn->kn_hookid = FILTER_ACTIVE;
    }
    return (int)kn->kn_hookid;
}
static int
filt_userprocess(
    struct knote *kn,
    __unused struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
    int result = (int)kn->kn_hookid;

    if (result) {
        *kev = kn->kn_kevent;
        kev->fflags = kn->kn_sfflags;
        kev->data = kn->kn_sdata;
        if (kn->kn_flags & EV_CLEAR) {
            kn->kn_hookid = 0;
            kn->kn_data = 0;
            kn->kn_fflags = 0;
        }
    }

    return result;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
    .f_extended_codes = true,
    .f_attach  = filt_userattach,
    .f_detach  = filt_userdetach,
    .f_event   = filt_badevent,
    .f_touch   = filt_usertouch,
    .f_process = filt_userprocess,
};
#pragma mark workloop_filtops

static inline void
filt_wllock(struct kqworkloop *kqwl)
{
    lck_mtx_lock(&kqwl->kqwl_statelock);
}

static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
    lck_mtx_unlock(&kqwl->kqwl_statelock);
}
/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
    struct kqrequest *kqr = &kqwl->kqwl_request;
    return (kqr->kqr_state & KQR_THREQUESTED) &&
           (kqr->kqr_thread == THREAD_NULL);
}
static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
    turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
    struct kqrequest *kqr = &kqwl->kqwl_request;

    /*
     * binding to the workq should always happen through
     * workq_kern_threadreq_update_inheritor()
     */
    assert(!filt_wlturnstile_interlock_is_workq(kqwl));

    if ((inheritor = kqwl->kqwl_owner)) {
        flags |= TURNSTILE_INHERITOR_THREAD;
    } else if ((inheritor = kqr->kqr_thread)) {
        flags |= TURNSTILE_INHERITOR_THREAD;
    }

    turnstile_update_inheritor(ts, inheritor, flags);
}
#define FILT_WLATTACH 0
#define FILT_WLTOUCH  1
#define FILT_WLDROP   2
__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_internal_s *kev, kq_index_t qos_index, int op)
{
    user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
    struct kqrequest *kqr = &kqwl->kqwl_request;
    thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
    kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED;
    int action = KQWL_UTQ_NONE, error = 0;
    bool needs_wake = false, needs_wllock = false;
    uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
    uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
    uint64_t udata = 0;

    if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) {
        /*
         * If we're maybe going to change the kqwl_owner,
         * then we need to hold the filt_wllock().
         */
        needs_wllock = true;
    } else if (kqr->kqr_thread == current_thread()) {
        /*
         * <rdar://problem/41531764> Servicer updates need to be serialized with
         * any ownership change too, as the kqr_thread value influences the
         * outcome of handling NOTE_WL_DISCOVER_OWNER.
         */
        needs_wllock = true;
    }

    if (needs_wllock) {
        filt_wllock(kqwl);
        /*
         * The kqwl owner is set under both the req and filter lock,
         * meaning it's fine to look at it under any.
         */
        new_owner = cur_owner = kqwl->kqwl_owner;
    } else {
        new_owner = cur_owner = THREAD_NULL;
    }

    /*
     * If asked, load the uint64 value at the user provided address and compare
     * it against the passed in mask and expected value.
     *
     * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
     * a thread reference.
     *
     * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
     * the current thread, then end ownership.
     *
     * Lastly decide whether we need to perform a QoS update.
     */
    if (uaddr) {
        error = copyin_word(uaddr, &udata, sizeof(udata));
        if (error) {
            goto out;
        }

        /* Update state as copied in. */
        kev->ext[EV_EXTIDX_WL_VALUE] = udata;

        if ((udata & mask) != (kdata & mask)) {
            error = ESTALE;
        } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
            /*
             * Decipher the owner port name, and translate accordingly.
             * The low 2 bits were borrowed for other flags, so mask them off.
             *
             * Then attempt translation to a thread reference or fail.
             */
            mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
            if (name != MACH_PORT_NULL) {
                name = ipc_entry_name_mask(name);
                extra_thread_ref = port_name_to_thread(name);
                if (extra_thread_ref == THREAD_NULL) {
                    error = EOWNERDEAD;
                    goto out;
                }
                new_owner = extra_thread_ref;
            }
        }
    }

    if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
        new_owner = THREAD_NULL;
    }

    if (error == 0) {
        if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
            action = KQWL_UTQ_SET_QOS_INDEX;
        } else if (qos_index && kqr->kqr_qos_index != qos_index) {
            action = KQWL_UTQ_SET_QOS_INDEX;
        }

        if (op == FILT_WLTOUCH) {
            /*
             * Save off any additional fflags/data we just accepted
             * But only keep the last round of "update" bits we acted on which helps
             * debugging a lot.
             */
            kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
            kn->kn_sfflags |= kev->fflags;
            kn->kn_sdata = kev->data;
            if (kev->fflags & NOTE_WL_SYNC_WAKE) {
                needs_wake = (kn->kn_hook != THREAD_NULL);
            }
        } else if (op == FILT_WLDROP) {
            if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
                NOTE_WL_SYNC_WAIT) {
                /*
                 * When deleting a SYNC_WAIT knote that hasn't been woken up
                 * explicitly, issue a wake up.
                 */
                kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
                needs_wake = (kn->kn_hook != THREAD_NULL);
            }
        }
    }

    /*
     * Commit ownership and QoS changes if any, possibly wake up waiters
     */
    if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
        goto out;
    }

    kq_req_lock(kqwl);

    /* If already tracked as servicer, don't track as owner */
    if (new_owner == kqr->kqr_thread) {
        new_owner = THREAD_NULL;
    }

    if (cur_owner != new_owner) {
        kqwl->kqwl_owner = new_owner;
        if (new_owner == extra_thread_ref) {
            /* we just transferred this ref to kqwl_owner */
            extra_thread_ref = THREAD_NULL;
        }
        cur_owner_override = kqworkloop_owner_override(kqwl);

        if (cur_owner) {
            thread_ends_owning_workloop(cur_owner);
        }

        if (new_owner) {
            /* override it before we drop the old */
            if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
                thread_add_ipc_override(new_owner, cur_owner_override);
            }
            thread_starts_owning_workloop(new_owner);
            if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) {
                if (action == KQWL_UTQ_NONE) {
                    action = KQWL_UTQ_REDRIVE_EVENTS;
                }
            }
        } else {
            if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
                if (action == KQWL_UTQ_NONE) {
                    action = KQWL_UTQ_REDRIVE_EVENTS;
                }
            }
        }
    }

    struct turnstile *ts = kqwl->kqwl_turnstile;
    bool wl_inheritor_updated = false;

    if (action != KQWL_UTQ_NONE) {
        kqworkloop_update_threads_qos(kqwl, action, qos_index);
    }

    if (cur_owner != new_owner && ts) {
        if (action == KQWL_UTQ_REDRIVE_EVENTS) {
            /*
             * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
             * the code went through workq_kern_threadreq_initiate()
             * and the workqueue has set the inheritor already
             */
            assert(filt_wlturnstile_interlock_is_workq(kqwl));
        } else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
            workq_kern_threadreq_lock(kqwl->kqwl_p);
            workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
                ts, TURNSTILE_IMMEDIATE_UPDATE);
            workq_kern_threadreq_unlock(kqwl->kqwl_p);
            if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
                /*
                 * If the workq is no longer the interlock, then
                 * workq_kern_threadreq_update_inheritor() has finished a bind
                 * and we need to fallback to the regular path.
                 */
                filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
            }
            wl_inheritor_updated = true;
        } else {
            filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
            wl_inheritor_updated = true;
        }

        /*
         * We need a turnstile reference because we are dropping the interlock
         * and the caller has not called turnstile_prepare.
         */
        if (wl_inheritor_updated) {
            turnstile_reference(ts);
        }
    }

    if (needs_wake && ts) {
        waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn),
            (thread_t)kn->kn_hook, THREAD_AWAKENED);
    }

    kq_req_unlock(kqwl);

    if (wl_inheritor_updated) {
        turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
        turnstile_deallocate(ts);
    }

out:
    /*
     * Unlock and cleanup various lingering references and things.
     */
    if (needs_wllock) {
        filt_wlunlock(kqwl);
    }

#if CONFIG_WORKLOOP_DEBUG
    KQWL_HISTORY_WRITE_ENTRY(kqwl, {
        .updater = current_thread(),
        .servicer = kqr->kqr_thread, /* Note: racy */
        .old_owner = cur_owner,
        .new_owner = new_owner,

        .kev_ident  = kev->ident,
        .error      = (int16_t)error,
        .kev_flags  = kev->flags,
        .kev_fflags = kev->fflags,
    });
#endif // CONFIG_WORKLOOP_DEBUG

    if (cur_owner && new_owner != cur_owner) {
        if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
            thread_drop_ipc_override(cur_owner);
        }
        thread_deallocate(cur_owner);
    }

    if (extra_thread_ref) {
        thread_deallocate(extra_thread_ref);
    }
    return error;
}
/*
 * Remembers the last update that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any
 */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev,
    int error)
{
    kn->kn_fflags = kev->fflags;
    kn->kn_data = error;
    memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
2084 filt_wlattach(struct knote
*kn
, struct kevent_internal_s
*kev
)
2086 struct kqueue
*kq
= knote_get_kq(kn
);
2087 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
2089 kq_index_t qos_index
= 0;
2091 if ((kq
->kq_state
& KQ_WORKLOOP
) == 0) {
2096 #if DEVELOPMENT || DEBUG
2097 if (kev
->ident
== 0 && kev
->udata
== 0 && kev
->fflags
== 0) {
2098 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
2102 if (kqr
->kqr_dsync_waiters
) {
2103 kev
->fflags
|= NOTE_WL_SYNC_WAIT
;
2105 if (kqr
->kqr_qos_index
) {
2106 kev
->fflags
|= NOTE_WL_THREAD_REQUEST
;
2108 kev
->ext
[0] = thread_tid(kqwl
->kqwl_owner
);
2109 kev
->ext
[1] = thread_tid(kqwl
->kqwl_request
.kqr_thread
);
2110 kev
->ext
[2] = thread_owned_workloops_count(current_thread());
2111 kev
->ext
[3] = kn
->kn_kevent
.ext
[3];
2112 kq_req_unlock(kqwl
);
2118 int command
= (kn
->kn_sfflags
& NOTE_WL_COMMANDS_MASK
);
2120 case NOTE_WL_THREAD_REQUEST
:
2121 if (kn
->kn_id
!= kqwl
->kqwl_dynamicid
) {
2125 qos_index
= _pthread_priority_thread_qos(kn
->kn_qos
);
2126 if (qos_index
== THREAD_QOS_UNSPECIFIED
) {
2130 if (kqwl
->kqwl_request
.kqr_qos_index
) {
2132 * There already is a thread request, and well, you're only allowed
2133 * one per workloop, so fail the attach.
2139 case NOTE_WL_SYNC_WAIT
:
2140 case NOTE_WL_SYNC_WAKE
:
2141 if (kn
->kn_id
== kqwl
->kqwl_dynamicid
) {
2145 if ((kn
->kn_flags
& EV_DISABLE
) == 0) {
2149 if (kn
->kn_sfflags
& NOTE_WL_END_OWNERSHIP
) {
2159 error
= filt_wlupdate(kqwl
, kn
, kev
, qos_index
, FILT_WLATTACH
);
2163 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2164 if (error
== ESTALE
&& (kn
->kn_sfflags
& NOTE_WL_IGNORE_ESTALE
)) {
2167 knote_set_error(kn
, error
);
2170 if (command
== NOTE_WL_SYNC_WAIT
) {
2171 return kevent_register_wait_prepare(kn
, kev
);
2173 /* Just attaching the thread request successfully will fire it */
2174 if (command
== NOTE_WL_THREAD_REQUEST
) {
2176 * Thread Request knotes need an explicit touch to be active again,
2177 * so delivering an event needs to also consume it.
2179 kn
->kn_flags
|= EV_CLEAR
;
2180 return FILTER_ACTIVE
;
2186 filt_wlwait_continue(void *parameter
, wait_result_t wr
)
2188 struct _kevent_register
*cont_args
= parameter
;
2189 struct kqworkloop
*kqwl
= (struct kqworkloop
*)cont_args
->kq
;
2190 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
2193 kqr
->kqr_dsync_waiters
--;
2194 if (filt_wlturnstile_interlock_is_workq(kqwl
)) {
2195 workq_kern_threadreq_lock(kqwl
->kqwl_p
);
2196 turnstile_complete((uintptr_t)kqwl
, &kqwl
->kqwl_turnstile
, NULL
);
2197 workq_kern_threadreq_unlock(kqwl
->kqwl_p
);
2199 turnstile_complete((uintptr_t)kqwl
, &kqwl
->kqwl_turnstile
, NULL
);
2201 kq_req_unlock(kqwl
);
2203 turnstile_cleanup();
2205 if (wr
== THREAD_INTERRUPTED
) {
2206 cont_args
->kev
.flags
|= EV_ERROR
;
2207 cont_args
->kev
.data
= EINTR
;
2208 } else if (wr
!= THREAD_AWAKENED
) {
2209 panic("Unexpected wait result: %d", wr
);
2212 kevent_register_wait_return(cont_args
);
2216 * Called with the workloop mutex held, most of the time never returns as it
2217 * calls filt_wlwait_continue through a continuation.
2220 filt_wlpost_register_wait(struct uthread
*uth
, struct knote_lock_ctx
*knlc
,
2221 struct _kevent_register
*cont_args
)
2223 struct kqworkloop
*kqwl
= (struct kqworkloop
*)cont_args
->kq
;
2224 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
2225 struct turnstile
*ts
;
2226 bool workq_locked
= false;
2230 kqr
->kqr_dsync_waiters
++;
2232 if (filt_wlturnstile_interlock_is_workq(kqwl
)) {
2233 workq_kern_threadreq_lock(kqwl
->kqwl_p
);
2234 workq_locked
= true;
2237 ts
= turnstile_prepare((uintptr_t)kqwl
, &kqwl
->kqwl_turnstile
,
2238 TURNSTILE_NULL
, TURNSTILE_WORKLOOPS
);
2241 workq_kern_threadreq_update_inheritor(kqwl
->kqwl_p
,
2242 &kqwl
->kqwl_request
, kqwl
->kqwl_owner
, ts
,
2243 TURNSTILE_DELAYED_UPDATE
);
2244 if (!filt_wlturnstile_interlock_is_workq(kqwl
)) {
2246 * if the interlock is no longer the workqueue lock,
2247 * then we don't need to hold it anymore.
2249 workq_kern_threadreq_unlock(kqwl
->kqwl_p
);
2250 workq_locked
= false;
2253 if (!workq_locked
) {
2255 * If the interlock is the workloop's, then it's our responsibility to
2256 * call update_inheritor, so just do it.
2258 filt_wlupdate_inheritor(kqwl
, ts
, TURNSTILE_DELAYED_UPDATE
);
2261 thread_set_pending_block_hint(uth
->uu_thread
, kThreadWaitWorkloopSyncWait
);
2262 waitq_assert_wait64(&ts
->ts_waitq
, CAST_EVENT64_T(cont_args
->knote
),
2263 THREAD_ABORTSAFE
, TIMEOUT_WAIT_FOREVER
);
2266 workq_kern_threadreq_unlock(kqwl
->kqwl_p
);
2269 thread_t thread
= kqwl
->kqwl_owner
?: kqr
->kqr_thread
;
2271 thread_reference(thread
);
2273 kq_req_unlock(kqwl
);
2275 kevent_register_wait_block(ts
, thread
, knlc
, filt_wlwait_continue
, cont_args
);
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
{
	struct knote *kn = (struct knote *)event;
	assert(kdp_is_in_zone(kn, "knote zone"));

	assert(kn->kn_hook == thread);

	struct kqueue *kq = knote_get_kq(kn);
	assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	struct kqrequest *kqr = &kqwl->kqwl_request;

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr->kqr_thread;

	if (kqwl_owner != THREAD_NULL) {
		assert(kdp_is_in_zone(kqwl_owner, "threads"));

		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if (servicer != THREAD_NULL) {
		assert(kdp_is_in_zone(servicer, "threads"));

		waitinfo->owner = thread_tid(servicer);
	} else if (kqr->kqr_state & KQR_THREQUESTED) {
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
static void
filt_wldetach(__assert_only struct knote *kn)
{
	assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
	if (kn->kn_hook) {
		kevent_register_wait_cleanup(kn);
	}
}
2325 filt_wlvalidate_kev_flags(struct knote
*kn
, struct kevent_internal_s
*kev
,
2326 thread_qos_t
*qos_index
)
2328 int new_commands
= kev
->fflags
& NOTE_WL_COMMANDS_MASK
;
2329 int sav_commands
= kn
->kn_sfflags
& NOTE_WL_COMMANDS_MASK
;
2331 if ((kev
->fflags
& NOTE_WL_DISCOVER_OWNER
) && (kev
->flags
& EV_DELETE
)) {
2334 if (kev
->fflags
& NOTE_WL_UPDATE_QOS
) {
2335 if (kev
->flags
& EV_DELETE
) {
2338 if (sav_commands
!= NOTE_WL_THREAD_REQUEST
) {
2341 if (!(*qos_index
= _pthread_priority_thread_qos(kev
->qos
))) {
2346 switch (new_commands
) {
2347 case NOTE_WL_THREAD_REQUEST
:
2348 /* thread requests can only update themselves */
2349 if (sav_commands
!= NOTE_WL_THREAD_REQUEST
) {
2354 case NOTE_WL_SYNC_WAIT
:
2355 if (kev
->fflags
& NOTE_WL_END_OWNERSHIP
) {
2360 case NOTE_WL_SYNC_WAKE
:
2362 if (!(sav_commands
& (NOTE_WL_SYNC_WAIT
| NOTE_WL_SYNC_WAKE
))) {
2365 if ((kev
->flags
& (EV_ENABLE
| EV_DELETE
)) == EV_ENABLE
) {
2377 filt_wltouch(struct knote
*kn
, struct kevent_internal_s
*kev
)
2379 struct kqworkloop
*kqwl
= (struct kqworkloop
*)knote_get_kq(kn
);
2380 thread_qos_t qos_index
= THREAD_QOS_UNSPECIFIED
;
2382 int error
= filt_wlvalidate_kev_flags(kn
, kev
, &qos_index
);
2387 error
= filt_wlupdate(kqwl
, kn
, kev
, qos_index
, FILT_WLTOUCH
);
2388 filt_wlremember_last_update(kn
, kev
, error
);
2395 if (error
== ESTALE
&& (kev
->fflags
& NOTE_WL_IGNORE_ESTALE
)) {
2396 /* If userland wants ESTALE to be hidden, do not activate */
2399 kev
->flags
|= EV_ERROR
;
2403 int command
= kev
->fflags
& NOTE_WL_COMMANDS_MASK
;
2404 if (command
== NOTE_WL_SYNC_WAIT
&& !(kn
->kn_sfflags
& NOTE_WL_SYNC_WAKE
)) {
2405 return kevent_register_wait_prepare(kn
, kev
);
2407 /* Just touching the thread request successfully will fire it */
2408 if (command
== NOTE_WL_THREAD_REQUEST
) {
2409 if (kev
->fflags
& NOTE_WL_UPDATE_QOS
) {
2410 return FILTER_ACTIVE
| FILTER_UPDATE_REQ_QOS
;
2412 return FILTER_ACTIVE
;
2418 filt_wlallow_drop(struct knote
*kn
, struct kevent_internal_s
*kev
)
2420 struct kqworkloop
*kqwl
= (struct kqworkloop
*)knote_get_kq(kn
);
2422 int error
= filt_wlvalidate_kev_flags(kn
, kev
, NULL
);
2427 error
= filt_wlupdate(kqwl
, kn
, kev
, 0, FILT_WLDROP
);
2428 filt_wlremember_last_update(kn
, kev
, error
);
2435 if (error
== ESTALE
&& (kev
->fflags
& NOTE_WL_IGNORE_ESTALE
)) {
2438 kev
->flags
|= EV_ERROR
;
2448 __unused
struct filt_process_s
*data
,
2449 struct kevent_internal_s
*kev
)
2451 struct kqworkloop
*kqwl
= (struct kqworkloop
*)knote_get_kq(kn
);
2454 assert(kn
->kn_sfflags
& NOTE_WL_THREAD_REQUEST
);
2458 if (kqwl
->kqwl_owner
) {
2460 * <rdar://problem/33584321> userspace sometimes due to events being
2461 * delivered but not triggering a drain session can cause a process
2462 * of the thread request knote.
2464 * When that happens, the automatic deactivation due to process
2465 * would swallow the event, so we have to activate the knote again.
2471 #if DEBUG || DEVELOPMENT
2472 if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS
) {
2474 * see src/queue_internal.h in libdispatch
2476 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2477 user_addr_t addr
= CAST_USER_ADDR_T(kn
->kn_ext
[EV_EXTIDX_WL_ADDR
]);
2478 task_t t
= current_task();
2480 if (addr
&& task_is_active(t
) && !task_is_halting(t
) &&
2481 copyin_word(addr
, &val
, sizeof(val
)) == 0 &&
2482 val
&& (val
& DISPATCH_QUEUE_ENQUEUED
) == 0 &&
2483 (val
>> 48) != 0xdead && (val
>> 48) != 0 && (val
>> 48) != 0xffff) {
2484 panic("kevent: workloop %#016llx is not enqueued "
2485 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2486 kn
->kn_udata
, kn
, val
, kn
->kn_ext
[EV_EXTIDX_WL_VALUE
]);
2490 *kev
= kn
->kn_kevent
;
2491 kev
->fflags
= kn
->kn_sfflags
;
2492 kev
->data
= kn
->kn_sdata
;
2493 kev
->qos
= kn
->kn_qos
;
2494 rc
|= FILTER_ACTIVE
;
2497 filt_wlunlock(kqwl
);
2499 if (rc
& FILTER_ACTIVE
) {
2500 workq_thread_set_max_qos(kqwl
->kqwl_p
, &kqwl
->kqwl_request
);
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_wlattach,
	.f_detach = filt_wldetach,
	.f_event = filt_badevent,
	.f_touch = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
#pragma mark kevent / knotes

/*
 * JMM - placeholder for not-yet-implemented filters
 */
static int
filt_badevent(struct knote *kn, long hint)
{
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
	return 0;
}

static int
filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
{
	knote_set_error(kn, ENOTSUP);
	return 0;
}
2536 kqueue_alloc(struct proc
*p
, unsigned int flags
)
2538 struct filedesc
*fdp
= p
->p_fd
;
2539 struct kqueue
*kq
= NULL
;
2543 if (flags
& KEVENT_FLAG_WORKQ
) {
2544 struct kqworkq
*kqwq
;
2547 kqwq
= (struct kqworkq
*)zalloc(kqworkq_zone
);
2552 kq
= &kqwq
->kqwq_kqueue
;
2553 bzero(kqwq
, sizeof(struct kqworkq
));
2555 kqwq
->kqwq_state
= KQ_WORKQ
;
2557 for (i
= 0; i
< KQWQ_NBUCKETS
; i
++) {
2558 TAILQ_INIT(&kqwq
->kqwq_queue
[i
]);
2560 for (i
= 0; i
< KQWQ_NBUCKETS
; i
++) {
2561 if (i
!= KQWQ_QOS_MANAGER
) {
2563 * Because of how the bucketized system works, we mix overcommit
2564 * sources with not overcommit: each time we move a knote from
2565 * one bucket to the next due to overrides, we'd had to track
2566 * overcommitness, and it's really not worth it in the workloop
2567 * enabled world that track this faithfully.
2569 * Incidentally, this behaves like the original manager-based
2570 * kqwq where event delivery always happened (hence is
2573 kqwq
->kqwq_request
[i
].kqr_state
|= KQR_THOVERCOMMIT
;
2575 kqwq
->kqwq_request
[i
].kqr_qos_index
= i
;
2576 TAILQ_INIT(&kqwq
->kqwq_request
[i
].kqr_suppressed
);
2579 policy
= SYNC_POLICY_FIFO
;
2580 hook
= (void *)kqwq
;
2581 } else if (flags
& KEVENT_FLAG_WORKLOOP
) {
2582 struct kqworkloop
*kqwl
;
2585 kqwl
= (struct kqworkloop
*)zalloc(kqworkloop_zone
);
2590 bzero(kqwl
, sizeof(struct kqworkloop
));
2592 kqwl
->kqwl_state
= KQ_WORKLOOP
| KQ_DYNAMIC
;
2593 kqwl
->kqwl_retains
= 1; /* donate a retain to creator */
2594 kqwl
->kqwl_request
.kqr_state
= KQR_WORKLOOP
;
2596 kq
= &kqwl
->kqwl_kqueue
;
2597 for (i
= 0; i
< KQWL_NBUCKETS
; i
++) {
2598 TAILQ_INIT(&kqwl
->kqwl_queue
[i
]);
2600 TAILQ_INIT(&kqwl
->kqwl_request
.kqr_suppressed
);
2602 lck_mtx_init(&kqwl
->kqwl_statelock
, kq_lck_grp
, kq_lck_attr
);
2604 policy
= SYNC_POLICY_FIFO
;
2605 hook
= (void *)kqwl
;
2609 kqf
= (struct kqfile
*)zalloc(kqfile_zone
);
2614 kq
= &kqf
->kqf_kqueue
;
2615 bzero(kqf
, sizeof(struct kqfile
));
2616 TAILQ_INIT(&kqf
->kqf_queue
);
2617 TAILQ_INIT(&kqf
->kqf_suppressed
);
2619 policy
= SYNC_POLICY_FIFO
| SYNC_POLICY_PREPOST
;
2622 waitq_set_init(&kq
->kq_wqs
, policy
, NULL
, hook
);
2623 lck_spin_init(&kq
->kq_lock
, kq_lck_grp
, kq_lck_attr
);
2624 lck_spin_init(&kq
->kq_reqlock
, kq_lck_grp
, kq_lck_attr
);
2627 if (fdp
->fd_knlistsize
< 0) {
2629 if (fdp
->fd_knlistsize
< 0) {
2630 fdp
->fd_knlistsize
= 0; /* this process has had a kq */
2639 * knotes_dealloc - detach all knotes for the process and drop them
2641 * Called with proc_fdlock held.
2642 * Returns with it locked.
2643 * May drop it temporarily.
2644 * Process is in such a state that it will not try to allocate
2645 * any more knotes during this process (stopped for exit or exec).
2648 knotes_dealloc(proc_t p
)
2650 struct filedesc
*fdp
= p
->p_fd
;
2653 struct klist
*kn_hash
= NULL
;
2656 /* Close all the fd-indexed knotes up front */
2657 if (fdp
->fd_knlistsize
> 0) {
2658 for (i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
2659 while ((kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
])) != NULL
) {
2660 kq
= knote_get_kq(kn
);
2663 knote_drop(kq
, kn
, NULL
);
2667 /* free the table */
2668 FREE(fdp
->fd_knlist
, M_KQUEUE
);
2669 fdp
->fd_knlist
= NULL
;
2671 fdp
->fd_knlistsize
= -1;
2676 /* Clean out all the hashed knotes as well */
2677 if (fdp
->fd_knhashmask
!= 0) {
2678 for (i
= 0; i
<= (int)fdp
->fd_knhashmask
; i
++) {
2679 while ((kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
])) != NULL
) {
2680 kq
= knote_get_kq(kn
);
2683 knote_drop(kq
, kn
, NULL
);
2687 kn_hash
= fdp
->fd_knhash
;
2688 fdp
->fd_knhashmask
= 0;
2689 fdp
->fd_knhash
= NULL
;
2694 /* free the kn_hash table */
2696 FREE(kn_hash
, M_KQUEUE
);
/*
 * kqworkloop_invalidate
 *
 * Invalidate ownership of a workloop.
 *
 * This is meant to be used so that any remnant of overrides and ownership
 * information is dropped before a kqworkloop can no longer be found in the
 * global hash table and have ghost workloop ownership left over.
 *
 * Possibly returns a thread to deallocate in a safe context.
 */
static thread_t
kqworkloop_invalidate(struct kqworkloop *kqwl)
{
	thread_t cur_owner = kqwl->kqwl_owner;

	assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
	if (cur_owner) {
		/*
		 * If the kqueue had an owner that prevented the thread request
		 * from going through, then no unbind happened, and we may have
		 * lingering overrides to drop.
		 */
		if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_ipc_override(cur_owner);
		}
		thread_ends_owning_workloop(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;
	}

	return cur_owner;
}
2736 * kqueue_dealloc - detach all knotes from a kqueue and free it
2738 * We walk each list looking for knotes referencing this
2739 * this kqueue. If we find one, we try to drop it. But
2740 * if we fail to get a drop reference, that will wait
2741 * until it is dropped. So, we can just restart again
2742 * safe in the assumption that the list will eventually
2743 * not contain any more references to this kqueue (either
2744 * we dropped them all, or someone else did).
2746 * Assumes no new events are being added to the kqueue.
2747 * Nothing locked on entry or exit.
2749 * Workloop kqueues cant get here unless all the knotes
2750 * are already gone and all requested threads have come
2751 * and gone (cancelled or arrived).
2754 kqueue_dealloc(struct kqueue
*kq
)
2757 struct filedesc
*fdp
;
2769 * Workloops are refcounted by their knotes, so there's no point
2770 * spending a lot of time under these locks just to deallocate one.
2772 if ((kq
->kq_state
& KQ_WORKLOOP
) == 0) {
2773 KNOTE_LOCK_CTX(knlc
);
2776 for (i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
2777 kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
]);
2778 while (kn
!= NULL
) {
2779 if (kq
== knote_get_kq(kn
)) {
2782 if (knote_lock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
2783 knote_drop(kq
, kn
, &knlc
);
2786 /* start over at beginning of list */
2787 kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
]);
2790 kn
= SLIST_NEXT(kn
, kn_link
);
2797 if (fdp
->fd_knhashmask
!= 0) {
2798 for (i
= 0; i
< (int)fdp
->fd_knhashmask
+ 1; i
++) {
2799 kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
]);
2800 while (kn
!= NULL
) {
2801 if (kq
== knote_get_kq(kn
)) {
2804 if (knote_lock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
2805 knote_drop(kq
, kn
, &knlc
);
2808 /* start over at beginning of list */
2809 kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
]);
2812 kn
= SLIST_NEXT(kn
, kn_link
);
2819 if (kq
->kq_state
& KQ_WORKLOOP
) {
2820 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
2821 thread_t cur_owner
= kqworkloop_invalidate(kqwl
);
2824 thread_deallocate(cur_owner
);
2827 if (kqwl
->kqwl_request
.kqr_state
& KQR_ALLOCATED_TURNSTILE
) {
2828 struct turnstile
*ts
;
2829 turnstile_complete((uintptr_t)kqwl
, &kqwl
->kqwl_turnstile
, &ts
);
2830 turnstile_cleanup();
2831 turnstile_deallocate(ts
);
2833 assert(kqwl
->kqwl_turnstile
== NULL
);
2838 * waitq_set_deinit() remove the KQ's waitq set from
2839 * any select sets to which it may belong.
2841 waitq_set_deinit(&kq
->kq_wqs
);
2842 lck_spin_destroy(&kq
->kq_lock
, kq_lck_grp
);
2843 lck_spin_destroy(&kq
->kq_reqlock
, kq_lck_grp
);
2845 if (kq
->kq_state
& KQ_WORKQ
) {
2846 zfree(kqworkq_zone
, kq
);
2847 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
2848 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
2850 assert(kqwl
->kqwl_retains
== 0);
2851 lck_mtx_destroy(&kqwl
->kqwl_statelock
, kq_lck_grp
);
2852 zfree(kqworkloop_zone
, kqwl
);
2854 zfree(kqfile_zone
, kq
);
void
kqueue_retain(struct kqueue *kq)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	uint32_t previous;

	if ((kq->kq_state & KQ_DYNAMIC) == 0) {
		return;
	}

	previous = OSIncrementAtomic(&kqwl->kqwl_retains);
	if (previous == KQ_WORKLOOP_RETAINS_MAX) {
		panic("kq(%p) retain overflow", kq);
	}

	if (previous == 0) {
		panic("kq(%p) resurrection", kq);
	}
}

#define KQUEUE_CANT_BE_LAST_REF  0
#define KQUEUE_MIGHT_BE_LAST_REF 1
static int
kqueue_release(kqueue_t kqu, __assert_only int possibly_last)
{
	if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) {
		return 0;
	}

	assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */
	uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains);
	if (__improbable(refs == 0)) {
		panic("kq(%p) over-release", kqu.kq);
	}
	if (refs == 1) {
		assert(possibly_last);
	}
	return refs == 1;
}
static int
kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
{
	struct kqueue *kq;
	struct fileproc *fp;
	int fd, error;

	error = falloc_withalloc(p,
	    &fp, &fd, vfs_context_current(), fp_zalloc, cra);
	if (error) {
		return error;
	}

	kq = kqueue_alloc(p, 0);
	if (kq == NULL) {
		fp_free(p, fd, fp);
		return ENOMEM;
	}

	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &kqueueops;
	fp->f_data = kq;
	fp->f_lflags |= FG_CONFINED;

	proc_fdlock(p);
	*fdflags(p, fd) |= UF_EXCLOSE | UF_FORKCLOSE;
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);
	proc_fdunlock(p);

	*retval = fd;
	return error;
}

int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	return kqueue_body(p, fileproc_alloc_init, NULL, retval);
}
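/*
 * Illustrative userspace sketch (not part of this file): how the descriptor
 * returned by the kqueue() syscall above is typically used through the public
 * <sys/event.h> API.  This compiles in user space, not in the kernel.
 */
#if 0	/* example only */
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int kq = kqueue();			/* allocates a kqfile-backed fd */
	struct kevent change, event;

	/* watch stdin for readability */
	EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);

	/* register the change and block for one event */
	int n = kevent(kq, &change, 1, &event, 1, NULL);
	if (n > 0) {
		printf("fd %lu has %ld bytes ready\n",
		    (unsigned long)event.ident, (long)event.data);
	}
	close(kq);
	return 0;
}
#endif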
2940 kevent_copyin(user_addr_t
*addrp
, struct kevent_internal_s
*kevp
, struct proc
*p
,
2946 if (flags
& KEVENT_FLAG_LEGACY32
) {
2947 bzero(kevp
, sizeof(*kevp
));
2949 if (IS_64BIT_PROCESS(p
)) {
2950 struct user64_kevent kev64
;
2952 advance
= sizeof(kev64
);
2953 error
= copyin(*addrp
, (caddr_t
)&kev64
, advance
);
2957 kevp
->ident
= kev64
.ident
;
2958 kevp
->filter
= kev64
.filter
;
2959 kevp
->flags
= kev64
.flags
;
2960 kevp
->udata
= kev64
.udata
;
2961 kevp
->fflags
= kev64
.fflags
;
2962 kevp
->data
= kev64
.data
;
2964 struct user32_kevent kev32
;
2966 advance
= sizeof(kev32
);
2967 error
= copyin(*addrp
, (caddr_t
)&kev32
, advance
);
2971 kevp
->ident
= (uintptr_t)kev32
.ident
;
2972 kevp
->filter
= kev32
.filter
;
2973 kevp
->flags
= kev32
.flags
;
2974 kevp
->udata
= CAST_USER_ADDR_T(kev32
.udata
);
2975 kevp
->fflags
= kev32
.fflags
;
2976 kevp
->data
= (intptr_t)kev32
.data
;
2978 } else if (flags
& KEVENT_FLAG_LEGACY64
) {
2979 struct kevent64_s kev64
;
2981 bzero(kevp
, sizeof(*kevp
));
2983 advance
= sizeof(struct kevent64_s
);
2984 error
= copyin(*addrp
, (caddr_t
)&kev64
, advance
);
2988 kevp
->ident
= kev64
.ident
;
2989 kevp
->filter
= kev64
.filter
;
2990 kevp
->flags
= kev64
.flags
;
2991 kevp
->udata
= kev64
.udata
;
2992 kevp
->fflags
= kev64
.fflags
;
2993 kevp
->data
= kev64
.data
;
2994 kevp
->ext
[0] = kev64
.ext
[0];
2995 kevp
->ext
[1] = kev64
.ext
[1];
2997 struct kevent_qos_s kevqos
;
2999 bzero(kevp
, sizeof(*kevp
));
3001 advance
= sizeof(struct kevent_qos_s
);
3002 error
= copyin(*addrp
, (caddr_t
)&kevqos
, advance
);
3006 kevp
->ident
= kevqos
.ident
;
3007 kevp
->filter
= kevqos
.filter
;
3008 kevp
->flags
= kevqos
.flags
;
3009 kevp
->qos
= kevqos
.qos
;
3010 // kevp->xflags = kevqos.xflags;
3011 kevp
->udata
= kevqos
.udata
;
3012 kevp
->fflags
= kevqos
.fflags
;
3013 kevp
->data
= kevqos
.data
;
3014 kevp
->ext
[0] = kevqos
.ext
[0];
3015 kevp
->ext
[1] = kevqos
.ext
[1];
3016 kevp
->ext
[2] = kevqos
.ext
[2];
3017 kevp
->ext
[3] = kevqos
.ext
[3];
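/*
 * Illustrative userspace sketch (not part of this file): the KEVENT_FLAG_LEGACY64
 * path of kevent_copyin() above consumes struct kevent64_s records, which user
 * code builds with EV_SET64 and submits through kevent64().  Compiled in user
 * space; the 500ms one-shot timer is just an arbitrary example.
 */
#if 0	/* example only */
#include <sys/event.h>

/* Arm a 500ms one-shot timer on an existing kqueue descriptor. */
static int
arm_oneshot_timer(int kq)
{
	struct kevent64_s kev;

	/* ident=1, data=500 (milliseconds by default); ext[0]/ext[1] unused */
	EV_SET64(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT, 0, 500, 0, 0, 0);

	/* apply the change only; no eventlist, no timeout */
	return kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
}
#endif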
3026 kevent_copyout(struct kevent_internal_s
*kevp
, user_addr_t
*addrp
, struct proc
*p
,
3029 user_addr_t addr
= *addrp
;
3034 * fully initialize the differnt output event structure
3035 * types from the internal kevent (and some universal
3036 * defaults for fields not represented in the internal
3039 if (flags
& KEVENT_FLAG_LEGACY32
) {
3040 assert((flags
& KEVENT_FLAG_STACK_EVENTS
) == 0);
3042 if (IS_64BIT_PROCESS(p
)) {
3043 struct user64_kevent kev64
;
3045 advance
= sizeof(kev64
);
3046 bzero(&kev64
, advance
);
3049 * deal with the special case of a user-supplied
3050 * value of (uintptr_t)-1.
3052 kev64
.ident
= (kevp
->ident
== (uintptr_t)-1) ?
3053 (uint64_t)-1LL : (uint64_t)kevp
->ident
;
3055 kev64
.filter
= kevp
->filter
;
3056 kev64
.flags
= kevp
->flags
;
3057 kev64
.fflags
= kevp
->fflags
;
3058 kev64
.data
= (int64_t) kevp
->data
;
3059 kev64
.udata
= kevp
->udata
;
3060 error
= copyout((caddr_t
)&kev64
, addr
, advance
);
3062 struct user32_kevent kev32
;
3064 advance
= sizeof(kev32
);
3065 bzero(&kev32
, advance
);
3066 kev32
.ident
= (uint32_t)kevp
->ident
;
3067 kev32
.filter
= kevp
->filter
;
3068 kev32
.flags
= kevp
->flags
;
3069 kev32
.fflags
= kevp
->fflags
;
3070 kev32
.data
= (int32_t)kevp
->data
;
3071 kev32
.udata
= kevp
->udata
;
3072 error
= copyout((caddr_t
)&kev32
, addr
, advance
);
3074 } else if (flags
& KEVENT_FLAG_LEGACY64
) {
3075 struct kevent64_s kev64
;
3077 advance
= sizeof(struct kevent64_s
);
3078 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3081 bzero(&kev64
, advance
);
3082 kev64
.ident
= kevp
->ident
;
3083 kev64
.filter
= kevp
->filter
;
3084 kev64
.flags
= kevp
->flags
;
3085 kev64
.fflags
= kevp
->fflags
;
3086 kev64
.data
= (int64_t) kevp
->data
;
3087 kev64
.udata
= kevp
->udata
;
3088 kev64
.ext
[0] = kevp
->ext
[0];
3089 kev64
.ext
[1] = kevp
->ext
[1];
3090 error
= copyout((caddr_t
)&kev64
, addr
, advance
);
3092 struct kevent_qos_s kevqos
;
3094 advance
= sizeof(struct kevent_qos_s
);
3095 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3098 bzero(&kevqos
, advance
);
3099 kevqos
.ident
= kevp
->ident
;
3100 kevqos
.filter
= kevp
->filter
;
3101 kevqos
.flags
= kevp
->flags
;
3102 kevqos
.qos
= kevp
->qos
;
3103 kevqos
.udata
= kevp
->udata
;
3104 kevqos
.fflags
= kevp
->fflags
;
3106 kevqos
.data
= (int64_t) kevp
->data
;
3107 kevqos
.ext
[0] = kevp
->ext
[0];
3108 kevqos
.ext
[1] = kevp
->ext
[1];
3109 kevqos
.ext
[2] = kevp
->ext
[2];
3110 kevqos
.ext
[3] = kevp
->ext
[3];
3111 error
= copyout((caddr_t
)&kevqos
, addr
, advance
);
3114 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3117 *addrp
= addr
+ advance
;
static int
kevent_get_data_size(
	struct proc *p,
	uint64_t data_available,
	unsigned int flags,
	user_size_t *residp)
{
	user_size_t resid = 0;
	int error = 0;

	if (data_available != USER_ADDR_NULL) {
		if (flags & KEVENT_FLAG_KERNEL) {
			resid = *(user_size_t *)(uintptr_t)data_available;
		} else if (IS_64BIT_PROCESS(p)) {
			user64_size_t usize;
			error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
			resid = (user_size_t)usize;
		} else {
			user32_size_t usize;
			error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
			resid = (user_size_t)usize;
		}
		if (error) {
			return error;
		}
	}
	*residp = resid;
	return 0;
}
static int
kevent_put_data_size(
	struct proc *p,
	uint64_t data_available,
	unsigned int flags,
	user_size_t resid)
{
	int error = 0;

	if (data_available) {
		if (flags & KEVENT_FLAG_KERNEL) {
			*(user_size_t *)(uintptr_t)data_available = resid;
		} else if (IS_64BIT_PROCESS(p)) {
			user64_size_t usize = (user64_size_t)resid;
			error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
		} else {
			user32_size_t usize = (user32_size_t)resid;
			error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
		}
	}
	return error;
}
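/*
 * Illustrative sketch (example only, not in xnu): the data_available argument
 * to the two helpers above names a user_size_t in the caller's address space.
 * Going in it holds the size of the caller's data_out buffer; coming back it
 * holds the residual (unconsumed) byte count.  A hypothetical round trip:
 */
#if 0	/* example only */
static int
example_data_resid_roundtrip(struct proc *p, uint64_t data_available,
    unsigned int flags)
{
	user_size_t data_size = 0;
	int error;

	/* read the initial buffer size supplied by the caller */
	error = kevent_get_data_size(p, data_available, flags, &data_size);
	if (error) {
		return error;
	}

	/* ... a scan would consume part of the buffer here ... */
	user_size_t data_resid = data_size;

	/* publish how much of the buffer was left untouched */
	return kevent_put_data_size(p, data_available, flags, data_resid);
}
#endif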
/*
 * kevent_continue - continue a kevent syscall after blocking
 *
 * assume we inherit a use count on the kq fileglob.
 */
__attribute__((noreturn))
static void
kevent_continue(__unused struct kqueue *kq, void *data, int error)
{
	struct _kevent *cont_args;
	struct fileproc *fp;
	uint64_t data_available;
	user_size_t data_size;
	user_size_t data_resid;
	unsigned int flags;
	int32_t *retval;
	int noutputs;
	int fd;
	struct proc *p = current_proc();

	cont_args = (struct _kevent *)data;
	data_available = cont_args->data_available;
	flags = cont_args->process_data.fp_flags;
	data_size = cont_args->process_data.fp_data_size;
	data_resid = cont_args->process_data.fp_data_resid;
	noutputs = cont_args->eventout;
	retval = cont_args->retval;
	fd = cont_args->fd;
	fp = cont_args->fp;

	kevent_put_kq(p, fd, fp, kq);

	/* don't abandon other output just because of residual copyout failures */
	if (error == 0 && data_available && data_resid != data_size) {
		(void)kevent_put_data_size(p, data_available, flags, data_resid);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == EWOULDBLOCK) {
		error = 0;
	}
	if (error == 0) {
		*retval = noutputs;
	}
	unix_syscall_return(error);
}
3228 * kevent - [syscall] register and wait for kernel events
3232 kevent(struct proc
*p
, struct kevent_args
*uap
, int32_t *retval
)
3234 unsigned int flags
= KEVENT_FLAG_LEGACY32
;
3236 return kevent_internal(p
,
3237 (kqueue_id_t
)uap
->fd
, NULL
,
3238 uap
->changelist
, uap
->nchanges
,
3239 uap
->eventlist
, uap
->nevents
,
3248 kevent64(struct proc
*p
, struct kevent64_args
*uap
, int32_t *retval
)
3252 /* restrict to user flags and set legacy64 */
3253 flags
= uap
->flags
& KEVENT_FLAG_USER
;
3254 flags
|= KEVENT_FLAG_LEGACY64
;
3256 return kevent_internal(p
,
3257 (kqueue_id_t
)uap
->fd
, NULL
,
3258 uap
->changelist
, uap
->nchanges
,
3259 uap
->eventlist
, uap
->nevents
,
3268 kevent_qos(struct proc
*p
, struct kevent_qos_args
*uap
, int32_t *retval
)
3270 /* restrict to user flags */
3271 uap
->flags
&= KEVENT_FLAG_USER
;
3273 return kevent_internal(p
,
3274 (kqueue_id_t
)uap
->fd
, NULL
,
3275 uap
->changelist
, uap
->nchanges
,
3276 uap
->eventlist
, uap
->nevents
,
3277 uap
->data_out
, (uint64_t)uap
->data_available
,
3285 kevent_qos_internal(struct proc
*p
, int fd
,
3286 user_addr_t changelist
, int nchanges
,
3287 user_addr_t eventlist
, int nevents
,
3288 user_addr_t data_out
, user_size_t
*data_available
,
3292 return kevent_internal(p
,
3293 (kqueue_id_t
)fd
, NULL
,
3294 changelist
, nchanges
,
3296 data_out
, (uint64_t)data_available
,
3297 (flags
| KEVENT_FLAG_KERNEL
),
3304 kevent_id(struct proc
*p
, struct kevent_id_args
*uap
, int32_t *retval
)
3306 /* restrict to user flags */
3307 uap
->flags
&= KEVENT_FLAG_USER
;
3309 return kevent_internal(p
,
3310 (kqueue_id_t
)uap
->id
, NULL
,
3311 uap
->changelist
, uap
->nchanges
,
3312 uap
->eventlist
, uap
->nevents
,
3313 uap
->data_out
, (uint64_t)uap
->data_available
,
3314 (uap
->flags
| KEVENT_FLAG_DYNAMIC_KQUEUE
),
3321 kevent_id_internal(struct proc
*p
, kqueue_id_t
*id
,
3322 user_addr_t changelist
, int nchanges
,
3323 user_addr_t eventlist
, int nevents
,
3324 user_addr_t data_out
, user_size_t
*data_available
,
3328 return kevent_internal(p
,
3330 changelist
, nchanges
,
3332 data_out
, (uint64_t)data_available
,
3333 (flags
| KEVENT_FLAG_KERNEL
| KEVENT_FLAG_DYNAMIC_KQUEUE
),
static int
kevent_get_timeout(struct proc *p,
    user_addr_t utimeout,
    unsigned int flags,
    struct timeval *atvp)
{
	struct timeval atv, rtv;
	int error = 0;

	if (flags & KEVENT_FLAG_IMMEDIATE) {
		getmicrouptime(&atv);
	} else if (utimeout != USER_ADDR_NULL) {
		if (flags & KEVENT_FLAG_KERNEL) {
			struct timespec *tsp = (struct timespec *)utimeout;
			TIMESPEC_TO_TIMEVAL(&rtv, tsp);
		} else if (IS_64BIT_PROCESS(p)) {
			struct user64_timespec ts;
			error = copyin(utimeout, &ts, sizeof(ts));
			if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0) {
				error = EINVAL;
			} else {
				TIMESPEC_TO_TIMEVAL(&rtv, &ts);
			}
		} else {
			struct user32_timespec ts;
			error = copyin(utimeout, &ts, sizeof(ts));
			TIMESPEC_TO_TIMEVAL(&rtv, &ts);
		}
		if (error) {
			return error;
		}
		if (itimerfix(&rtv)) {
			return EINVAL;
		}
		getmicrouptime(&atv);
		timevaladd(&atv, &rtv);
	} else {
		/* wait forever value */
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	*atvp = atv;
	return 0;
}
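/*
 * Illustrative sketch (example only, not in xnu): how a relative timespec
 * becomes the absolute uptime deadline that kevent_get_timeout() produces.
 */
#if 0	/* example only */
static void
example_relative_to_deadline(struct timeval *deadline)
{
	/* a 250ms relative timeout */
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 * NSEC_PER_MSEC };
	struct timeval rtv;

	TIMESPEC_TO_TIMEVAL(&rtv, &ts);	/* nanoseconds -> microseconds */
	getmicrouptime(deadline);	/* "now" on the uptime clock */
	timevaladd(deadline, &rtv);	/* deadline = now + relative */
}
#endif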
static int
kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
{
	int error = 0;
	/* each kq should only be used for events of one type */
	kqlock(kq);
	if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
		if (flags & KEVENT_FLAG_LEGACY32) {
			if ((kq->kq_state & KQ_KEV32) == 0) {
				error = EINVAL;
			}
		} else if (kq->kq_state & KQ_KEV32) {
			error = EINVAL;
		}
	} else if (flags & KEVENT_FLAG_LEGACY32) {
		kq->kq_state |= KQ_KEV32;
	} else if (flags & KEVENT_FLAG_LEGACY64) {
		kq->kq_state |= KQ_KEV64;
	} else {
		kq->kq_state |= KQ_KEV_QOS;
	}
	kqunlock(kq);
	return error;
}
#define KQ_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
#define CONFIG_KQ_HASHSIZE	CONFIG_KN_HASHSIZE

static inline void
kqhash_lock(proc_t p)
{
	lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
}

static inline void
kqhash_lock_held(__assert_only proc_t p)
{
	LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
}

static inline void
kqhash_unlock(proc_t p)
{
	lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
}
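/*
 * Illustrative sketch (example only, not in xnu): KQ_HASH folds a dynamic
 * kqueue id into a bucket index; fd_kqhashmask is (bucket count - 1) as
 * returned by hashinit(), so the result always lands inside fd_kqhash[].
 */
#if 0	/* example only */
static struct kqlist *
example_kq_bucket(struct filedesc *fdp, kqueue_id_t id)
{
	return &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
}
#endif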
3433 kqueue_hash_init_if_needed(proc_t p
)
3435 struct filedesc
*fdp
= p
->p_fd
;
3437 kqhash_lock_held(p
);
3439 if (__improbable(fdp
->fd_kqhash
== NULL
)) {
3440 struct kqlist
*alloc_hash
;
3444 alloc_hash
= hashinit(CONFIG_KQ_HASHSIZE
, M_KQUEUE
, &alloc_mask
);
3447 /* See if we won the race */
3448 if (fdp
->fd_kqhashmask
== 0) {
3449 fdp
->fd_kqhash
= alloc_hash
;
3450 fdp
->fd_kqhashmask
= alloc_mask
;
3453 FREE(alloc_hash
, M_KQUEUE
);
3460 * Called with the kqhash_lock() held
3468 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3469 struct filedesc
*fdp
= p
->p_fd
;
3470 struct kqlist
*list
;
3472 /* should hold the kq hash lock */
3473 kqhash_lock_held(p
);
3475 if ((kq
->kq_state
& KQ_DYNAMIC
) == 0) {
3476 assert(kq
->kq_state
& KQ_DYNAMIC
);
3480 /* only dynamically allocate workloop kqs for now */
3481 assert(kq
->kq_state
& KQ_WORKLOOP
);
3482 assert(fdp
->fd_kqhash
);
3484 kqwl
->kqwl_dynamicid
= id
;
3486 list
= &fdp
->fd_kqhash
[KQ_HASH(id
, fdp
->fd_kqhashmask
)];
3487 SLIST_INSERT_HEAD(list
, kqwl
, kqwl_hashlink
);
3490 /* Called with kqhash_lock held */
3496 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3497 struct filedesc
*fdp
= p
->p_fd
;
3498 struct kqlist
*list
;
3500 /* should hold the kq hash lock */
3501 kqhash_lock_held(p
);
3503 if ((kq
->kq_state
& KQ_DYNAMIC
) == 0) {
3504 assert(kq
->kq_state
& KQ_DYNAMIC
);
3507 assert(kq
->kq_state
& KQ_WORKLOOP
); /* for now */
3508 list
= &fdp
->fd_kqhash
[KQ_HASH(kqwl
->kqwl_dynamicid
, fdp
->fd_kqhashmask
)];
3509 SLIST_REMOVE(list
, kqwl
, kqworkloop
, kqwl_hashlink
);
/* Called with kqhash_lock held */
static struct kqueue *
kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
{
	struct filedesc *fdp = p->p_fd;
	struct kqlist *list;
	struct kqworkloop *kqwl;

	/* should hold the kq hash lock */
	kqhash_lock_held(p);

	if (fdp->fd_kqhashmask == 0) {
		return NULL;
	}

	list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
		if (kqwl->kqwl_dynamicid == id) {
			struct kqueue *kq = (struct kqueue *)kqwl;

			assert(kq->kq_state & KQ_DYNAMIC);
			assert(kq->kq_state & KQ_WORKLOOP); /* for now */
			return kq;
		}
	}
	return NULL;
}
3541 kqueue_release_last(struct proc
*p
, kqueue_t kqu
)
3543 struct kqueue
*kq
= kqu
.kq
;
3544 if (kq
->kq_state
& KQ_DYNAMIC
) {
3546 if (kqueue_release(kq
, KQUEUE_MIGHT_BE_LAST_REF
)) {
3547 thread_t cur_owner
= kqworkloop_invalidate(kqu
.kqwl
);
3548 kqueue_hash_remove(p
, kq
);
3551 thread_deallocate(cur_owner
);
3561 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3562 * scheduling parameters
3564 * Called with proc_fdlock held.
3565 * Returns with it locked.
3566 * Process is in such a state that it will not try to allocate
3567 * any more knotes during this process (stopped for exit or exec).
3570 kqworkloops_dealloc(proc_t p
)
3572 struct filedesc
*fdp
= p
->p_fd
;
3573 struct kqlist
*list
;
3574 struct kqworkloop
*kqwl
, *kqwln
;
3575 struct kqlist tofree
;
3578 if (!(fdp
->fd_flags
& FD_WORKLOOP
)) {
3582 SLIST_INIT(&tofree
);
3585 assert(fdp
->fd_kqhashmask
!= 0);
3587 for (i
= 0; i
<= (int)fdp
->fd_kqhashmask
; i
++) {
3588 list
= &fdp
->fd_kqhash
[i
];
3589 SLIST_FOREACH_SAFE(kqwl
, list
, kqwl_hashlink
, kqwln
) {
3591 * kqworkloops that have scheduling parameters have an
3592 * implicit retain from kqueue_workloop_ctl that needs
3593 * to be balanced on process exit.
3595 assert(kqwl
->kqwl_params
);
3596 SLIST_REMOVE(list
, kqwl
, kqworkloop
, kqwl_hashlink
);
3597 SLIST_INSERT_HEAD(&tofree
, kqwl
, kqwl_hashlink
);
3603 SLIST_FOREACH_SAFE(kqwl
, &tofree
, kqwl_hashlink
, kqwln
) {
3604 struct kqueue
*kq
= (struct kqueue
*)kqwl
;
3605 __assert_only
bool released
;
3606 released
= kqueue_release(kq
, KQUEUE_MIGHT_BE_LAST_REF
);
static struct kqueue *
kevent_get_bound_kqworkloop(thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	struct kqrequest *kqr = ut->uu_kqr_bound;

	return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL;
}
3622 kevent_get_kq(struct proc
*p
, kqueue_id_t id
, workq_threadreq_param_t
*trp
,
3623 unsigned int flags
, struct fileproc
**fpp
, int *fdp
,
3624 struct kqueue
**kqp
)
3626 struct filedesc
*descp
= p
->p_fd
;
3627 struct fileproc
*fp
= NULL
;
3628 struct kqueue
*kq
= NULL
;
3631 thread_t th
= current_thread();
3633 assert(!trp
|| (flags
& KEVENT_FLAG_WORKLOOP
));
3635 /* Was the workloop flag passed? Then it is for sure only a workloop */
3636 if (flags
& KEVENT_FLAG_DYNAMIC_KQUEUE
) {
3637 assert(flags
& KEVENT_FLAG_WORKLOOP
);
3638 assert(!trp
|| (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
));
3639 kq
= kevent_get_bound_kqworkloop(th
);
3642 * when kevent_id_internal is called from within the
3643 * kernel, and the passed 'id' value is '-1' then we
3644 * look for the currently bound workloop kq.
3646 if (id
== (kqueue_id_t
)-1 &&
3647 (flags
& KEVENT_FLAG_KERNEL
) &&
3648 (flags
& KEVENT_FLAG_WORKLOOP
)) {
3649 if (!is_workqueue_thread(th
) || !kq
) {
3657 if (id
== 0 || id
== (kqueue_id_t
)-1) {
3661 /* try shortcut on kq lookup for bound threads */
3662 if (kq
!= NULL
&& ((struct kqworkloop
*)kq
)->kqwl_dynamicid
== id
) {
3663 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3667 /* retain a reference while working with this kq. */
3668 assert(kq
->kq_state
& KQ_DYNAMIC
);
3673 /* look for the kq on the hash table */
3675 kq
= kqueue_hash_lookup(p
, id
);
3679 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST
) {
3683 struct kqueue
*alloc_kq
;
3684 alloc_kq
= kqueue_alloc(p
, flags
);
3690 kqueue_hash_init_if_needed(p
);
3691 kq
= kqueue_hash_lookup(p
, id
);
3693 /* insert our new one */
3696 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3697 kqwl
->kqwl_params
= trp
->trp_value
;
3699 kqueue_hash_insert(p
, id
, kq
);
3701 } else if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3702 /* lost race and caller wants an error */
3704 kqueue_release(alloc_kq
, KQUEUE_MIGHT_BE_LAST_REF
);
3705 kqueue_dealloc(alloc_kq
);
3708 /* lost race, retain existing workloop */
3711 kqueue_release(alloc_kq
, KQUEUE_MIGHT_BE_LAST_REF
);
3712 kqueue_dealloc(alloc_kq
);
3715 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3720 /* retain a reference while working with this kq. */
3721 assert(kq
->kq_state
& KQ_DYNAMIC
);
3725 } else if (flags
& KEVENT_FLAG_WORKQ
) {
3726 /* must already exist for bound threads. */
3727 if (flags
& KEVENT_FLAG_KERNEL
) {
3728 assert(descp
->fd_wqkqueue
!= NULL
);
3732 * use the private kq associated with the proc workq.
3733 * Just being a thread within the process (and not
3734 * being the exit/exec thread) is enough to hold a
3735 * reference on this special kq.
3737 kq
= descp
->fd_wqkqueue
;
3739 struct kqueue
*alloc_kq
= kqueue_alloc(p
, KEVENT_FLAG_WORKQ
);
3740 if (alloc_kq
== NULL
) {
3745 if (descp
->fd_wqkqueue
== NULL
) {
3746 kq
= descp
->fd_wqkqueue
= alloc_kq
;
3750 kq
= descp
->fd_wqkqueue
;
3751 kqueue_dealloc(alloc_kq
);
3755 /* get a usecount for the kq itself */
3757 if ((error
= fp_getfkq(p
, fd
, &fp
, &kq
)) != 0) {
3761 if ((error
= kevent_set_kq_mode(kq
, flags
)) != 0) {
3762 /* drop the usecount */
3764 fp_drop(p
, fd
, fp
, 0);
3781 struct fileproc
*fp
,
3784 kqueue_release_last(p
, kq
);
3786 assert((kq
->kq_state
& KQ_WORKQ
) == 0);
3787 fp_drop(p
, (int)id
, fp
, 0);
3792 kevent_workloop_serial_no_copyin(proc_t p
, uint64_t workloop_id
)
3794 uint64_t serial_no
= 0;
3798 if (workloop_id
== 0 || p
->p_dispatchqueue_serialno_offset
== 0) {
3801 addr
= (user_addr_t
)(workloop_id
+ p
->p_dispatchqueue_serialno_offset
);
3803 if (proc_is64bit(p
)) {
3804 rc
= copyin(addr
, (caddr_t
)&serial_no
, sizeof(serial_no
));
3806 uint32_t serial_no32
= 0;
3807 rc
= copyin(addr
, (caddr_t
)&serial_no32
, sizeof(serial_no32
));
3808 serial_no
= serial_no32
;
3810 return rc
== 0 ? serial_no
: 0;
3814 kevent_exit_on_workloop_ownership_leak(thread_t thread
)
3816 proc_t p
= current_proc();
3817 struct filedesc
*fdp
= p
->p_fd
;
3818 kqueue_id_t workloop_id
= 0;
3819 os_reason_t reason
= OS_REASON_NULL
;
3820 mach_vm_address_t addr
;
3821 uint32_t reason_size
;
3824 if (fdp
->fd_kqhashmask
> 0) {
3825 for (uint32_t i
= 0; i
< fdp
->fd_kqhashmask
+ 1; i
++) {
3826 struct kqworkloop
*kqwl
;
3828 SLIST_FOREACH(kqwl
, &fdp
->fd_kqhash
[i
], kqwl_hashlink
) {
3829 struct kqueue
*kq
= &kqwl
->kqwl_kqueue
;
3830 if ((kq
->kq_state
& KQ_DYNAMIC
) && kqwl
->kqwl_owner
== thread
) {
3831 workloop_id
= kqwl
->kqwl_dynamicid
;
3839 reason
= os_reason_create(OS_REASON_LIBSYSTEM
,
3840 OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK
);
3841 if (reason
== OS_REASON_NULL
) {
3845 reason
->osr_flags
|= OS_REASON_FLAG_GENERATE_CRASH_REPORT
;
3846 reason_size
= 2 * sizeof(uint64_t);
3847 reason_size
= kcdata_estimate_required_buffer_size(2, reason_size
);
3848 if (os_reason_alloc_buffer(reason
, reason_size
) != 0) {
3853 struct kcdata_descriptor
*kcd
= &reason
->osr_kcd_descriptor
;
3855 if (kcdata_get_memory_addr(kcd
, EXIT_REASON_WORKLOOP_ID
,
3856 sizeof(workloop_id
), &addr
) == KERN_SUCCESS
) {
3857 kcdata_memcpy(kcd
, addr
, &workloop_id
, sizeof(workloop_id
));
3860 uint64_t serial_no
= kevent_workloop_serial_no_copyin(p
, workloop_id
);
3861 if (serial_no
&& kcdata_get_memory_addr(kcd
, EXIT_REASON_DISPATCH_QUEUE_NO
,
3862 sizeof(serial_no
), &addr
) == KERN_SUCCESS
) {
3863 kcdata_memcpy(kcd
, addr
, &serial_no
, sizeof(serial_no
));
3867 #if DEVELOPMENT || DEBUG
3868 if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK
) {
3869 panic("thread %p in task %p is leaked workloop 0x%016llx ownership",
3870 thread
, p
->task
, workloop_id
);
3872 psignal_try_thread_with_reason(p
, thread
, SIGABRT
, reason
);
3875 return exit_with_reason(p
, W_EXITCODE(0, SIGKILL
), (int *)NULL
,
3876 FALSE
, FALSE
, 0, reason
);
static inline boolean_t
kevent_args_requesting_events(unsigned int flags, int nevents)
{
	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
}
3887 kevent_internal(struct proc
*p
,
3888 kqueue_id_t id
, kqueue_id_t
*id_out
,
3889 user_addr_t changelist
, int nchanges
,
3890 user_addr_t ueventlist
, int nevents
,
3891 user_addr_t data_out
, uint64_t data_available
,
3893 user_addr_t utimeout
,
3894 kqueue_continue_t continuation
,
3899 struct fileproc
*fp
= NULL
;
3901 struct kevent_internal_s kev
;
3902 int error
, noutputs
, register_rc
;
3903 bool needs_end_processing
= false;
3905 user_size_t data_size
;
3906 user_size_t data_resid
;
3907 thread_t thread
= current_thread();
3908 KNOTE_LOCK_CTX(knlc
);
3910 /* Don't allow user-space threads to process output events from the workq kqs */
3911 if (((flags
& (KEVENT_FLAG_WORKQ
| KEVENT_FLAG_KERNEL
)) == KEVENT_FLAG_WORKQ
) &&
3912 kevent_args_requesting_events(flags
, nevents
)) {
3916 if (flags
& KEVENT_FLAG_PARKING
) {
3917 if (!kevent_args_requesting_events(flags
, nevents
) || id
!= (kqueue_id_t
)-1) {
3922 /* restrict dynamic kqueue allocation to workloops (for now) */
3923 if ((flags
& (KEVENT_FLAG_DYNAMIC_KQUEUE
| KEVENT_FLAG_WORKLOOP
)) == KEVENT_FLAG_DYNAMIC_KQUEUE
) {
3927 if ((flags
& (KEVENT_FLAG_WORKLOOP
)) && (flags
& (KEVENT_FLAG_WORKQ
))) {
3931 if (flags
& (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST
| KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
)) {
3932 /* allowed only on workloops when calling kevent_id from user-space */
3933 if (!(flags
& KEVENT_FLAG_WORKLOOP
) || (flags
& KEVENT_FLAG_KERNEL
) || !(flags
& KEVENT_FLAG_DYNAMIC_KQUEUE
)) {
3938 /* prepare to deal with stack-wise allocation of out events */
3939 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3940 int scale
= ((flags
& KEVENT_FLAG_LEGACY32
) ?
3941 (IS_64BIT_PROCESS(p
) ? sizeof(struct user64_kevent
) :
3942 sizeof(struct user32_kevent
)) :
3943 ((flags
& KEVENT_FLAG_LEGACY64
) ? sizeof(struct kevent64_s
) :
3944 sizeof(struct kevent_qos_s
)));
3945 ueventlist
+= nevents
* scale
;
3948 /* convert timeout to absolute - if we have one (and not immediate) */
3949 error
= kevent_get_timeout(p
, utimeout
, flags
, &atv
);
3954 /* copyin initial value of data residual from data_available */
3955 error
= kevent_get_data_size(p
, data_available
, flags
, &data_size
);
3960 /* get the kq we are going to be working on */
3961 error
= kevent_get_kq(p
, id
, NULL
, flags
, &fp
, &fd
, &kq
);
3962 #if CONFIG_WORKLOOP_DEBUG
3963 ut
= (uthread_t
)get_bsdthread_info(thread
);
3964 UU_KEVENT_HISTORY_WRITE_ENTRY(ut
, {
3966 .uu_kq
= error
? NULL
: kq
,
3968 .uu_nchanges
= nchanges
,
3969 .uu_nevents
= nevents
,
3972 #endif // CONFIG_WORKLOOP_DEBUG
3977 /* only bound threads can receive events on workloops */
3978 if (flags
& KEVENT_FLAG_WORKLOOP
) {
3979 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3980 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
3982 assert(kq
->kq_state
& KQ_WORKLOOP
);
3984 if (kevent_args_requesting_events(flags
, nevents
)) {
3985 if (kq
!= kevent_get_bound_kqworkloop(thread
)) {
3992 * Disable the R2K notification while doing a register, if the
3993 * caller wants events too, we don't want the AST to be set if we
3994 * will process these events soon.
3996 kqr
->kqr_state
&= ~KQR_R2K_NOTIF_ARMED
;
3997 needs_end_processing
= true;
4002 *id_out
= kqwl
->kqwl_dynamicid
;
4006 /* register all the change requests the user provided... */
4008 while (nchanges
> 0 && error
== 0) {
4009 error
= kevent_copyin(&changelist
, &kev
, p
, flags
);
4014 /* Make sure user doesn't pass in any system flags */
4015 kev
.flags
&= ~EV_SYSFLAGS
;
4017 register_rc
= kevent_register(kq
, &kev
, &knlc
);
4018 if (register_rc
& FILTER_REGISTER_WAIT
) {
4021 // f_post_register_wait is meant to call a continuation and not to
4022 // return, which is why we don't support FILTER_REGISTER_WAIT if
4023 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
4024 // waits isn't the last.
4026 // It is implementable, but not used by any userspace code at the
4027 // moment, so for now return ENOTSUP if someone tries to do it.
4028 if (nchanges
== 1 && nevents
>= 1 && (flags
& KEVENT_FLAG_ERROR_EVENTS
)) {
4029 struct _kevent_register
*cont_args
;
4030 /* store the continuation/completion data in the uthread */
4031 ut
= (uthread_t
)get_bsdthread_info(thread
);
4032 cont_args
= &ut
->uu_save
.uus_kevent_register
;
4033 cont_args
->kev
= kev
;
4037 cont_args
->ueventlist
= ueventlist
;
4038 cont_args
->flags
= flags
;
4039 cont_args
->retval
= retval
;
4040 cont_args
->eventcount
= nevents
;
4041 cont_args
->eventout
= noutputs
;
4042 knote_fops(cont_args
->knote
)->f_post_register_wait(ut
, &knlc
, cont_args
);
4043 panic("f_post_register_wait returned (kev: %p)", &kev
);
4046 kev
.flags
|= EV_ERROR
;
4048 knote_unlock(kq
, knlc
.knlc_knote
, &knlc
, KNOTE_KQ_UNLOCK
);
4051 // keep in sync with kevent_register_wait_return()
4052 if (nevents
> 0 && (kev
.flags
& (EV_ERROR
| EV_RECEIPT
))) {
4053 if ((kev
.flags
& EV_ERROR
) == 0) {
4054 kev
.flags
|= EV_ERROR
;
4057 error
= kevent_copyout(&kev
, &ueventlist
, p
, flags
);
4062 } else if (kev
.flags
& EV_ERROR
) {
4068 /* short-circuit the scan if we only want error events */
4069 if (flags
& KEVENT_FLAG_ERROR_EVENTS
) {
4073 /* process pending events */
4074 if (nevents
> 0 && noutputs
== 0 && error
== 0) {
4075 struct _kevent
*cont_args
;
4076 /* store the continuation/completion data in the uthread */
4077 ut
= (uthread_t
)get_bsdthread_info(thread
);
4078 cont_args
= &ut
->uu_save
.uus_kevent
;
4081 cont_args
->retval
= retval
;
4082 cont_args
->eventlist
= ueventlist
;
4083 cont_args
->eventcount
= nevents
;
4084 cont_args
->eventout
= noutputs
;
4085 cont_args
->data_available
= data_available
;
4086 cont_args
->process_data
.fp_fd
= (int)id
;
4087 cont_args
->process_data
.fp_flags
= flags
;
4088 cont_args
->process_data
.fp_data_out
= data_out
;
4089 cont_args
->process_data
.fp_data_size
= data_size
;
4090 cont_args
->process_data
.fp_data_resid
= data_size
;
4093 * kqworkloop_end_processing() will happen at the end of kqueue_scan()
4095 needs_end_processing
= false;
4097 error
= kqueue_scan(kq
, kevent_callback
,
4098 continuation
, cont_args
,
4099 &cont_args
->process_data
,
4102 /* process remaining outputs */
4103 noutputs
= cont_args
->eventout
;
4104 data_resid
= cont_args
->process_data
.fp_data_resid
;
4106 /* copyout residual data size value (if it needs to be copied out) */
4107 /* don't abandon other output just because of residual copyout failures */
4108 if (error
== 0 && data_available
&& data_resid
!= data_size
) {
4109 (void)kevent_put_data_size(p
, data_available
, flags
, data_resid
);
4114 if (__improbable(needs_end_processing
)) {
4116 * If we didn't through kqworkloop_end_processing(),
4117 * we need to do it here.
4120 kqworkloop_end_processing((struct kqworkloop
*)kq
, 0, 0);
4123 kevent_put_kq(p
, id
, fp
, kq
);
4125 /* don't restart after signals... */
4126 if (error
== ERESTART
) {
4128 } else if (error
== EWOULDBLOCK
) {
/*
 * kevent_callback - callback for each individual event
 *
 * called with nothing locked
 * caller holds a reference on the kqueue
 */
static int
kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
    void *data)
{
	struct _kevent *cont_args;
	int error;

	cont_args = (struct _kevent *)data;
	assert(cont_args->eventout < cont_args->eventcount);

	/*
	 * Copy out the appropriate amount of event data for this user.
	 */
	error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
	    cont_args->process_data.fp_flags);

	/*
	 * If there isn't space for additional events, return
	 * a harmless error to stop the processing here
	 */
	if (error == 0 && ++cont_args->eventout == cont_args->eventcount) {
		error = EWOULDBLOCK;
	}
	return error;
}
/*
 * kevent_description - format a description of a kevent for diagnostic output
 *
 * called with a 256-byte string buffer
 */
char *
kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
{
	snprintf(s, n,
	    "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
	    kevp->ident,
	    kevp->filter,
	    kevp->flags,
	    kevp->udata,
	    kevp->fflags,
	    kevp->data,
	    kevp->ext[0],
	    kevp->ext[1]);

	return s;
}
4195 kevent_register_validate_priority(struct kqueue
*kq
, struct knote
*kn
,
4196 struct kevent_internal_s
*kev
)
4198 /* We don't care about the priority of a disabled or deleted knote */
4199 if (kev
->flags
& (EV_DISABLE
| EV_DELETE
)) {
4203 if (kq
->kq_state
& KQ_WORKLOOP
) {
4205 * Workloops need valid priorities with a QOS (excluding manager) for
4206 * any enabled knote.
4208 * When it is pre-existing, just make sure it has a valid QoS as
4209 * kevent_register() will not use the incoming priority (filters who do
4210 * have the responsibility to validate it again, see filt_wltouch).
4212 * If the knote is being made, validate the incoming priority.
4214 if (!_pthread_priority_thread_qos(kn
? kn
->kn_qos
: kev
->qos
)) {
/*
 * Prepare a filter for waiting after register.
 *
 * The f_post_register_wait hook will be called later by kevent_register()
 * and should call kevent_register_wait_block()
 */
static int
kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev)
{
	thread_t thread = current_thread();
	struct uthread *uth = get_bsdthread_info(thread);

	assert(knote_fops(kn)->f_extended_codes);

	if (kn->kn_hook == NULL) {
		thread_reference(thread);
		kn->kn_hook = thread;
	} else if (kn->kn_hook != thread) {
		/*
		 * kn_hook may be set from a previous aborted wait
		 * However, it has to be from the same thread.
		 */
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	uth->uu_save.uus_kevent_register.knote = kn;
	return FILTER_REGISTER_WAIT;
}
/*
 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
 * aborted instead of properly woken up with thread_wakeup_thread().
 */
static void
kevent_register_wait_cleanup(struct knote *kn)
{
	thread_t thread = kn->kn_hook;
	kn->kn_hook = NULL;
	thread_deallocate(thread);
}
/*
 * Must be called at the end of a f_post_register_wait call from a filter.
 */
static void
kevent_register_wait_block(struct turnstile *ts, thread_t thread,
    struct knote_lock_ctx *knlc, thread_continue_t cont,
    struct _kevent_register *cont_args)
{
	knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK);
	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	cont_args->handoff_thread = thread;
	thread_handoff_parameter(thread, cont, cont_args);
}
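/*
 * Illustrative sketch (example only, not in xnu): the shape of the handoff
 * performed above.  The registering thread lends its remaining quantum to the
 * owner/servicer thread and resumes later in `cont` with `cont_args`; control
 * never returns past thread_handoff_parameter() on this stack.
 */
#if 0	/* example only */
static void
example_wait_with_handoff(thread_t owner, thread_continue_t cont, void *cont_args)
{
	thread_handoff_parameter(owner, cont, cont_args);
	/* NOTREACHED */
}
#endif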
/*
 * Called by Filters using a f_post_register_wait to return from their wait.
 */
static void
kevent_register_wait_return(struct _kevent_register *cont_args)
{
	struct kqueue *kq = cont_args->kq;
	proc_t p = kq->kq_p;
	struct kevent_internal_s *kev = &cont_args->kev;
	int error = 0;

	if (cont_args->handoff_thread) {
		thread_deallocate(cont_args->handoff_thread);
	}

	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
		if ((kev->flags & EV_ERROR) == 0) {
			kev->flags |= EV_ERROR;
			kev->data = 0;
		}
		error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags);
		if (error == 0) {
			cont_args->eventout++;
		}
	}

	kevent_put_kq(p, cont_args->fd, cont_args->fp, kq);
	if (error == 0) {
		*cont_args->retval = cont_args->eventout;
	}
	unix_syscall_return(error);
}
4313 * kevent_register - add a new event to a kqueue
4315 * Creates a mapping between the event source and
4316 * the kqueue via a knote data structure.
4318 * Because many/most the event sources are file
4319 * descriptor related, the knote is linked off
4320 * the filedescriptor table for quick access.
4322 * called with nothing locked
4323 * caller holds a reference on the kqueue
4327 kevent_register(struct kqueue
*kq
, struct kevent_internal_s
*kev
,
4328 struct knote_lock_ctx
*knlc
)
4330 struct proc
*p
= kq
->kq_p
;
4331 const struct filterops
*fops
;
4332 struct knote
*kn
= NULL
;
4333 int result
= 0, error
= 0;
4334 unsigned short kev_flags
= kev
->flags
;
4336 if (kev
->filter
< 0) {
4337 if (kev
->filter
+ EVFILT_SYSCOUNT
< 0) {
4341 fops
= sysfilt_ops
[~kev
->filter
]; /* to 0-base index */
4347 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4348 if ((kev
->flags
& EV_VANISHED
) &&
4349 (kev
->flags
& (EV_ADD
| EV_DISPATCH2
)) != (EV_ADD
| EV_DISPATCH2
)) {
4354 /* Simplify the flags - delete and disable overrule */
4355 if (kev
->flags
& EV_DELETE
) {
4356 kev
->flags
&= ~EV_ADD
;
4358 if (kev
->flags
& EV_DISABLE
) {
4359 kev
->flags
&= ~EV_ENABLE
;
4362 if (kq
->kq_state
& KQ_WORKLOOP
) {
4363 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER
),
4364 ((struct kqworkloop
*)kq
)->kqwl_dynamicid
,
4365 kev
->udata
, kev
->flags
, kev
->filter
);
4366 } else if (kq
->kq_state
& KQ_WORKQ
) {
4367 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER
),
4368 0, kev
->udata
, kev
->flags
, kev
->filter
);
4370 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER
),
4371 VM_KERNEL_UNSLIDE_OR_PERM(kq
),
4372 kev
->udata
, kev
->flags
, kev
->filter
);
4376 /* find the matching knote from the fd tables/hashes */
4377 kn
= kq_find_knote_and_kq_lock(kq
, kev
, fops
->f_isfd
, p
);
	error = kevent_register_validate_priority(kq, kn, kev);
	if (error) {
		goto out;
	}

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */
		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
		} else {
			error = ENOENT;
		}
		goto out;
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */

		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
		if (fops->f_isfd) {
			if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
				goto out;
			}
		}

		kn = knote_alloc();
		if (kn == NULL) {
			error = ENOMEM;
			if (knote_fp != NULL) {
				fp_drop(p, kev->ident, knote_fp, 0);
			}
			goto out;
		}

		kn->kn_fp = knote_fp;
		kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq;
		kqueue_retain(kq); /* retain a kq ref */
		kn->kn_filtid = ~kev->filter;
		kn->kn_status = KN_ATTACHING | KN_ATTACHED;

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;
		}

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISPATCH) {
			kn->kn_status |= KN_DISPATCH;
		}
		if (kev->flags & EV_UDATA_SPECIFIC) {
			kn->kn_status |= KN_UDATA_SPECIFIC;
		}
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;
		}

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 */
		kn->kn_kevent = *kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_sdata = kev->data;

		knote_reset_priority(kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, knlc, p);
		if (error) {
			(void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
			knote_free(kn);
			if (knote_fp != NULL) {
				fp_drop(p, kev->ident, knote_fp, 0);
			}

			if (error == ERESTART) {
				goto restart;
			}
			goto out;
		}

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			result = FILTER_ACTIVE;
		}

		kqlock(kq);

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 */
			kn->kn_status &= ~(KN_ATTACHED | KN_ATTACHING);
			error = kn->kn_data;
			knote_drop(kq, kn, knlc);
			result = 0;
			goto out;
		}

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if appropriate.
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		kn->kn_status &= ~KN_ATTACHING;
		knote_set_qos_overcommit(kn);

		if (result & FILTER_ACTIVE) {
			if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
				knote_adjust_qos(kq, kn, result);
			}
			knote_activate(kn);
		}
	} else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */
		goto restart;
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */
		if (knote_fops(kn)->f_allow_drop) {
			bool drop;

			kqunlock(kq);
			drop = knote_fops(kn)->f_allow_drop(kn, kev);
			kqlock(kq);

			if (!drop) {
				goto out_unlock;
			}
		}

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
		    (KN_DISPATCH2 | KN_DISABLED)) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;
			goto out_unlock;
		}

		knote_drop(kq, kn, knlc);
		goto out;
	} else {
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */
		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			}
		} else {
			kqunlock(kq);
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			kqlock(kq);
		}

		if (kev->flags & EV_ERROR) {
			result = 0;
		} else {
			/* accept new kevent state */
			if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
				kn->kn_udata = kev->udata;
			}
			if (kev->flags & EV_DISABLE) {
				knote_disable(kn);
			}
			if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) {
				knote_dequeue(kn);
			}
			if ((result & FILTER_UPDATE_REQ_QOS) &&
			    kev->qos && kev->qos != kn->kn_qos) {
				knote_reset_priority(kn, kev->qos);
			}
			if (result & FILTER_ACTIVE) {
				thread_qos_t qos;
				if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
					if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
						knote_apply_qos_override(kn, qos);
					}
				}
				knote_activate(kn);
			}
			if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) {
				if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
					knote_wakeup(kn);
				}
			}
			if (kev->flags & EV_ENABLE) {
				knote_enable(kn);
			}
		}
	}

out_unlock:
	if ((result & FILTER_REGISTER_WAIT) == 0) {
		/*
		 * When the filter asked for a post-register wait,
		 * we leave the knote and kqueue locked for kevent_register()
		 * to call the filter's f_post_register_wait hook.
		 */
		knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
	}

out:
	/* output local errors through the kevent */
	if (error) {
		kev->flags |= EV_ERROR;
		kev->data = error;
	}
	return result;
}
/*
 * knote_process - process a triggered event
 *
 * Validate that it is really still a triggered event
 * by calling the filter routines (if necessary).  Hold
 * a use reference on the knote to avoid it being detached.
 *
 * If it is still considered triggered, we will have taken
 * a copy of the state under the filter lock.  We use that
 * snapshot to dispatch the knote for future processing (or
 * not, if this was a lost event).
 *
 * Our caller assures us that nobody else can be processing
 * events from this knote during the whole operation.  But
 * others can be touching or posting events to the knote
 * interspersed with our processing it.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 */
static int
knote_process(struct knote *kn,
    kevent_callback_t callback,
    void *callback_data,
    struct filt_process_s *process_data)
{
	struct kevent_internal_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	bzero(&kev, sizeof(kev));

	/*
	 * Must be active or stayactive
	 * Must be queued and not disabled/suppressed
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	} else {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if ((kn->kn_status & KN_DROPPING) ||
	    !knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life.  Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		/* create fake event */
		kev.filter = kn->kn_filter;
		kev.ident = kn->kn_id;
		kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED;
		kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
		kev.udata = kn->kn_udata;
	} else {
		/* deactivate - so new activations indicate a wakeup */
		knote_deactivate(kn);

		kqunlock(kq);
		result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this
	 *            is the deferred delete event delivery itself).  Otherwise,
	 *            drop it.
	 * Dispatch:  don't clear state, just mark it disabled.
	 * Cleared:   just leave it deactivated.
	 * Others:    re-activate as there may be more events to handle.
	 *            This will not wake up more handlers right now, but
	 *            at the completion of handling events it may trigger
	 *            more handler threads (TODO: optimize based on more than
	 *            just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
			/*
			 * Stay active knotes should not be unsuppressed or we'd create an
			 * infinite loop.
			 *
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	}
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE;
			knote_disable(kn);
		} else {
			drop = true;
		}
	} else if (kn->kn_status & KN_DISPATCH) {
		/* disable all dispatch knotes */
		knote_disable(kn);
	} else if ((kev.flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kn);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	if (kev.flags & EV_VANISHED) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	error = (callback)(kq, &kev, callback_data);
	kqlock(kq);
	return error;
}
/*
 * Returns -1 if the kqueue was unbound and processing should not happen
 */
#define KQWQAE_BEGIN_PROCESSING 1
#define KQWQAE_END_PROCESSING   2
#define KQWQAE_UNBIND           3
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr,
    int kevent_flags, int kqwqae_op)
{
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
	thread_t thread = kqr->kqr_thread;
	struct knote *kn;
	int rc = 0;
	bool seen_stayactive = false, unbind;

	kqlock_held(&kqwq->kqwq_kqueue);

	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
		/*
		 * Return suppressed knotes to their original state.
		 * For workq kqueues, suppressed ones that are still
		 * truly active (not just forced into the queue) will
		 * set flags we check below to see if anything got
		 * woken up.
		 */
		while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) {
			assert(kn->kn_status & KN_SUPPRESSED);
			knote_unsuppress(kn);
			if (kn->kn_status & KN_STAYACTIVE) {
				seen_stayactive = true;
			}
		}
	}

	kq_req_lock(kqwq);

#if DEBUG || DEVELOPMENT
	thread_t self = current_thread();
	struct uthread *ut = get_bsdthread_info(self);

	assert(kqr->kqr_state & KQR_THREQUESTED);
	assert(kqr->kqr_thread == self);
	assert(ut->uu_kqr_bound == kqr);
#endif // DEBUG || DEVELOPMENT

	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) {
		/*
		 * When we unsuppress stayactive knotes, for the kind that are hooked
		 * through select, we need to process once before we can assert there's
		 * no event pending.  Hence we can't unbind during BEGIN PROCESSING.
		 */
		unbind = false;
	} else {
		unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0);
	}
	if (unbind) {
		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		rc = -1;
		/*
		 * request a new thread if we didn't process the whole queue or real events
		 * have happened (not just putting stay-active events back).
		 */
		if (kqr->kqr_state & KQR_WAKEUP) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->kqr_qos_index, 0);
		}
	}

	if (rc == 0) {
		/*
		 * Reset wakeup bit to notice events firing while we are processing,
		 * as we cannot rely on the bucket queue emptiness because of stay
		 * active knotes.
		 */
		kqr->kqr_state &= ~KQR_WAKEUP;
	}

	kq_req_unlock(kqwq);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

	return rc;
}
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
static int
kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
    int kevent_flags)
{
	int rc = 0;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    0, kqr->kqr_qos_index);

	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
	    KQWQAE_BEGIN_PROCESSING);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    thread_tid(kqr->kqr_thread), kqr->kqr_state);

	return rc;
}
static bool
kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;

	kqlock_held(kq);

	if (kq->kq_state & KQ_PROCESSING) {
		/*
		 * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
		 * never modified while KQ_PROCESSING is set, meaning that peeking at
		 * its value is safe from this context.
		 */
		return kqwl->kqwl_request.kqr_thread == current_thread();
	}
	return false;
}
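
/*
 * kqworkloop_acknowledge_events - return suppressed knotes to their
 * original state after a processing pass.
 *
 * Knotes that adjust QoS and were auto-disabled by EV_DISPATCH remain
 * suppressed so that their overrides keep pushing; the highest such
 * override index is returned to the caller.
 */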
static kq_index_t
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	kqlock_held(&kqwl->kqwl_kqueue);

	TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
		    (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, knote_get_qos_override_index(kn));
			continue;
		}
		knote_unsuppress(kn);
	}

	return qos;
}
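
/*
 * kqworkloop_begin_processing - start a processing pass on a workloop.
 *
 * Marks the kqueue KQ_PROCESSING, acknowledges previously suppressed
 * knotes, and, when KEVENT_FLAG_PARKING is set, may unbind the servicer
 * right away (returning -1 so the caller skips processing).
 */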
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
	thread_t thread = kqr->kqr_thread;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind.
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow.  For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 */
		if (kqr->kqr_state & KQR_THOVERCOMMIT) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	}
	if (op == KQWL_UTQ_NONE) {
		goto done;
	}

	qos_override = kqworkloop_acknowledge_events(kqwl);

	kq_req_lock(kqwl);

	if (op == KQWL_UTQ_UNBINDING) {
		old_override = kqworkloop_unbind_locked(kqwl, thread);
		(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
	}
	kqworkloop_update_threads_qos(kqwl, op, qos_override);
	if (op == KQWL_UTQ_PARKING) {
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			/*
			 * We cannot trust KQR_WAKEUP when looking at stay active knotes.
			 * We need to process once, and kqworkloop_end_processing will
			 * handle the unbind.
			 */
		} else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) {
			old_override = kqworkloop_unbind_locked(kqwl, thread);
			(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
			rc = -1;
		}
	} else if (op == KQWL_UTQ_UNBINDING) {
		if (kqr->kqr_thread == thread) {
			/*
			 * The thread request fired again, passed the admission check and
			 * got bound to the current thread again.
			 */
		} else {
			rc = -1;
		}
	}

	if (rc == 0) {
		/*
		 * Reset wakeup bit to notice stay active events firing while we are
		 * processing, as we cannot rely on the stayactive bucket emptiness.
		 */
		kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
	} else {
		kq->kq_state &= ~KQ_PROCESSING;
	}

	kq_req_unlock(kqwl);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

done:
	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 * May block.
 */
static int
kqfile_begin_processing(struct kqueue *kq)
{
	struct kqtailq *suppressq;

	kqlock_held(kq);

	assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	for (;;) {
		if (kq->kq_state & KQ_DRAIN) {
			KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
			    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
			return -1;
		}

		if ((kq->kq_state & KQ_PROCESSING) == 0) {
			break;
		}

		/* if someone else is processing the queue, wait */
		kq->kq_state |= KQ_PROCWAIT;
		suppressq = kqueue_get_suppressed_queue(kq, NULL);
		waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
		    CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
		    TIMEOUT_WAIT_FOREVER);

		kqunlock(kq);
		thread_block(THREAD_CONTINUE_NULL);
		kqlock(kq);
	}

	/* Nobody else processing */

	/* clear pre-posts and KQ_WAKEUP now, in case we bail early */
	waitq_set_clear_preposts(&kq->kq_wqs);
	kq->kq_state &= ~KQ_WAKEUP;

	/* anything left to process? */
	if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kq_state |= KQ_PROCESSING;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq));

	return 0;
}
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is setup again so that it is ready to be
 * processed.
 */
static int
kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
    int kevent_flags)
{
	if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) {
		/* remember we didn't process everything */
		kq_req_lock(kqwq);
		kqr->kqr_state |= KQR_WAKEUP;
		kq_req_unlock(kqwq);
	}

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * if acknowledge events "succeeds" it means there are events,
		 * which is a failure condition for end_processing.
		 */
		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
		    KQWQAE_END_PROCESSING);
		if (rc == 0) {
			return -1;
		}
	}

	return 0;
}
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is setup again so that it is ready to be
 * processed (as if kqworkloop_begin_processing had just been called).
 *
 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
 * the kqworkloop is unbound from its servicer as a side effect.
 */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	struct kqrequest *kqr = &kqwl->kqwl_request;
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
	thread_t thread = kqr->kqr_thread;
	int rc = 0;

	kqlock_held(kq);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (flags & KQ_PROCESSING) {
		assert(kq->kq_state & KQ_PROCESSING);

		/*
		 * If we still have queued stayactive knotes, remember we didn't finish
		 * processing all of them.  This should be extremely rare and would
		 * require to have a lot of them registered and fired.
		 */
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			kq_req_lock(kqwl);
			kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
			    KQWL_BUCKET_STAYACTIVE);
			kq_req_unlock(kqwl);
		}

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
		 * still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because we're
		 * inside kqueue_process(), if the workloop actually received events
		 * while our locks were dropped, we have the opportunity to fail the end
		 * processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock hence
		 * scales better.
		 */
		if (kevent_flags & KEVENT_FLAG_PARKING) {
			qos_override = kqworkloop_acknowledge_events(kqwl);
		}
	}

	kq_req_lock(kqwl);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
		if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) {
			/*
			 * Reset wakeup bit to notice stay active events firing while we are
			 * processing, as we cannot rely on the stayactive bucket emptiness.
			 */
			kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
			rc = -1;
		} else {
			old_override = kqworkloop_unbind_locked(kqwl, thread);
			(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
			kq->kq_state &= ~flags;
		}
	} else {
		kq->kq_state &= ~flags;
		kqr->kqr_state |= KQR_R2K_NOTIF_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	kq_req_unlock(kqwl);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
/*
 * Called with kqueue lock held.
 */
static void
kqfile_end_processing(struct kqueue *kq)
{
	struct knote *kn;
	struct kqtailq *suppressq;
	int procwait;

	kqlock_held(kq);

	assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/*
	 * Return suppressed knotes to their original state.
	 */
	suppressq = kqueue_get_suppressed_queue(kq, NULL);
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		assert(kn->kn_status & KN_SUPPRESSED);
		knote_unsuppress(kn);
	}

	procwait = (kq->kq_state & KQ_PROCWAIT);
	kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);

	if (procwait) {
		/* first wake up any thread already waiting to process */
		waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
		    CAST_EVENT64_T(suppressq),
		    THREAD_AWAKENED,
		    WAITQ_ALL_PRIORITIES);
	}
}
static int
kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
    struct kqueue_workloop_params *params, int *retval)
{
	int error = 0;
	int fd;
	struct fileproc *fp;
	struct kqueue *kq;
	struct kqworkloop *kqwl;
	struct filedesc *fdp = p->p_fd;
	workq_threadreq_param_t trp = { };

	switch (cmd) {
	case KQ_WORKLOOP_CREATE:
		if (!params->kqwlp_flags) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
		    (params->kqwlp_sched_pri < 1 ||
		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
		    invalid_policy(params->kqwlp_sched_pol)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
		    (params->kqwlp_cpu_percent <= 0 ||
		    params->kqwlp_cpu_percent > 100 ||
		    params->kqwlp_cpu_refillms <= 0 ||
		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
			error = EINVAL;
			break;
		}

		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
			trp.trp_flags |= TRP_PRIORITY;
			trp.trp_pri = params->kqwlp_sched_pri;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
			trp.trp_flags |= TRP_POLICY;
			trp.trp_pol = params->kqwlp_sched_pol;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
			trp.trp_flags |= TRP_CPUPERCENT;
			trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
			trp.trp_refillms = params->kqwlp_cpu_refillms;
		}

		error = kevent_get_kq(p, params->kqwlp_id, &trp,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &fp, &fd, &kq);
		if (error) {
			break;
		}

		if (!(fdp->fd_flags & FD_WORKLOOP)) {
			/* FD_WORKLOOP indicates we've ever created a workloop
			 * via this syscall but it's only ever added to a process, never
			 * removed.
			 */
			proc_fdlock(p);
			fdp->fd_flags |= FD_WORKLOOP;
			proc_fdunlock(p);
		}
		break;
	case KQ_WORKLOOP_DESTROY:
		error = kevent_get_kq(p, params->kqwlp_id, NULL,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &fp, &fd, &kq);
		if (error) {
			break;
		}
		kqlock(kq);
		kqwl = (struct kqworkloop *)kq;
		trp.trp_value = kqwl->kqwl_params;
		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
			trp.trp_flags |= TRP_RELEASED;
			kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
		} else {
			error = EINVAL;
		}
		kqunlock(kq);
		kqueue_release_last(p, kq);
		break;
	}
	return error;
}
int
kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
{
	struct kqueue_workloop_params params = { };

	if (uap->sz < sizeof(params.kqwlp_version)) {
		return EINVAL;
	}

	size_t copyin_sz = MIN(sizeof(params), uap->sz);
	int rv = copyin(uap->addr, &params, copyin_sz);
	if (rv) {
		return rv;
	}

	if (params.kqwlp_version != (int)uap->sz) {
		return EINVAL;
	}

	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
	           retval);
}
/*
 * kqueue_process - process the triggered events in a kqueue
 *
 * Walk the queued knotes and validate that they are really still triggered
 * events by calling the filter routines (if necessary).
 *
 * For each event that is still considered triggered, invoke the callback
 * routine provided.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 * kqueue list locked (held for duration of call)
 */
static int
kqueue_process(struct kqueue *kq,
    kevent_callback_t callback,
    void *callback_data,
    struct filt_process_s *process_data,
    int *countp)
{
	struct uthread *ut = get_bsdthread_info(current_thread());
	struct kqrequest *kqr = ut->uu_kqr_bound;
	struct knote *kn;
	unsigned int flags = process_data ? process_data->fp_flags : 0;
	int nevents = 0, error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
	kqueue_t kqu = { .kq = kq };
#if DEBUG || DEVELOPMENT
	int retries = 64;
#endif

	if (kq->kq_state & KQ_WORKQ) {
		if (kqr == NULL || (kqr->kqr_state & KQR_WORKLOOP)) {
			return EJUSTRETURN;
		}
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) {
			return EJUSTRETURN;
		}
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
		rc = kqfile_begin_processing(kq);
	}

	if (rc == -1) {
		/* Nothing to process */
		*countp = 0;
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	if (kq->kq_state & KQ_WORKQ) {
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index];
	} else if (kq->kq_state & KQ_WORKLOOP) {
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE];
	}

	do {
		while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
			error = knote_process(kn, callback, callback_data, process_data);
			if (error == EJUSTRETURN) {
				error = 0;
			} else {
				nevents++;
			}
			/* error is EWOULDBLOCK when the out event array is full */
		}

		if (error == EWOULDBLOCK) {
			/* break out if no more space for additional events */
			error = 0;
			break;
		}
	} while (queue-- > base_queue);

	*countp = nevents;

	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we returned events however, end processing never fails.
	 */
	if (error || nevents) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq->kq_state & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		kqfile_end_processing(kq);
		rc = 0;
	}
	if (rc == -1) {
		assert(flags & KEVENT_FLAG_PARKING);
#if DEBUG || DEVELOPMENT
		if (retries-- == 0) {
			panic("kevent: way too many knote_process retries, kq: %p (0x%02x)",
			    kq, kq->kq_state);
		}
#endif
		goto process_again;
	}
	return error;
}
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	thread_t self = current_thread();
	uthread_t ut = (uthread_t)get_bsdthread_info(self);
	struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan;
	struct kqueue *kq = (struct kqueue *)data;
	struct filt_process_s *process_data = cont_args->process_data;
	int error;

	/* convert the (previous) wait_result to a proper error */
	switch (wait_result) {
	case THREAD_AWAKENED: {
		int count;

		kqlock(kq);
retry:
		error = kqueue_process(kq, cont_args->call, cont_args->data,
		    process_data, &count);
		if (error == 0 && count == 0) {
			if (kq->kq_state & KQ_DRAIN) {
				kqunlock(kq);
				goto drain;
			}

			if (kq->kq_state & KQ_WAKEUP) {
				goto retry;
			}

			waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
			    KQ_EVENT, THREAD_ABORTSAFE,
			    cont_args->deadline);
			kq->kq_state |= KQ_SLEEP;
			kqunlock(kq);
			thread_block_parameter(kqueue_scan_continue, kq);
			/* NOTREACHED */
		}
		kqunlock(kq);
	} break;
	case THREAD_TIMED_OUT:
		error = EWOULDBLOCK;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
drain:
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__,
		    wait_result);
		error = 0;
	}

	/* call the continuation with the results */
	assert(cont_args->cont != NULL);
	(cont_args->cont)(kq, cont_args->data, error);
}
/*
 * kqueue_scan - scan and wait for events in a kqueue
 *
 * Process the triggered events in a kqueue.
 *
 * If there are no events triggered arrange to
 * wait for them.  If the caller provided a
 * continuation routine, then kevent_scan will
 * also.
 *
 * The callback routine must be valid.
 * The caller must hold a use-count reference on the kq.
 */
int
kqueue_scan(struct kqueue *kq,
    kevent_callback_t callback,
    kqueue_continue_t continuation,
    void *callback_data,
    struct filt_process_s *process_data,
    struct timeval *atvp,
    __unused struct proc *p)
{
	thread_continue_t cont = THREAD_CONTINUE_NULL;
	unsigned int flags;
	uint64_t deadline;
	int error;
	int first;
	int fd;

	assert(callback != NULL);

	/*
	 * Determine which QoS index we are servicing
	 */
	flags = (process_data) ? process_data->fp_flags : 0;
	fd = (process_data) ? process_data->fp_fd : -1;

	first = 1;
	for (;;) {
		wait_result_t wait_result;
		int count;

		/*
		 * Make a pass through the kq to find events already
		 * triggered.
		 */
		kqlock(kq);
		error = kqueue_process(kq, callback, callback_data,
		    process_data, &count);
		if (error || count) {
			break; /* lock still held */
		}
		/* looks like we have to consider blocking */
		if (first) {
			first = 0;
			/* convert the timeout to a deadline once */
			if (atvp->tv_sec || atvp->tv_usec) {
				uint64_t now;

				clock_get_uptime(&now);
				nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
				    atvp->tv_usec * (long)NSEC_PER_USEC,
				    &deadline);
				if (now >= deadline) {
					/* non-blocking call */
					error = EWOULDBLOCK;
					break; /* lock still held */
				}
				deadline -= now;
				clock_absolutetime_interval_to_deadline(deadline, &deadline);
			} else {
				deadline = 0; /* block forever */
			}

			if (continuation) {
				uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
				struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan;

				cont_args->call = callback;
				cont_args->cont = continuation;
				cont_args->deadline = deadline;
				cont_args->data = callback_data;
				cont_args->process_data = process_data;
				cont = kqueue_scan_continue;
			}
		}

		if (kq->kq_state & KQ_DRAIN) {
			kqunlock(kq);
			return EBADF;
		}

		/* If awakened during processing, try again */
		if (kq->kq_state & KQ_WAKEUP) {
			kqunlock(kq);
			continue;
		}

		/* go ahead and wait */
		waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
		    KQ_EVENT, THREAD_ABORTSAFE,
		    TIMEOUT_URGENCY_USER_NORMAL,
		    deadline, TIMEOUT_NO_LEEWAY);
		kq->kq_state |= KQ_SLEEP;
		kqunlock(kq);
		wait_result = thread_block_parameter(cont, kq);
		/* NOTREACHED if (continuation != NULL) */

		switch (wait_result) {
		case THREAD_AWAKENED:
			continue;
		case THREAD_TIMED_OUT:
			return EWOULDBLOCK;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__,
			    wait_result);
			error = 0;
		}
	}
	kqunlock(kq);
	return error;
}
5729 kqueue_read(__unused
struct fileproc
*fp
,
5730 __unused
struct uio
*uio
,
5732 __unused vfs_context_t ctx
)
5739 kqueue_write(__unused
struct fileproc
*fp
,
5740 __unused
struct uio
*uio
,
5742 __unused vfs_context_t ctx
)
5749 kqueue_ioctl(__unused
struct fileproc
*fp
,
5750 __unused u_long com
,
5751 __unused caddr_t data
,
5752 __unused vfs_context_t ctx
)
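
/*
 * kqueue_select - check whether a kqueue file descriptor is readable.
 *
 * A kqueue selects readable when at least one queued knote is truly
 * active; plain stay-active knotes are peeked at via f_peek to be sure.
 */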
static int
kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fp->f_data;
	struct kqtailq *queue;
	struct kqtailq *suppressq;
	struct knote *kn;
	int retnum = 0;

	if (which != FREAD) {
		return 0;
	}

	kqlock(kq);

	assert((kq->kq_state & KQ_WORKQ) == 0);

	/*
	 * If this is the first pass, link the wait queue associated with the
	 * the kqueue onto the wait queue set for the select().  Normally we
	 * use selrecord() for this, but it uses the wait queue within the
	 * selinfo structure and we need to use the main one for the kqueue to
	 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
	 * (The select() call will unlink them when it ends).
	 */
	if (wq_link_id != NULL) {
		thread_t cur_act = current_thread();
		struct uthread * ut = get_bsdthread_info(cur_act);

		kq->kq_state |= KQ_SEL;
		waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
		    WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);

		/* always consume the reserved link object */
		waitq_link_release(*(uint64_t *)wq_link_id);
		*(uint64_t *)wq_link_id = 0;

		/*
		 * selprocess() is expecting that we send it back the waitq
		 * that was just added to the thread's waitq set. In order
		 * to not change the selrecord() API (which is exported to
		 * kexts), we pass this value back through the
		 * void *wq_link_id pointer we were passed. We need to use
		 * memcpy here because the pointer may not be properly aligned
		 * on 32-bit systems.
		 */
		void *wqptr = &kq->kq_wqs;
		memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
	}

	if (kqfile_begin_processing(kq) == -1) {
		kqunlock(kq);
		return 0;
	}

	queue = &kq->kq_queue[QOS_INDEX_KQFILE];
	if (!TAILQ_EMPTY(queue)) {
		/*
		 * there is something queued - but it might be a
		 * KN_STAYACTIVE knote, which may or may not have
		 * any events pending.  Otherwise, we have to walk
		 * the list of knotes to see, and peek at the
		 * (non-vanished) stay-active ones to be really sure.
		 */
		while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
			if (kn->kn_status & KN_ACTIVE) {
				retnum = 1;
				goto out;
			}
			assert(kn->kn_status & KN_STAYACTIVE);
			knote_suppress(kn);
		}

		/*
		 * There were no regular events on the queue, so take
		 * a deeper look at the stay-queued ones we suppressed.
		 */
		suppressq = kqueue_get_suppressed_queue(kq, NULL);
		while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
			KNOTE_LOCK_CTX(knlc);
			int result = 0;

			/* If didn't vanish while suppressed - peek at it */
			if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
			    KNOTE_KQ_LOCK_ON_FAILURE)) {
				continue;
			}

			result = filter_call(knote_fops(kn), f_peek(kn));

			kqlock(kq);
			knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);

			/* unsuppress it */
			knote_unsuppress(kn);

			/* has data or it has to report a vanish */
			if (result & FILTER_ACTIVE) {
				retnum = 1;
				goto out;
			}
		}
	}

out:
	kqfile_end_processing(kq);
	kqunlock(kq);
	return retnum;
}
static int
kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)fg->fg_data;

	assert((kqf->kqf_state & KQ_WORKQ) == 0);
	kqueue_dealloc(&kqf->kqf_kqueue);
	fg->fg_data = NULL;
	return 0;
}
/*
 * Max depth of the nested kq path that can be created.
 * Note that this has to be less than the size of kq_level
 * to avoid wrapping around and mislabeling the level.
 */
#define MAX_NESTED_KQ 1000

/*
 * The caller has taken a use-count reference on this kqueue and will donate it
 * to the kqueue we are being added to.  This keeps the kqueue from closing until
 * that relationship is torn down.
 */
static int
kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
    __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
	struct kqueue *kq = &kqf->kqf_kqueue;
	struct kqueue *parentkq = knote_get_kq(kn);
	uint16_t plevel = 0;

	assert((kqf->kqf_state & KQ_WORKQ) == 0);

	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another.  Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol.  When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potential
	 * child (because it could indicate a cycle).  If that test
	 * passes, we just mark the nesting levels accordingly.
	 *
	 * Only up to MAX_NESTED_KQ can be nested.
	 */

	kqlock(parentkq);
	if (parentkq->kq_level > 0 &&
	    parentkq->kq_level < kq->kq_level) {
		kqunlock(parentkq);
		knote_set_error(kn, EINVAL);
		return 0;
	} else {
		/* set parent level appropriately */
		plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
		if (plevel < kq->kq_level + 1) {
			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
				kqunlock(parentkq);
				knote_set_error(kn, EINVAL);
				return 0;
			}
			plevel = kq->kq_level + 1;
		}

		parentkq->kq_level = plevel;
		kqunlock(parentkq);

		kn->kn_filtid = EVFILTID_KQREAD;
		kqlock(kq);
		KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
		/* indicate nesting in child, if needed */
		if (kq->kq_level == 0) {
			kq->kq_level = 1;
		}

		int count = kq->kq_count;
		kqunlock(kq);
		return count > 0;
	}
}
/*
 * kqueue_drain - called when kq is closed
 */
static int
kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;

	assert((kq->kq_state & KQ_WORKQ) == 0);

	kqlock(kq);
	kq->kq_state |= KQ_DRAIN;
	kqueue_interrupt(kq);
	kqunlock(kq);
	return 0;
}
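
/*
 * kqueue_stat - populate stat/stat64 for a kqueue file.
 *
 * st_size reports the number of queued events and st_blksize the kevent
 * structure size the process would use to copy them out.
 */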
int
kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
{
	assert((kq->kq_state & KQ_WORKQ) == 0);

	kqlock(kq);
	if (isstat64 != 0) {
		struct stat64 *sb64 = (struct stat64 *)ub;

		bzero((void *)sb64, sizeof(*sb64));
		sb64->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS) {
			sb64->st_blksize = sizeof(struct kevent_qos_s);
		} else if (kq->kq_state & KQ_KEV64) {
			sb64->st_blksize = sizeof(struct kevent64_s);
		} else if (IS_64BIT_PROCESS(p)) {
			sb64->st_blksize = sizeof(struct user64_kevent);
		} else {
			sb64->st_blksize = sizeof(struct user32_kevent);
		}
		sb64->st_mode = S_IFIFO;
	} else {
		struct stat *sb = (struct stat *)ub;

		bzero((void *)sb, sizeof(*sb));
		sb->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS) {
			sb->st_blksize = sizeof(struct kevent_qos_s);
		} else if (kq->kq_state & KQ_KEV64) {
			sb->st_blksize = sizeof(struct kevent64_s);
		} else if (IS_64BIT_PROCESS(p)) {
			sb->st_blksize = sizeof(struct user64_kevent);
		} else {
			sb->st_blksize = sizeof(struct user32_kevent);
		}
		sb->st_mode = S_IFIFO;
	}
	kqunlock(kq);
	return 0;
}
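
/*
 * kqueue_threadreq_can_use_ast - may a failed thread request arm an AST
 * to redrive creation?  Only allowed for the listed BSD syscalls, made
 * by the process that owns the kqueue.
 */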
static bool
kqueue_threadreq_can_use_ast(struct kqueue *kq)
{
	if (current_proc() == kq->kq_p) {
		/*
		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
		 * do combined send/receive and in the case of self-IPC, the AST may be
		 * set on a thread that will not return to userspace and needs the
		 * thread the AST would create to unblock itself.
		 *
		 * At this time, we really want to target:
		 *
		 * - kevent variants that can cause thread creations, and dispatch
		 *   really only uses kevent_qos and kevent_id,
		 *
		 * - workq_kernreturn (directly about thread creations)
		 *
		 * - bsdthread_ctl which is used for qos changes and has direct impact
		 *   on the creator thread scheduling decisions.
		 */
		switch (current_uthread()->syscall_code) {
		case SYS_kevent_qos:
		case SYS_kevent_id:
		case SYS_workq_kernreturn:
		case SYS_bsdthread_ctl:
			return true;
		}
	}
	return false;
}
/*
 * Interact with the pthread kext to request a servicing there at a specific QoS
 * level.
 *
 * - Caller holds the workq request lock
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr,
    kq_index_t qos, int flags)
{
	assert(kqr->kqr_state & KQR_WAKEUP);
	assert(kqr->kqr_thread == THREAD_NULL);
	assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
	struct turnstile *ts = TURNSTILE_NULL;

	if (workq_is_exiting(kq->kq_p)) {
		return;
	}

	/* Add a thread request reference on the kqueue. */
	kqueue_retain(kq);

	kq_req_held(kq);

	if (kq->kq_state & KQ_WORKLOOP) {
		__assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		assert(kqwl->kqwl_owner == THREAD_NULL);
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
		    kqwl->kqwl_dynamicid, 0, qos, kqr->kqr_state);
		ts = kqwl->kqwl_turnstile;
	} else {
		assert(kq->kq_state & KQ_WORKQ);
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
		    -1, 0, qos, kqr->kqr_state);
	}

	kqr->kqr_state |= KQR_THREQUESTED;

	/*
	 * New-style thread request supported.
	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
	 * its use until a corresponding kqueue_threadreq_bind callback.
	 */
	if (kqueue_threadreq_can_use_ast(kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	if (qos == KQWQ_QOS_MANAGER) {
		qos = WORKQ_THREAD_QOS_MANAGER;
	}
	if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
		/*
		 * Process is shutting down or exec'ing.
		 * All the kqueues are going to be cleaned up
		 * soon. Forget we even asked for a thread -
		 * and make sure we don't ask for more.
		 */
		kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
		kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
	}
}
/*
 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
 *
 * This is used when kqueue_threadreq_bind may cause a lock inversion.
 */
void
kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req,
    thread_t thread)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	struct uthread *ut = get_bsdthread_info(thread);

	req->tr_binding_thread = thread;
	ut->uu_kqr_bound = kqr;
	req->tr_state = TR_STATE_BINDING;

	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
	if (kqwl && kqwl->kqwl_turnstile) {
		struct turnstile *ts = kqwl->kqwl_turnstile;
		/*
		 * While a thread request is in flight, the workqueue
		 * is the interlock for the turnstile and can update the inheritor.
		 */
		turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE |
		    TURNSTILE_INHERITOR_THREAD);
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	}
}
/*
 * kqueue_threadreq_bind_commit - commit a bind prepost
 *
 * The workq code has to commit any binding prepost before the thread has
 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
 */
void
kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	struct kqrequest *kqr = ut->uu_kqr_bound;
	kqueue_t kqu = kqr_kqueue(p, kqr);

	kq_req_lock(kqu);
	if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
		kqueue_threadreq_bind(p, &kqr->kqr_req, thread, 0);
	}
	kq_req_unlock(kqu);
}
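
/*
 * kqueue_threadreq_modify - change the QoS of an outstanding (still
 * unbound) thread request through the workqueue subsystem.
 */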
static void
kqueue_threadreq_modify(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos)
{
	assert(kqr->kqr_state & KQR_THREQUESTED);
	assert(kqr->kqr_thread == THREAD_NULL);

	kq_req_held(kq);

	int flags = 0;
	if (kqueue_threadreq_can_use_ast(kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags);
}
/*
 * kqueue_threadreq_bind - bind thread to processing kqrequest
 *
 * The provided thread will be responsible for delivering events
 * associated with the given kqrequest.  Bind it and get ready for
 * the thread to eventually arrive.
 */
void
kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread,
    unsigned int flags)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	kqueue_t kqu = kqr_kqueue(p, kqr);
	struct uthread *ut = get_bsdthread_info(thread);

	kq_req_held(kqu);

	assert(kqr->kqr_state & KQR_THREQUESTED);
	assert(kqr->kqr_thread == THREAD_NULL);
	assert(ut->uu_kqueue_override == 0);

	if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
		assert(ut->uu_kqr_bound == kqr);
		assert(kqr->kqr_req.tr_binding_thread == thread);
		kqr->kqr_req.tr_state = TR_STATE_IDLE;
		kqr->kqr_req.tr_binding_thread = NULL;
	} else {
		assert(ut->uu_kqr_bound == NULL);
	}

	ut->uu_kqr_bound = kqr;
	kqr->kqr_thread = thread;

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;

		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
			/*
			 * <rdar://problem/38626999> shows that asserting here is not ok.
			 *
			 * This is not supposed to happen for correct use of the interface,
			 * but it is sadly possible for userspace (with the help of memory
			 * corruption, such as over-release of a dispatch queue) to make
			 * the creator thread the "owner" of a workloop.
			 *
			 * Once that happens, and that creator thread picks up the same
			 * workloop as a servicer, we trip this codepath. We need to fixup
			 * the state to forget about this thread being the owner, as the
			 * entire workloop state machine expects servicers to never be
			 * owners and everything would basically go downhill from here.
			 */
			kqu.kqwl->kqwl_owner = THREAD_NULL;
			if (kqworkloop_owner_override(kqu.kqwl)) {
				thread_drop_ipc_override(thread);
			}
			thread_ends_owning_workloop(thread);
		}

		if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
			/*
			 * Past this point, the interlock is the kq req lock again,
			 * so we can fix the inheritor for good.
			 */
			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
		}

		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
		    thread_tid(thread), kqr->kqr_qos_index,
		    (kqr->kqr_override_index << 16) | kqr->kqr_state);

		ut->uu_kqueue_override = kqr->kqr_override_index;
		if (kqr->kqr_override_index) {
			thread_add_ipc_override(thread, kqr->kqr_override_index);
		}
	} else {
		assert(kqr->kqr_override_index == 0);

		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
		    thread_tid(thread), kqr->kqr_qos_index,
		    (kqr->kqr_override_index << 16) | kqr->kqr_state);
	}
}
/*
 * kqueue_threadreq_cancel - abort a pending thread request
 *
 * Called when exiting/exec'ing. Forget our pending request.
 */
void
kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	kqueue_t kqu = kqr_kqueue(p, kqr);

	kq_req_lock(kqu);

	assert(kqr->kqr_thread == THREAD_NULL);
	assert(kqr->kqr_state & KQR_THREQUESTED);
	kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);

	kq_req_unlock(kqu);

	kqueue_release_last(p, kqu); /* may dealloc kqu */
}
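
/*
 * kqueue_threadreq_workloop_param - return the scheduling parameters a
 * dynamic workloop was created with (trp snapshot stored in kqwl_params).
 */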
workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t req)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	struct kqworkloop *kqwl;
	workq_threadreq_param_t trp;

	assert(kqr->kqr_state & KQR_WORKLOOP);
	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
	trp.trp_value = kqwl->kqwl_params;
	return trp;
}
/*
 * kqueue_threadreq_unbind - unbind thread from processing kqueue
 *
 * End processing the per-QoS bucket of events and allow other threads
 * to be requested for future servicing.
 *
 * caller holds a reference on the kqueue.
 */
void
kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr)
{
	if (kqr->kqr_state & KQR_WORKLOOP) {
		kqworkloop_unbind(p, kqr_kqworkloop(kqr));
	} else {
		kqworkq_unbind(p, kqr);
	}
}
/*
 * If we aren't already busy processing events [for this QoS],
 * request workq thread support as appropriate.
 *
 * TBD - for now, we don't segregate out processing by QoS.
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index)
{
	struct kqrequest *kqr;

	/* convert to thread qos value */
	assert(qos_index < KQWQ_NBUCKETS);

	kq_req_lock(kqwq);
	kqr = kqworkq_get_request(kqwq, qos_index);

	if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
		kqr->kqr_state |= KQR_WAKEUP;
		if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
		}
	}
	kq_req_unlock(kqwq);
}
static kq_index_t
kqworkloop_owner_override(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	return MAX(kqr->kqr_qos_index, kqr->kqr_override_index);
}
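
/*
 * Fire the armed return-to-kernel notification: if KQR_R2K_NOTIF_ARMED
 * is set, post an AST on the bound servicer thread and disarm it.
 */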
static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;

	kq_req_held(kqwl);

	if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
		assert(kqr->kqr_thread);
		kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
		act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
	}
}
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	kq_index_t old_owner_override = kqworkloop_owner_override(kqwl);
	kq_index_t i;

	/* must hold the kqr lock */
	kq_req_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		if (qos == KQWL_BUCKET_STAYACTIVE) {
			/*
			 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
			 * a high watermark (kqr_stayactive_qos) of any stay active knote
			 * that was ever registered with this workloop.
			 *
			 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
			 * knote, we use this high-watermark as a wakeup-index, and also set
			 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
			 * there is at least one stay active knote fired until the next full
			 * processing of this bucket.
			 */
			kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
			qos = kqr->kqr_stayactive_qos;
			assert(qos);
		}
		if (kqr->kqr_wakeup_indexes & (1 << qos)) {
			assert(kqr->kqr_state & KQR_WAKEUP);
			break;
		}

		kqr->kqr_wakeup_indexes |= (1 << qos);
		kqr->kqr_state |= KQR_WAKEUP;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
		assert(qos);
		if (kqr->kqr_stayactive_qos < qos) {
			kqr->kqr_stayactive_qos = qos;
			if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
				assert(kqr->kqr_state & KQR_WAKEUP);
				kqr->kqr_wakeup_indexes |= (1 << qos);
				goto recompute;
			}
		}
		break;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->kqr_override_index = qos;
	/* FALLTHROUGH */
	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		kqlock_held(kqwl); // to look at kq_queues
		i = KQWL_BUCKET_STAYACTIVE;
		if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
			kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
		}
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
		    (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
			/*
			 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
			 * knote may have fired, so we need to merge in kqr_stayactive_qos.
			 *
			 * Unlike other buckets, this one is never empty but could be idle.
			 */
			kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
			kqr->kqr_wakeup_indexes |= (1 << kqr->kqr_stayactive_qos);
		} else {
			kqr->kqr_wakeup_indexes = 0;
		}
		for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
				kqr->kqr_wakeup_indexes |= (1 << i);
			}
		}
		if (kqr->kqr_wakeup_indexes) {
			kqr->kqr_state |= KQR_WAKEUP;
			kqworkloop_request_fire_r2k_notification(kqwl);
		} else {
			kqr->kqr_state &= ~KQR_WAKEUP;
		}
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->kqr_override_index = qos;
		goto recompute;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overriden
		 * suppressed knote pushing on the kqueue.
		 */
		if (kqr->kqr_wakeup_indexes > (1 << qos)) {
			qos = fls(kqr->kqr_wakeup_indexes) - 1; /* fls is 1-based */
		}
		if (kqr->kqr_override_index < qos) {
			kqr->kqr_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->kqr_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr->kqr_thread;
	boolean_t qos_changed = FALSE;
	kq_index_t new_owner_override = kqworkloop_owner_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index,
		    (kqr->kqr_override_index << 16) | kqr->kqr_state);
#endif
		if (new_owner_override == old_owner_override) {
			/* nothing to do */
		} else if (old_owner_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_ipc_override(kqwl_owner, new_owner_override);
		} else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_ipc_override(kqwl_owner);
		} else { /* old_owner_override != new_owner_override */
			thread_update_ipc_override(kqwl_owner, new_owner_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */
	if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */
		if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}
			kqueue_threadreq_initiate(kq, kqr, new_owner_override,
			    initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */
		struct uthread *ut = get_bsdthread_info(servicer);
		if (ut->uu_kqueue_override != kqr->kqr_override_index) {
			if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
				thread_add_ipc_override(servicer, kqr->kqr_override_index);
			} else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) {
				thread_drop_ipc_override(servicer);
			} else { /* ut->uu_kqueue_override != kqr->kqr_override_index */
				thread_update_ipc_override(servicer, kqr->kqr_override_index);
			}
			ut->uu_kqueue_override = kqr->kqr_override_index;
			qos_changed = TRUE;
		}
	} else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_owner_override != new_owner_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request
		 */
		kqueue_threadreq_modify(kq, kqr, new_owner_override);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(kqr->kqr_thread), kqr->kqr_qos_index,
		    (kqr->kqr_override_index << 16) | kqr->kqr_state);
	}
}
static void
kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
{
	/* convert to thread qos value */
	assert(qos_index < KQWL_NBUCKETS);

	kq_req_lock(kqwl);
	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
	kq_req_unlock(kqwl);
}
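
/*
 * kqueue_get_queue - return the knote queue (bucket) for a given QoS
 * index, asserting the index is valid for this flavor of kqueue.
 */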
static struct kqtailq *
kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index)
{
	if (kq->kq_state & KQ_WORKQ) {
		assert(qos_index < KQWQ_NBUCKETS);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		assert(qos_index < KQWL_NBUCKETS);
	} else {
		assert(qos_index == QOS_INDEX_KQFILE);
	}
	static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
	    "struct kqueue::kq_queue must be exactly at the end");
	return &kq->kq_queue[qos_index];
}
static int
kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
{
	return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index));
}
static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
{
	if (kq.kq->kq_state & KQ_WORKQ) {
		return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed;
	} else if (kq.kq->kq_state & KQ_WORKLOOP) {
		return &kq.kqwl->kqwl_request.kqr_suppressed;
	} else {
		return &kq.kqf->kqf_suppressed;
	}
}
static struct turnstile *
kqueue_get_turnstile(kqueue_t kqu, bool can_alloc)
{
	uint8_t kqr_state;

	if ((kqu.kq->kq_state & KQ_WORKLOOP) == 0) {
		return TURNSTILE_NULL;
	}

	kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed);
	if (kqr_state & KQR_ALLOCATED_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile,
		           kqr_state);
	}

	if (!can_alloc) {
		return TURNSTILE_NULL;
	}

	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;

	kq_req_lock(kqu.kqwl);
	if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
		workq_kern_threadreq_lock(kqu.kqwl->kqwl_p);
	}

	if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
		free_ts = ts;
		ts = kqu.kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state,
		    KQR_ALLOCATED_TURNSTILE, release);
	}

	if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
		workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p);
	}
	kq_req_unlock(kqu.kqwl);

	if (free_ts) {
		turnstile_deallocate(free_ts);
	}
	return ts;
}
struct turnstile *
kqueue_turnstile(struct kqueue *kq)
{
	return kqueue_get_turnstile(kq, false);
}

struct turnstile *
kqueue_alloc_turnstile(struct kqueue *kq)
{
	return kqueue_get_turnstile(kq, true);
}
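
/*
 * knote_get_queue - queue (bucket) this knote is enqueued on, derived
 * from its kqueue and its in-use QoS index.
 */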
static struct kqtailq *
knote_get_queue(struct knote *kn)
{
	return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index);
}
static void
knote_reset_priority(struct knote *kn, pthread_priority_t pp)
{
	struct kqueue *kq = knote_get_kq(kn);
	kq_index_t qos = _pthread_priority_thread_qos(pp);

	assert((kn->kn_status & KN_QUEUED) == 0);

	if (kq->kq_state & KQ_WORKQ) {
		if (qos == THREAD_QOS_UNSPECIFIED) {
			/* On workqueues, outside of QoS means MANAGER */
			qos = KQWQ_QOS_MANAGER;
			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		} else {
			pp = _pthread_priority_normalize(pp);
		}
	} else if (kq->kq_state & KQ_WORKLOOP) {
		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
		pp = _pthread_priority_normalize(pp);
	} else {
		pp = _pthread_unspecified_priority();
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos = pp;
	kn->kn_req_index = qos;

	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
		/* Never lower QoS when in "Merge" mode */
		kn->kn_qos_override = qos;
	}

	/* only adjust in-use qos index when not suppressed */
	if ((kn->kn_status & KN_SUPPRESSED) == 0) {
		kn->kn_qos_index = qos;
	} else if (kq->kq_state & KQ_WORKQ) {
		kqworkq_update_override((struct kqworkq *)kq, kn, qos);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		kqworkloop_update_override((struct kqworkloop *)kq, qos);
	}
}
6735 knote_set_qos_overcommit(struct knote
*kn
)
6737 struct kqueue
*kq
= knote_get_kq(kn
);
6739 /* turn overcommit on for the appropriate thread request? */
6740 if ((kn
->kn_qos
& _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
) &&
6741 (kq
->kq_state
& KQ_WORKLOOP
)) {
6742 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
6743 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
6746 * This test is racy, but since we never remove this bit,
6747 * it allows us to avoid taking a lock.
6749 if (kqr
->kqr_state
& KQR_THOVERCOMMIT
) {
6754 kqr
->kqr_state
|= KQR_THOVERCOMMIT
;
6755 if (!kqr
->kqr_thread
&& (kqr
->kqr_state
& KQR_THREQUESTED
)) {
6756 kqueue_threadreq_modify(kq
, kqr
, kqr
->kqr_req
.tr_qos
);
6758 kq_req_unlock(kqwl
);
static kq_index_t
knote_get_qos_override_index(struct knote *kn)
{
	return kn->kn_qos_override;
}
6769 kqworkq_update_override(struct kqworkq
*kqwq
, struct knote
*kn
,
6770 kq_index_t override_index
)
6772 struct kqrequest
*kqr
;
6773 kq_index_t old_override_index
;
6774 kq_index_t queue_index
= kn
->kn_qos_index
;
6776 if (override_index
<= queue_index
) {
6780 kqr
= kqworkq_get_request(kqwq
, queue_index
);
6783 old_override_index
= kqr
->kqr_override_index
;
6784 if (override_index
> MAX(kqr
->kqr_qos_index
, old_override_index
)) {
6785 kqr
->kqr_override_index
= override_index
;
6787 /* apply the override to [incoming?] servicing thread */
6788 if (kqr
->kqr_thread
) {
6789 if (old_override_index
) {
6790 thread_update_ipc_override(kqr
->kqr_thread
, override_index
);
6792 thread_add_ipc_override(kqr
->kqr_thread
, override_index
);
6796 kq_req_unlock(kqwq
);
6800 kqworkloop_update_override(struct kqworkloop
*kqwl
, kq_index_t override_index
)
6803 kqworkloop_update_threads_qos(kqwl
, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE
,
6805 kq_req_unlock(kqwl
);
6809 kqworkloop_unbind_locked(struct kqworkloop
*kqwl
, thread_t thread
)
6811 struct uthread
*ut
= get_bsdthread_info(thread
);
6812 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
6813 kq_index_t ipc_override
= ut
->uu_kqueue_override
;
6815 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND
), kqwl
->kqwl_dynamicid
,
6816 thread_tid(thread
), 0, 0);
6819 assert(ut
->uu_kqr_bound
== kqr
);
6820 ut
->uu_kqr_bound
= NULL
;
6821 ut
->uu_kqueue_override
= THREAD_QOS_UNSPECIFIED
;
6823 if (kqwl
->kqwl_owner
== NULL
&& kqwl
->kqwl_turnstile
) {
6824 turnstile_update_inheritor(kqwl
->kqwl_turnstile
,
6825 TURNSTILE_INHERITOR_NULL
, TURNSTILE_IMMEDIATE_UPDATE
);
6826 turnstile_update_inheritor_complete(kqwl
->kqwl_turnstile
,
6827 TURNSTILE_INTERLOCK_HELD
);
6830 kqr
->kqr_thread
= NULL
;
6831 kqr
->kqr_state
&= ~(KQR_THREQUESTED
| KQR_R2K_NOTIF_ARMED
);
6832 return ipc_override
;
6836 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
6838 * It will acknowledge events, and possibly request a new thread if:
6839 * - there were active events left
6840 * - we pended waitq hook callouts during processing
6841 * - we pended wakeups while processing (or unsuppressing)
6843 * Called with kqueue lock held.
6846 kqworkloop_unbind(proc_t p
, struct kqworkloop
*kqwl
)
6848 struct kqueue
*kq
= &kqwl
->kqwl_kqueue
;
6849 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
6850 thread_t thread
= kqr
->kqr_thread
;
6851 int op
= KQWL_UTQ_PARKING
;
6852 kq_index_t ipc_override
, qos_override
= THREAD_QOS_UNSPECIFIED
;
6854 assert(thread
== current_thread());
6859 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
6860 * unsuppressing knotes not to be applied until the eventual call to
6861 * kqworkloop_update_threads_qos() below.
6863 assert((kq
->kq_state
& KQ_PROCESSING
) == 0);
6864 if (!TAILQ_EMPTY(&kqr
->kqr_suppressed
)) {
6865 kq
->kq_state
|= KQ_PROCESSING
;
6866 qos_override
= kqworkloop_acknowledge_events(kqwl
);
6867 kq
->kq_state
&= ~KQ_PROCESSING
;
6872 ipc_override
= kqworkloop_unbind_locked(kqwl
, thread
);
6873 kqworkloop_update_threads_qos(kqwl
, op
, qos_override
);
6875 kq_req_unlock(kqwl
);
6880 * Drop the override on the current thread last, after the call to
6881 * kqworkloop_update_threads_qos above.
6884 thread_drop_ipc_override(thread
);
6887 /* If last reference, dealloc the workloop kq */
6888 kqueue_release_last(p
, kqwl
);
6892 kqworkq_unbind_locked(__assert_only
struct kqworkq
*kqwq
,
6893 struct kqrequest
*kqr
, thread_t thread
)
6895 struct uthread
*ut
= get_bsdthread_info(thread
);
6896 kq_index_t old_override
= kqr
->kqr_override_index
;
6898 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND
), -1,
6899 thread_tid(kqr
->kqr_thread
), kqr
->kqr_qos_index
, 0);
6902 assert(ut
->uu_kqr_bound
== kqr
);
6903 ut
->uu_kqr_bound
= NULL
;
6904 kqr
->kqr_thread
= NULL
;
6905 kqr
->kqr_state
&= ~(KQR_THREQUESTED
| KQR_R2K_NOTIF_ARMED
);
6906 kqr
->kqr_override_index
= THREAD_QOS_UNSPECIFIED
;
6908 return old_override
;
6912 * kqworkq_unbind - unbind of a workq kqueue from a thread
6914 * We may have to request new threads.
6915 * This can happen there are no waiting processing threads and:
6916 * - there were active events we never got to (count > 0)
6917 * - we pended waitq hook callouts during processing
6918 * - we pended wakeups while processing (or unsuppressing)
6921 kqworkq_unbind(proc_t p
, struct kqrequest
*kqr
)
6923 struct kqworkq
*kqwq
= (struct kqworkq
*)p
->p_fd
->fd_wqkqueue
;
6924 __assert_only
int rc
;
6927 rc
= kqworkq_acknowledge_events(kqwq
, kqr
, 0, KQWQAE_UNBIND
);
static struct kqrequest *
kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
{
	assert(qos_index < KQWQ_NBUCKETS);
	return &kqwq->kqwq_request[qos_index];
}
6940 knote_apply_qos_override(struct knote
*kn
, kq_index_t qos_index
)
6942 assert((kn
->kn_status
& KN_QUEUED
) == 0);
6944 kn
->kn_qos_override
= qos_index
;
6946 if (kn
->kn_status
& KN_SUPPRESSED
) {
6947 struct kqueue
*kq
= knote_get_kq(kn
);
6949 * For suppressed events, the kn_qos_index field cannot be touched as it
6950 * allows us to know on which supress queue the knote is for a kqworkq.
6952 * Also, there's no natural push applied on the kqueues when this field
6953 * changes anyway. We hence need to apply manual overrides in this case,
6954 * which will be cleared when the events are later acknowledged.
6956 if (kq
->kq_state
& KQ_WORKQ
) {
6957 kqworkq_update_override((struct kqworkq
*)kq
, kn
, qos_index
);
6959 kqworkloop_update_override((struct kqworkloop
*)kq
, qos_index
);
6962 kn
->kn_qos_index
= qos_index
;
6967 knote_should_apply_qos_override(struct kqueue
*kq
, struct knote
*kn
, int result
,
6968 thread_qos_t
*qos_out
)
6970 thread_qos_t qos_index
= (result
>> FILTER_ADJUST_EVENT_QOS_SHIFT
) & 7;
6974 assert(result
& FILTER_ADJUST_EVENT_QOS_BIT
);
6975 assert(qos_index
< THREAD_QOS_LAST
);
6978 * Early exit for knotes that should not change QoS
6980 * It is safe to test kn_req_index against MANAGER / STAYACTIVE because
6981 * knotes with such kn_req_index values never change for their entire
6984 if (__improbable(!knote_fops(kn
)->f_adjusts_qos
)) {
6985 panic("filter %d cannot change QoS", kn
->kn_filtid
);
6986 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
6987 if (kn
->kn_req_index
== KQWL_BUCKET_STAYACTIVE
) {
6990 } else if (kq
->kq_state
& KQ_WORKQ
) {
6991 if (kn
->kn_req_index
== KQWQ_QOS_MANAGER
) {
6999 * knotes with the FALLBACK flag will only use their registration QoS if the
7000 * incoming event has no QoS, else, the registration QoS acts as a floor.
7002 if (kn
->kn_qos
& _PTHREAD_PRIORITY_FALLBACK_FLAG
) {
7003 if (qos_index
== THREAD_QOS_UNSPECIFIED
) {
7004 qos_index
= kn
->kn_req_index
;
7007 if (qos_index
< kn
->kn_req_index
) {
7008 qos_index
= kn
->kn_req_index
;
7011 if ((kn
->kn_status
& KN_MERGE_QOS
) && (qos_index
< kn
->kn_qos_override
)) {
7012 /* Never lower QoS when in "Merge" mode */
7016 if ((kn
->kn_status
& KN_LOCKED
) && kn
->kn_inuse
) {
7018 * When we're trying to update the QoS override and that both an
7019 * f_event() and other f_* calls are running concurrently, any of these
7020 * in flight calls may want to perform overrides that aren't properly
7021 * serialized with each other.
7023 * The first update that observes this racy situation enters a "Merge"
7024 * mode which causes subsequent override requests to saturate the
7025 * override instead of replacing its value.
7027 * This mode is left when knote_unlock() or knote_call_filter_event()
7028 * observe that no other f_* routine is in flight.
7030 kn
->kn_status
|= KN_MERGE_QOS
;
7033 if (kn
->kn_qos_override
== qos_index
) {
7037 *qos_out
= qos_index
;
7042 knote_adjust_qos(struct kqueue
*kq
, struct knote
*kn
, int result
)
7045 if (knote_should_apply_qos_override(kq
, kn
, result
, &qos
)) {
7047 knote_apply_qos_override(kn
, qos
);
7048 if (knote_enqueue(kn
) && (kn
->kn_status
& KN_ACTIVE
)) {
7055 knote_wakeup(struct knote
*kn
)
7057 struct kqueue
*kq
= knote_get_kq(kn
);
7061 if (kq
->kq_state
& KQ_WORKQ
) {
7062 struct kqworkq
*kqwq
= (struct kqworkq
*)kq
;
7064 kqworkq_request_help(kqwq
, kn
->kn_qos_index
);
7065 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
7066 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
7069 * kqworkloop_end_processing() will perform the required QoS
7070 * computations when it unsets the processing mode.
7072 if (!kqworkloop_is_processing_on_current_thread(kqwl
)) {
7073 kqworkloop_request_help(kqwl
, kn
->kn_qos_index
);
7076 struct kqfile
*kqf
= (struct kqfile
*)kq
;
7078 /* flag wakeups during processing */
7079 if (kq
->kq_state
& KQ_PROCESSING
) {
7080 kq
->kq_state
|= KQ_WAKEUP
;
7083 /* wakeup a thread waiting on this queue */
7084 if (kq
->kq_state
& (KQ_SLEEP
| KQ_SEL
)) {
7085 kq
->kq_state
&= ~(KQ_SLEEP
| KQ_SEL
);
7086 waitq_wakeup64_all((struct waitq
*)&kq
->kq_wqs
, KQ_EVENT
,
7087 THREAD_AWAKENED
, WAITQ_ALL_PRIORITIES
);
7090 /* wakeup other kqueues/select sets we're inside */
7091 KNOTE(&kqf
->kqf_sel
.si_note
, 0);
7096 * Called with the kqueue locked
7099 kqueue_interrupt(struct kqueue
*kq
)
7101 assert((kq
->kq_state
& KQ_WORKQ
) == 0);
7103 /* wakeup sleeping threads */
7104 if ((kq
->kq_state
& (KQ_SLEEP
| KQ_SEL
)) != 0) {
7105 kq
->kq_state
&= ~(KQ_SLEEP
| KQ_SEL
);
7106 (void)waitq_wakeup64_all((struct waitq
*)&kq
->kq_wqs
,
7109 WAITQ_ALL_PRIORITIES
);
7112 /* wakeup threads waiting their turn to process */
7113 if (kq
->kq_state
& KQ_PROCWAIT
) {
7114 struct kqtailq
*suppressq
;
7116 assert(kq
->kq_state
& KQ_PROCESSING
);
7118 kq
->kq_state
&= ~KQ_PROCWAIT
;
7119 suppressq
= kqueue_get_suppressed_queue(kq
, NULL
);
7120 (void)waitq_wakeup64_all((struct waitq
*)&kq
->kq_wqs
,
7121 CAST_EVENT64_T(suppressq
),
7123 WAITQ_ALL_PRIORITIES
);
7128 * Called back from waitq code when no threads waiting and the hook was set.
7130 * Interrupts are likely disabled and spin locks are held - minimal work
7131 * can be done in this context!!!
7133 * JMM - in the future, this will try to determine which knotes match the
7134 * wait queue wakeup and apply these wakeups against those knotes themselves.
7135 * For now, all the events dispatched this way are dispatch-manager handled,
7136 * so hard-code that for now.
7139 waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook
, void *knote_hook
, int qos
)
7141 #pragma unused(knote_hook, qos)
7143 struct kqueue
*kq
= (struct kqueue
*)kq_hook
;
7145 if (kq
->kq_state
& KQ_WORKQ
) {
7146 struct kqworkq
*kqwq
= (struct kqworkq
*)kq
;
7148 kqworkq_request_help(kqwq
, KQWQ_QOS_MANAGER
);
7149 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
7150 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
7152 kqworkloop_request_help(kqwl
, KQWL_BUCKET_STAYACTIVE
);
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}


/*
 * Query/Post each knote in the object's list
 *
 *	The object lock protects the list. It is assumed
 *	that the filter/event routine for the object can
 *	determine that the object is already locked (via
 *	the hint) and not deadlock itself.
 *
 *	The object lock should also hold off pending
 *	detach/drop operations.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_call_filter_event(kq, kn, hint);
		kqunlock(kq);
	}
}
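/*
 * Subsystems that own a klist normally post through the KNOTE() wrapper from
 * <sys/event.h> while holding their own lock; the hint is handed to each
 * registered filter's f_event routine via knote_call_filter_event().  A
 * hedged sketch of a hypothetical driver doing this (the mydev_* names are
 * invented; KNOTE(), klist_init() and struct klist are the real interfaces):
 *
 *	struct mydev_softc {
 *		lck_mtx_t    md_lock;
 *		struct klist md_note;     // set up once with klist_init(&sc->md_note)
 *		int          md_bytes;    // state the filter's f_event will inspect
 *	};
 *
 *	static void
 *	mydev_data_arrived(struct mydev_softc *sc, int nbytes)
 *	{
 *		lck_mtx_lock(&sc->md_lock);
 *		sc->md_bytes += nbytes;
 *		// every registered knote gets f_event(kn, nbytes); the hint lets
 *		// the filter know md_lock is already held.
 *		KNOTE(&sc->md_note, nbytes);
 *		lck_mtx_unlock(&sc->md_lock);
 *	}
 */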
/*
 * attach a knote to the specified list.  Return true if this is the first entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_attach(struct klist *list, struct knote *kn)
{
	int ret = SLIST_EMPTY(list);
	SLIST_INSERT_HEAD(list, kn, kn_selnext);
	return ret;
}

/*
 * detach a knote from the specified list.  Return true if that was the last entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_detach(struct klist *list, struct knote *kn)
{
	SLIST_REMOVE(list, kn, knote, kn_selnext);
	return SLIST_EMPTY(list);
}
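/*
 * Filters typically reach these helpers through the KNOTE_ATTACH()/
 * KNOTE_DETACH() wrappers from their f_attach/f_detach routines, using the
 * boolean result to arm the event source on the first registration and
 * disarm it on the last.  A hedged sketch continuing the hypothetical mydev
 * driver above (mydev_lookup() is invented):
 *
 *	static int
 *	filt_mydevattach(struct knote *kn, struct kevent_internal_s *kev)
 *	{
 *		struct mydev_softc *sc = mydev_lookup(kn->kn_id);
 *
 *		lck_mtx_lock(&sc->md_lock);
 *		if (KNOTE_ATTACH(&sc->md_note, kn)) {
 *			// first listener: a real driver might enable interrupts here
 *		}
 *		lck_mtx_unlock(&sc->md_lock);
 *		return 0;                 // not already active
 *	}
 *
 *	static void
 *	filt_mydevdetach(struct knote *kn)
 *	{
 *		struct mydev_softc *sc = mydev_lookup(kn->kn_id);
 *
 *		lck_mtx_lock(&sc->md_lock);
 *		if (KNOTE_DETACH(&sc->md_note, kn)) {
 *			// last listener went away: quiesce the source
 *		}
 *		lck_mtx_unlock(&sc->md_lock);
 *	}
 */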
7211 * knote_vanish - Indicate that the source has vanished
7213 * If the knote has requested EV_VANISHED delivery,
7214 * arrange for that. Otherwise, deliver a NOTE_REVOKE
7215 * event for backward compatibility.
7217 * The knote is marked as having vanished, but is not
7218 * actually detached from the source in this instance.
7219 * The actual detach is deferred until the knote drop.
7221 * Our caller already has the object lock held. Calling
7222 * the detach routine would try to take that lock
7223 * recursively - which likely is not supported.
7226 knote_vanish(struct klist
*list
, bool make_active
)
7229 struct knote
*kn_next
;
7231 SLIST_FOREACH_SAFE(kn
, list
, kn_selnext
, kn_next
) {
7232 struct kqueue
*kq
= knote_get_kq(kn
);
7235 if (__probable(kn
->kn_status
& KN_REQVANISH
)) {
7237 * If EV_VANISH supported - prepare to deliver one
7239 kn
->kn_status
|= KN_VANISHED
;
7242 * Handle the legacy way to indicate that the port/portset was
7243 * deallocated or left the current Mach portspace (modern technique
7244 * is with an EV_VANISHED protocol).
7246 * Deliver an EV_EOF event for these changes (hopefully it will get
7247 * delivered before the port name recycles to the same generation
7248 * count and someone tries to re-register a kevent for it or the
7249 * events are udata-specific - avoiding a conflict).
7251 kn
->kn_flags
|= EV_EOF
| EV_ONESHOT
;
/*
 * Force a lazy allocation of the waitqset link
 * of the kq_wqs associated with the kn
 * if it wasn't already allocated.
 *
 * This allows knote_link_waitq to never block
 * if reserved_link is not NULL.
 */
void
knote_link_waitqset_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	waitq_set_lazy_init_link(&kq->kq_wqs);
}

/*
 * Check if a lazy allocation for the waitqset link
 * of the kq_wqs is needed.
 */
boolean_t
knote_link_waitqset_should_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	return waitq_set_should_lazy_init_link(&kq->kq_wqs);
}
/*
 * For a given knote, link a provided wait queue directly with the kqueue.
 * Wakeups will happen via recursive wait queue support. But nothing will move
 * the knote to the active list at wakeup (nothing calls knote()).  Instead,
 * we permanently enqueue them here.
 *
 * kqueue and knote references are held by caller.
 * waitq locked by caller.
 *
 * caller provides the wait queue link structure and insures that the kq->kq_wqs
 * is linked by previously calling knote_link_waitqset_lazy_alloc.
 */
int
knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
	if (kr == KERN_SUCCESS) {
		knote_markstayactive(kn);
		return 0;
	} else {
		return EINVAL;
	}
}
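/*
 * The expected calling sequence reserves the waitq link and pre-initializes
 * the kqueue's waitq-set link before the waitq interlock is taken, so the
 * call above never has to block.  A hedged sketch of that sequence in a
 * hypothetical attach path (waitq_link_reserve()/waitq_link_release() and
 * waitq_lock()/waitq_unlock() are existing waitq interfaces; the surrounding
 * names are invented):
 *
 *	static int
 *	mydev_link_knote(struct knote *kn, struct waitq *dev_wq)
 *	{
 *		uint64_t reserved_link;
 *		int error;
 *
 *		// 1. make sure the kqueue side can be linked without blocking
 *		if (knote_link_waitqset_should_lazy_alloc(kn)) {
 *			knote_link_waitqset_lazy_alloc(kn);
 *		}
 *
 *		// 2. reserve the link object before taking the interlock
 *		reserved_link = waitq_link_reserve(dev_wq);
 *
 *		waitq_lock(dev_wq);
 *		error = knote_link_waitq(kn, dev_wq, &reserved_link);
 *		waitq_unlock(dev_wq);
 *
 *		// 3. give back the reservation if it was not consumed
 *		waitq_link_release(reserved_link);
 *		return error;
 *	}
 */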
/*
 * Unlink the provided wait queue from the kqueue associated with a knote.
 * Also remove it from the magic list of directly attached knotes.
 *
 * Note that the unlink may have already happened from the other side, so
 * ignore any failures to unlink and just remove it from the kqueue list.
 *
 * On success, caller is responsible for the link structure
 */
int
knote_unlink_waitq(struct knote *kn, struct waitq *wq)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_unlink(wq, &kq->kq_wqs);
	knote_clearstayactive(kn);
	return (kr != KERN_SUCCESS) ? EINVAL : 0;
}
7334 * remove all knotes referencing a specified fd
7336 * Entered with the proc_fd lock already held.
7337 * It returns the same way, but may drop it temporarily.
7340 knote_fdclose(struct proc
*p
, int fd
)
7344 KNOTE_LOCK_CTX(knlc
);
7347 list
= &p
->p_fd
->fd_knlist
[fd
];
7348 SLIST_FOREACH(kn
, list
, kn_link
) {
7349 struct kqueue
*kq
= knote_get_kq(kn
);
7353 if (kq
->kq_p
!= p
) {
7354 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
7355 __func__
, kq
->kq_p
, p
);
7359 * If the knote supports EV_VANISHED delivery,
7360 * transition it to vanished mode (or skip over
7361 * it if already vanished).
7363 if (kn
->kn_status
& KN_VANISHED
) {
7369 if (!knote_lock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
7370 /* the knote was dropped by someone, nothing to do */
7371 } else if (kn
->kn_status
& KN_REQVANISH
) {
7372 kn
->kn_status
|= KN_VANISHED
;
7373 kn
->kn_status
&= ~KN_ATTACHED
;
7376 knote_fops(kn
)->f_detach(kn
);
7377 if (knote_fops(kn
)->f_isfd
) {
7378 fp_drop(p
, kn
->kn_id
, kn
->kn_fp
, 0);
7383 knote_unlock(kq
, kn
, &knlc
, KNOTE_KQ_UNLOCK
);
7385 knote_drop(kq
, kn
, &knlc
);
/*
 * knote_fdfind - lookup a knote in the fd table for process
 *
 *	If the filter is file-based, lookup based on fd index.
 *	Otherwise use a hash based on the ident.
 *
 *	Matching is based on kq, filter, and ident. Optionally,
 *	it may also be based on the udata field in the kevent -
 *	allowing multiple event registration for the file object
 *	per kqueue.
 *
 *	fd_knhashlock or fdlock held on entry (and exit)
 */
static struct knote *
knote_fdfind(struct kqueue *kq,
    struct kevent_internal_s *kev,
    bool is_fd,
    struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	struct knote *kn = NULL;

	/*
	 * determine where to look for the knote
	 */
	if (is_fd) {
		/* fd-based knotes are linked off the fd table */
		if (kev->ident < (u_int)fdp->fd_knlistsize) {
			list = &fdp->fd_knlist[kev->ident];
		}
	} else if (fdp->fd_knhashmask != 0) {
		/* hash non-fd knotes here too */
		list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
	}

	/*
	 * scan the selected list looking for a match
	 */
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kq == knote_get_kq(kn) &&
			    kev->ident == kn->kn_id &&
			    kev->filter == kn->kn_filter) {
				if (kev->flags & EV_UDATA_SPECIFIC) {
					if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
					    kev->udata == kn->kn_udata) {
						break;	/* matching udata-specific knote */
					}
				} else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
					break;	/* matching non-udata-specific knote */
				}
			}
		}
	}
	return kn;
}
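/*
 * The EV_UDATA_SPECIFIC branch above is what lets user space keep several
 * otherwise-identical registrations (same kqueue, ident and filter) alive at
 * once, distinguished only by udata.  A hedged user-space sketch; the flag
 * must accompany every later operation on the same knote:
 *
 *	#include <sys/event.h>
 *
 *	static void
 *	register_two(int kq, int fd)
 *	{
 *		struct kevent64_s kev[2];
 *
 *		EV_SET64(&kev[0], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *		    0, 0, 0x1111, 0, 0);              // udata 0x1111
 *		EV_SET64(&kev[1], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *		    0, 0, 0x2222, 0, 0);              // udata 0x2222
 *		(void)kevent64(kq, kev, 2, NULL, 0, 0, NULL);
 *
 *		// deleting the first one later must name the same udata:
 *		//   EV_SET64(&kev[0], fd, EVFILT_READ,
 *		//       EV_DELETE | EV_UDATA_SPECIFIC, 0, 0, 0x1111, 0, 0);
 *	}
 */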
7452 * kq_add_knote- Add knote to the fd table for process
7453 * while checking for duplicates.
7455 * All file-based filters associate a list of knotes by file
7456 * descriptor index. All other filters hash the knote by ident.
7458 * May have to grow the table of knote lists to cover the
7459 * file descriptor index presented.
7461 * fd_knhashlock and fdlock unheld on entry (and exit).
7463 * Takes a rwlock boost if inserting the knote is successful.
7466 kq_add_knote(struct kqueue
*kq
, struct knote
*kn
, struct knote_lock_ctx
*knlc
,
7469 struct filedesc
*fdp
= p
->p_fd
;
7470 struct klist
*list
= NULL
;
7472 bool is_fd
= knote_fops(kn
)->f_isfd
;
7480 if (knote_fdfind(kq
, &kn
->kn_kevent
, is_fd
, p
) != NULL
) {
7481 /* found an existing knote: we can't add this one */
7486 /* knote was not found: add it now */
7488 if (fdp
->fd_knhashmask
== 0) {
7491 list
= hashinit(CONFIG_KN_HASHSIZE
, M_KQUEUE
, &size
);
7497 fdp
->fd_knhash
= list
;
7498 fdp
->fd_knhashmask
= size
;
7501 list
= &fdp
->fd_knhash
[KN_HASH(kn
->kn_id
, fdp
->fd_knhashmask
)];
7502 SLIST_INSERT_HEAD(list
, kn
, kn_link
);
7506 /* knote is fd based */
7508 if ((u_int
)fdp
->fd_knlistsize
<= kn
->kn_id
) {
7511 if (kn
->kn_id
>= (uint64_t)p
->p_rlimit
[RLIMIT_NOFILE
].rlim_cur
7512 || kn
->kn_id
>= (uint64_t)maxfiles
) {
7516 /* have to grow the fd_knlist */
7517 size
= fdp
->fd_knlistsize
;
7518 while (size
<= kn
->kn_id
) {
7522 if (size
>= (UINT_MAX
/ sizeof(struct klist
*))) {
7527 MALLOC(list
, struct klist
*,
7528 size
* sizeof(struct klist
*), M_KQUEUE
, M_WAITOK
);
7534 bcopy((caddr_t
)fdp
->fd_knlist
, (caddr_t
)list
,
7535 fdp
->fd_knlistsize
* sizeof(struct klist
*));
7536 bzero((caddr_t
)list
+
7537 fdp
->fd_knlistsize
* sizeof(struct klist
*),
7538 (size
- fdp
->fd_knlistsize
) * sizeof(struct klist
*));
7539 FREE(fdp
->fd_knlist
, M_KQUEUE
);
7540 fdp
->fd_knlist
= list
;
7541 fdp
->fd_knlistsize
= size
;
7544 list
= &fdp
->fd_knlist
[kn
->kn_id
];
7545 SLIST_INSERT_HEAD(list
, kn
, kn_link
);
7553 assert((kn
->kn_status
& KN_LOCKED
) == 0);
7554 (void)knote_lock(kq
, kn
, knlc
, KNOTE_KQ_UNLOCK
);
7566 * kq_remove_knote - remove a knote from the fd table for process
7568 * If the filter is file-based, remove based on fd index.
7569 * Otherwise remove from the hash based on the ident.
7571 * fd_knhashlock and fdlock unheld on entry (and exit).
7574 kq_remove_knote(struct kqueue
*kq
, struct knote
*kn
, struct proc
*p
,
7575 struct knote_lock_ctx
*knlc
)
7577 struct filedesc
*fdp
= p
->p_fd
;
7578 struct klist
*list
= NULL
;
7582 is_fd
= knote_fops(kn
)->f_isfd
;
7591 assert((u_int
)fdp
->fd_knlistsize
> kn
->kn_id
);
7592 list
= &fdp
->fd_knlist
[kn
->kn_id
];
7594 list
= &fdp
->fd_knhash
[KN_HASH(kn
->kn_id
, fdp
->fd_knhashmask
)];
7596 SLIST_REMOVE(list
, kn
, knote
, kn_link
);
7599 kq_state
= kq
->kq_state
;
7601 knote_unlock_cancel(kq
, kn
, knlc
, KNOTE_KQ_UNLOCK
);
7611 if (kq_state
& KQ_DYNAMIC
) {
7612 kqueue_release_last(p
, kq
);
7617 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
7618 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
7620 * fd_knhashlock or fdlock unheld on entry (and exit)
7623 static struct knote
*
7624 kq_find_knote_and_kq_lock(struct kqueue
*kq
, struct kevent_internal_s
*kev
,
7625 bool is_fd
, struct proc
*p
)
7635 ret
= knote_fdfind(kq
, kev
, is_fd
, p
);
7650 * knote_drop - disconnect and drop the knote
7652 * Called with the kqueue locked, returns with the kqueue unlocked.
7654 * If a knote locking context is passed, it is canceled.
7656 * The knote may have already been detached from
7657 * (or not yet attached to) its source object.
7660 knote_drop(struct kqueue
*kq
, struct knote
*kn
, struct knote_lock_ctx
*knlc
)
7662 struct proc
*p
= kq
->kq_p
;
7666 assert((kn
->kn_status
& KN_DROPPING
) == 0);
7668 assert((kn
->kn_status
& KN_LOCKED
) == 0);
7670 kn
->kn_status
|= KN_DROPPING
;
7672 knote_unsuppress(kn
);
7674 knote_wait_for_filter_events(kq
, kn
);
7676 /* If we are attached, disconnect from the source first */
7677 if (kn
->kn_status
& KN_ATTACHED
) {
7678 knote_fops(kn
)->f_detach(kn
);
7681 /* kq may be freed when kq_remove_knote() returns */
7682 kq_remove_knote(kq
, kn
, p
, knlc
);
7683 if (knote_fops(kn
)->f_isfd
&& ((kn
->kn_status
& KN_VANISHED
) == 0)) {
7684 fp_drop(p
, kn
->kn_id
, kn
->kn_fp
, 0);
7690 /* called with kqueue lock held */
7692 knote_activate(struct knote
*kn
)
7694 if (kn
->kn_status
& KN_ACTIVE
) {
7698 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE
),
7699 kn
->kn_udata
, kn
->kn_status
| (kn
->kn_id
<< 32),
7702 kn
->kn_status
|= KN_ACTIVE
;
7703 if (knote_enqueue(kn
)) {
7708 /* called with kqueue lock held */
7710 knote_deactivate(struct knote
*kn
)
7712 kn
->kn_status
&= ~KN_ACTIVE
;
7713 if ((kn
->kn_status
& KN_STAYACTIVE
) == 0) {
7718 /* called with kqueue lock held */
7720 knote_enable(struct knote
*kn
)
7722 if ((kn
->kn_status
& KN_DISABLED
) == 0) {
7726 kn
->kn_status
&= ~KN_DISABLED
;
7728 if (kn
->kn_status
& KN_SUPPRESSED
) {
7730 * it is possible for userland to have knotes registered for a given
7731 * workloop `wl_orig` but really handled on another workloop `wl_new`.
7733 * In that case, rearming will happen from the servicer thread of
7734 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
7735 * this knote to stay suppressed forever if we only relied on
7736 * kqworkloop_acknowledge_events to be called by `wl_orig`.
7738 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
7739 * unsuppress because that would mess with the processing phase of
7740 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
7743 struct kqueue
*kq
= knote_get_kq(kn
);
7744 if ((kq
->kq_state
& KQ_PROCESSING
) == 0) {
7745 knote_unsuppress(kn
);
7747 } else if (knote_enqueue(kn
)) {
7752 /* called with kqueue lock held */
7754 knote_disable(struct knote
*kn
)
7756 if (kn
->kn_status
& KN_DISABLED
) {
7760 kn
->kn_status
|= KN_DISABLED
;
7764 /* called with kqueue lock held */
7766 knote_suppress(struct knote
*kn
)
7768 struct kqtailq
*suppressq
;
7769 struct kqueue
*kq
= knote_get_kq(kn
);
7773 if (kn
->kn_status
& KN_SUPPRESSED
) {
7778 kn
->kn_status
|= KN_SUPPRESSED
;
7779 suppressq
= kqueue_get_suppressed_queue(kq
, kn
);
7780 TAILQ_INSERT_TAIL(suppressq
, kn
, kn_tqe
);
7783 /* called with kqueue lock held */
7785 knote_unsuppress(struct knote
*kn
)
7787 struct kqtailq
*suppressq
;
7788 struct kqueue
*kq
= knote_get_kq(kn
);
7792 if ((kn
->kn_status
& KN_SUPPRESSED
) == 0) {
7796 kn
->kn_status
&= ~KN_SUPPRESSED
;
7797 suppressq
= kqueue_get_suppressed_queue(kq
, kn
);
7798 TAILQ_REMOVE(suppressq
, kn
, kn_tqe
);
7801 * If the knote is no longer active, reset its push,
7802 * and resynchronize kn_qos_index with kn_qos_override
7804 if ((kn
->kn_status
& KN_ACTIVE
) == 0) {
7805 kn
->kn_qos_override
= kn
->kn_req_index
;
7807 kn
->kn_qos_index
= kn
->kn_qos_override
;
7809 /* don't wakeup if unsuppressing just a stay-active knote */
7810 if (knote_enqueue(kn
) && (kn
->kn_status
& KN_ACTIVE
)) {
7814 if ((kq
->kq_state
& KQ_WORKLOOP
) && TAILQ_EMPTY(suppressq
)) {
7815 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
7817 if (kqworkloop_is_processing_on_current_thread(kqwl
)) {
7819 * kqworkloop_end_processing() or kqworkloop_begin_processing()
7820 * will perform the required QoS computations when it unsets the
7825 kqworkloop_update_threads_qos(kqwl
, KQWL_UTQ_RESET_WAKEUP_OVERRIDE
, 0);
7826 kq_req_unlock(kqwl
);
7831 /* called with kqueue lock held */
7833 knote_enqueue(struct knote
*kn
)
7835 if ((kn
->kn_status
& (KN_ACTIVE
| KN_STAYACTIVE
)) == 0 ||
7836 (kn
->kn_status
& (KN_DISABLED
| KN_SUPPRESSED
| KN_DROPPING
))) {
7840 if ((kn
->kn_status
& KN_QUEUED
) == 0) {
7841 struct kqtailq
*queue
= knote_get_queue(kn
);
7842 struct kqueue
*kq
= knote_get_kq(kn
);
7845 TAILQ_INSERT_TAIL(queue
, kn
, kn_tqe
);
7846 kn
->kn_status
|= KN_QUEUED
;
7850 return (kn
->kn_status
& KN_STAYACTIVE
) != 0;
7854 /* called with kqueue lock held */
7856 knote_dequeue(struct knote
*kn
)
7858 struct kqueue
*kq
= knote_get_kq(kn
);
7859 struct kqtailq
*queue
;
7863 if ((kn
->kn_status
& KN_QUEUED
) == 0) {
7867 queue
= knote_get_queue(kn
);
7868 TAILQ_REMOVE(queue
, kn
, kn_tqe
);
7869 kn
->kn_status
&= ~KN_QUEUED
;
void
knote_init(void)
{
	knote_zone = zinit(sizeof(struct knote), 8192 * sizeof(struct knote),
	    8192, "knote zone");

	kqfile_zone = zinit(sizeof(struct kqfile), 8192 * sizeof(struct kqfile),
	    8192, "kqueue file zone");

	kqworkq_zone = zinit(sizeof(struct kqworkq), 8192 * sizeof(struct kqworkq),
	    8192, "kqueue workq zone");

	kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192 * sizeof(struct kqworkloop),
	    8192, "kqueue workloop zone");

	/* allocate kq lock group attribute and group */
	kq_lck_grp_attr = lck_grp_attr_alloc_init();

	kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);

	/* Allocate kq lock attribute */
	kq_lck_attr = lck_attr_alloc_init();

#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)

const struct filterops *
knote_fops(struct knote *kn)
{
	return sysfilt_ops[kn->kn_filtid];
}

static struct knote *
knote_alloc(void)
{
	struct knote *kn = ((struct knote *)zalloc(knote_zone));
	bzero(kn, sizeof(struct knote));
	return kn;
}

static void
knote_free(struct knote *kn)
{
	assert(kn->kn_inuse == 0);
	assert((kn->kn_status & KN_LOCKED) == 0);
	zfree(knote_zone, kn);
}
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/kern_event.h>
#include <sys/malloc.h>
#include <sys/sys_domain.h>
#include <sys/syslog.h>

#define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))

#define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
7944 static lck_grp_attr_t
*kev_lck_grp_attr
;
7945 static lck_attr_t
*kev_lck_attr
;
7946 static lck_grp_t
*kev_lck_grp
;
7947 static decl_lck_rw_data(, kev_lck_data
);
7948 static lck_rw_t
*kev_rwlock
= &kev_lck_data
;
7950 static int kev_attach(struct socket
*so
, int proto
, struct proc
*p
);
7951 static int kev_detach(struct socket
*so
);
7952 static int kev_control(struct socket
*so
, u_long cmd
, caddr_t data
,
7953 struct ifnet
*ifp
, struct proc
*p
);
7954 static lck_mtx_t
* event_getlock(struct socket
*, int);
7955 static int event_lock(struct socket
*, int, void *);
7956 static int event_unlock(struct socket
*, int, void *);
7958 static int event_sofreelastref(struct socket
*);
7959 static void kev_delete(struct kern_event_pcb
*);
7961 static struct pr_usrreqs event_usrreqs
= {
7962 .pru_attach
= kev_attach
,
7963 .pru_control
= kev_control
,
7964 .pru_detach
= kev_detach
,
7965 .pru_soreceive
= soreceive
,
7968 static struct protosw eventsw
[] = {
7970 .pr_type
= SOCK_RAW
,
7971 .pr_protocol
= SYSPROTO_EVENT
,
7972 .pr_flags
= PR_ATOMIC
,
7973 .pr_usrreqs
= &event_usrreqs
,
7974 .pr_lock
= event_lock
,
7975 .pr_unlock
= event_unlock
,
7976 .pr_getlock
= event_getlock
,
7980 __private_extern__
int kevt_getstat SYSCTL_HANDLER_ARGS
;
7981 __private_extern__
int kevt_pcblist SYSCTL_HANDLER_ARGS
;
7983 SYSCTL_NODE(_net_systm
, OID_AUTO
, kevt
,
7984 CTLFLAG_RW
| CTLFLAG_LOCKED
, 0, "Kernel event family");
7986 struct kevtstat kevtstat
;
7987 SYSCTL_PROC(_net_systm_kevt
, OID_AUTO
, stats
,
7988 CTLTYPE_STRUCT
| CTLFLAG_RD
| CTLFLAG_LOCKED
, 0, 0,
7989 kevt_getstat
, "S,kevtstat", "");
7991 SYSCTL_PROC(_net_systm_kevt
, OID_AUTO
, pcblist
,
7992 CTLTYPE_STRUCT
| CTLFLAG_RD
| CTLFLAG_LOCKED
, 0, 0,
7993 kevt_pcblist
, "S,xkevtpcb", "");
7996 event_getlock(struct socket
*so
, int flags
)
7998 #pragma unused(flags)
7999 struct kern_event_pcb
*ev_pcb
= (struct kern_event_pcb
*)so
->so_pcb
;
8001 if (so
->so_pcb
!= NULL
) {
8002 if (so
->so_usecount
< 0) {
8003 panic("%s: so=%p usecount=%d lrh= %s\n", __func__
,
8004 so
, so
->so_usecount
, solockhistory_nr(so
));
8008 panic("%s: so=%p NULL NO so_pcb %s\n", __func__
,
8009 so
, solockhistory_nr(so
));
8012 return &ev_pcb
->evp_mtx
;
8016 event_lock(struct socket
*so
, int refcount
, void *lr
)
8021 lr_saved
= __builtin_return_address(0);
8026 if (so
->so_pcb
!= NULL
) {
8027 lck_mtx_lock(&((struct kern_event_pcb
*)so
->so_pcb
)->evp_mtx
);
8029 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__
,
8030 so
, lr_saved
, solockhistory_nr(so
));
8034 if (so
->so_usecount
< 0) {
8035 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__
,
8036 so
, so
->so_pcb
, lr_saved
, so
->so_usecount
,
8037 solockhistory_nr(so
));
8045 so
->lock_lr
[so
->next_lock_lr
] = lr_saved
;
8046 so
->next_lock_lr
= (so
->next_lock_lr
+ 1) % SO_LCKDBG_MAX
;
8051 event_unlock(struct socket
*so
, int refcount
, void *lr
)
8054 lck_mtx_t
*mutex_held
;
8057 lr_saved
= __builtin_return_address(0);
8065 if (so
->so_usecount
< 0) {
8066 panic("%s: so=%p usecount=%d lrh= %s\n", __func__
,
8067 so
, so
->so_usecount
, solockhistory_nr(so
));
8070 if (so
->so_pcb
== NULL
) {
8071 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__
,
8072 so
, so
->so_usecount
, (void *)lr_saved
,
8073 solockhistory_nr(so
));
8076 mutex_held
= (&((struct kern_event_pcb
*)so
->so_pcb
)->evp_mtx
);
8078 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
8079 so
->unlock_lr
[so
->next_unlock_lr
] = lr_saved
;
8080 so
->next_unlock_lr
= (so
->next_unlock_lr
+ 1) % SO_LCKDBG_MAX
;
8082 if (so
->so_usecount
== 0) {
8083 VERIFY(so
->so_flags
& SOF_PCBCLEARING
);
8084 event_sofreelastref(so
);
8086 lck_mtx_unlock(mutex_held
);
8093 event_sofreelastref(struct socket
*so
)
8095 struct kern_event_pcb
*ev_pcb
= (struct kern_event_pcb
*)so
->so_pcb
;
8097 LCK_MTX_ASSERT(&(ev_pcb
->evp_mtx
), LCK_MTX_ASSERT_OWNED
);
8102 * Disable upcall in the event another thread is in kev_post_msg()
8103 * appending record to the receive socket buffer, since sbwakeup()
8104 * may release the socket lock otherwise.
8106 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
8107 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
8108 so
->so_event
= sonullevent
;
8109 lck_mtx_unlock(&(ev_pcb
->evp_mtx
));
8111 LCK_MTX_ASSERT(&(ev_pcb
->evp_mtx
), LCK_MTX_ASSERT_NOTOWNED
);
8112 lck_rw_lock_exclusive(kev_rwlock
);
8113 LIST_REMOVE(ev_pcb
, evp_link
);
8114 kevtstat
.kes_pcbcount
--;
8115 kevtstat
.kes_gencnt
++;
8116 lck_rw_done(kev_rwlock
);
8119 sofreelastref(so
, 1);
8123 static int event_proto_count
= (sizeof(eventsw
) / sizeof(struct protosw
));
8126 struct kern_event_head kern_event_head
;
8128 static u_int32_t static_event_id
= 0;
8130 #define EVPCB_ZONE_MAX 65536
8131 #define EVPCB_ZONE_NAME "kerneventpcb"
8132 static struct zone
*ev_pcb_zone
;
8135 * Install the protosw's for the NKE manager. Invoked at extension load time
8138 kern_event_init(struct domain
*dp
)
8143 VERIFY(!(dp
->dom_flags
& DOM_INITIALIZED
));
8144 VERIFY(dp
== systemdomain
);
8146 kev_lck_grp_attr
= lck_grp_attr_alloc_init();
8147 if (kev_lck_grp_attr
== NULL
) {
8148 panic("%s: lck_grp_attr_alloc_init failed\n", __func__
);
8152 kev_lck_grp
= lck_grp_alloc_init("Kernel Event Protocol",
8154 if (kev_lck_grp
== NULL
) {
8155 panic("%s: lck_grp_alloc_init failed\n", __func__
);
8159 kev_lck_attr
= lck_attr_alloc_init();
8160 if (kev_lck_attr
== NULL
) {
8161 panic("%s: lck_attr_alloc_init failed\n", __func__
);
8165 lck_rw_init(kev_rwlock
, kev_lck_grp
, kev_lck_attr
);
8166 if (kev_rwlock
== NULL
) {
8167 panic("%s: lck_mtx_alloc_init failed\n", __func__
);
8171 for (i
= 0, pr
= &eventsw
[0]; i
< event_proto_count
; i
++, pr
++) {
8172 net_add_proto(pr
, dp
, 1);
8175 ev_pcb_zone
= zinit(sizeof(struct kern_event_pcb
),
8176 EVPCB_ZONE_MAX
* sizeof(struct kern_event_pcb
), 0, EVPCB_ZONE_NAME
);
8177 if (ev_pcb_zone
== NULL
) {
8178 panic("%s: failed allocating ev_pcb_zone", __func__
);
8181 zone_change(ev_pcb_zone
, Z_EXPAND
, TRUE
);
8182 zone_change(ev_pcb_zone
, Z_CALLERACCT
, TRUE
);
8186 kev_attach(struct socket
*so
, __unused
int proto
, __unused
struct proc
*p
)
8189 struct kern_event_pcb
*ev_pcb
;
8191 error
= soreserve(so
, KEV_SNDSPACE
, KEV_RECVSPACE
);
8196 if ((ev_pcb
= (struct kern_event_pcb
*)zalloc(ev_pcb_zone
)) == NULL
) {
8199 bzero(ev_pcb
, sizeof(struct kern_event_pcb
));
8200 lck_mtx_init(&ev_pcb
->evp_mtx
, kev_lck_grp
, kev_lck_attr
);
8202 ev_pcb
->evp_socket
= so
;
8203 ev_pcb
->evp_vendor_code_filter
= 0xffffffff;
8205 so
->so_pcb
= (caddr_t
) ev_pcb
;
8206 lck_rw_lock_exclusive(kev_rwlock
);
8207 LIST_INSERT_HEAD(&kern_event_head
, ev_pcb
, evp_link
);
8208 kevtstat
.kes_pcbcount
++;
8209 kevtstat
.kes_gencnt
++;
8210 lck_rw_done(kev_rwlock
);
8216 kev_delete(struct kern_event_pcb
*ev_pcb
)
8218 VERIFY(ev_pcb
!= NULL
);
8219 lck_mtx_destroy(&ev_pcb
->evp_mtx
, kev_lck_grp
);
8220 zfree(ev_pcb_zone
, ev_pcb
);
8224 kev_detach(struct socket
*so
)
8226 struct kern_event_pcb
*ev_pcb
= (struct kern_event_pcb
*) so
->so_pcb
;
8228 if (ev_pcb
!= NULL
) {
8229 soisdisconnected(so
);
8230 so
->so_flags
|= SOF_PCBCLEARING
;
/*
 * For now, kev_vendor_code and mbuf_tags use the same
 * mechanism.
 */
errno_t
kev_vendor_code_find(
	const char      *string,
	u_int32_t       *out_vendor_code)
{
	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
		return EINVAL;
	}
	return net_str_id_find_internal(string, out_vendor_code,
	           NSI_VENDOR_CODE, 1);
}
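/*
 * Together with kev_msg_post() below, this is the kernel-side KPI a kext
 * uses to publish events: resolve (or register) a vendor code once, then
 * fill in a struct kev_msg and post it.  A hedged sketch; the vendor string,
 * class/subclass values and payload are invented:
 *
 *	static u_int32_t my_vendor_code;
 *
 *	static errno_t
 *	my_post_example_event(void)
 *	{
 *		struct kev_msg msg;
 *		u_int32_t payload = 42;                    // example payload
 *		errno_t err;
 *
 *		if (my_vendor_code == 0) {
 *			err = kev_vendor_code_find("com.example.driver",
 *			    &my_vendor_code);
 *			if (err != 0) {
 *				return err;
 *			}
 *		}
 *
 *		bzero(&msg, sizeof(msg));
 *		msg.vendor_code       = my_vendor_code;
 *		msg.kev_class         = 1;                 // hypothetical class
 *		msg.kev_subclass      = 2;                 // hypothetical subclass
 *		msg.event_code        = 3;                 // driver-defined code
 *		msg.dv[0].data_ptr    = &payload;
 *		msg.dv[0].data_length = sizeof(payload);
 *		// dv[1].data_length == 0 terminates the vector
 *
 *		return kev_msg_post(&msg);
 *	}
 */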
errno_t
kev_msg_post(struct kev_msg *event_msg)
{
	mbuf_tag_id_t min_vendor, max_vendor;

	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);

	if (event_msg == NULL) {
		return EINVAL;
	}

	/*
	 * Limit third parties to posting events for registered vendor codes
	 * only
	 */
	if (event_msg->vendor_code < min_vendor ||
	    event_msg->vendor_code > max_vendor) {
		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
		return EINVAL;
	}
	return kev_post_msg(event_msg);
}
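/*
 * User processes receive these records on a PF_SYSTEM/SYSPROTO_EVENT raw
 * socket; kev_control() below implements the SIOCSKEVFILT/SIOCGKEVFILT/
 * SIOCGKEVVENDOR ioctls that configure the per-socket filter.  A hedged
 * user-space sketch of subscribing to one vendor's events:
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/kern_event.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static int
 *	open_kev_socket(const char *vendor_string)
 *	{
 *		struct kev_vendor_code vc;
 *		struct kev_request req;
 *		int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *
 *		if (fd < 0) {
 *			return -1;
 *		}
 *
 *		// translate the vendor string into its numeric code
 *		strlcpy(vc.vendor_string, vendor_string, sizeof(vc.vendor_string));
 *		if (ioctl(fd, SIOCGKEVVENDOR, &vc) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *
 *		// install the receive filter: this vendor, any class/subclass
 *		req.vendor_code  = vc.vendor_code;
 *		req.kev_class    = KEV_ANY_CLASS;
 *		req.kev_subclass = KEV_ANY_SUBCLASS;
 *		if (ioctl(fd, SIOCSKEVFILT, &req) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *
 *		// each subsequent recv() returns one struct kern_event_msg
 *		return fd;
 *	}
 */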
8276 kev_post_msg(struct kev_msg
*event_msg
)
8278 struct mbuf
*m
, *m2
;
8279 struct kern_event_pcb
*ev_pcb
;
8280 struct kern_event_msg
*ev
;
8282 u_int32_t total_size
;
8285 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8286 total_size
= KEV_MSG_HEADER_SIZE
;
8288 for (i
= 0; i
< 5; i
++) {
8289 if (event_msg
->dv
[i
].data_length
== 0) {
8292 total_size
+= event_msg
->dv
[i
].data_length
;
8295 if (total_size
> MLEN
) {
8296 OSIncrementAtomic64((SInt64
*)&kevtstat
.kes_toobig
);
8300 m
= m_get(M_WAIT
, MT_DATA
);
8302 OSIncrementAtomic64((SInt64
*)&kevtstat
.kes_nomem
);
8305 ev
= mtod(m
, struct kern_event_msg
*);
8306 total_size
= KEV_MSG_HEADER_SIZE
;
8308 tmp
= (char *) &ev
->event_data
[0];
8309 for (i
= 0; i
< 5; i
++) {
8310 if (event_msg
->dv
[i
].data_length
== 0) {
8314 total_size
+= event_msg
->dv
[i
].data_length
;
8315 bcopy(event_msg
->dv
[i
].data_ptr
, tmp
,
8316 event_msg
->dv
[i
].data_length
);
8317 tmp
+= event_msg
->dv
[i
].data_length
;
8320 ev
->id
= ++static_event_id
;
8321 ev
->total_size
= total_size
;
8322 ev
->vendor_code
= event_msg
->vendor_code
;
8323 ev
->kev_class
= event_msg
->kev_class
;
8324 ev
->kev_subclass
= event_msg
->kev_subclass
;
8325 ev
->event_code
= event_msg
->event_code
;
8327 m
->m_len
= total_size
;
8328 lck_rw_lock_shared(kev_rwlock
);
8329 for (ev_pcb
= LIST_FIRST(&kern_event_head
);
8331 ev_pcb
= LIST_NEXT(ev_pcb
, evp_link
)) {
8332 lck_mtx_lock(&ev_pcb
->evp_mtx
);
8333 if (ev_pcb
->evp_socket
->so_pcb
== NULL
) {
8334 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8337 if (ev_pcb
->evp_vendor_code_filter
!= KEV_ANY_VENDOR
) {
8338 if (ev_pcb
->evp_vendor_code_filter
!= ev
->vendor_code
) {
8339 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8343 if (ev_pcb
->evp_class_filter
!= KEV_ANY_CLASS
) {
8344 if (ev_pcb
->evp_class_filter
!= ev
->kev_class
) {
8345 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8349 if ((ev_pcb
->evp_subclass_filter
!=
8350 KEV_ANY_SUBCLASS
) &&
8351 (ev_pcb
->evp_subclass_filter
!=
8352 ev
->kev_subclass
)) {
8353 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8359 m2
= m_copym(m
, 0, m
->m_len
, M_WAIT
);
8361 OSIncrementAtomic64((SInt64
*)&kevtstat
.kes_nomem
);
8363 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8364 lck_rw_done(kev_rwlock
);
8367 if (sbappendrecord(&ev_pcb
->evp_socket
->so_rcv
, m2
)) {
8369 * We use "m" for the socket stats as it would be
8370 * unsafe to use "m2"
8372 so_inc_recv_data_stat(ev_pcb
->evp_socket
,
8373 1, m
->m_len
, MBUF_TC_BE
);
8375 sorwakeup(ev_pcb
->evp_socket
);
8376 OSIncrementAtomic64((SInt64
*)&kevtstat
.kes_posted
);
8378 OSIncrementAtomic64((SInt64
*)&kevtstat
.kes_fullsock
);
8380 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8383 lck_rw_done(kev_rwlock
);
8389 kev_control(struct socket
*so
,
8392 __unused
struct ifnet
*ifp
,
8393 __unused
struct proc
*p
)
8395 struct kev_request
*kev_req
= (struct kev_request
*) data
;
8396 struct kern_event_pcb
*ev_pcb
;
8397 struct kev_vendor_code
*kev_vendor
;
8398 u_int32_t
*id_value
= (u_int32_t
*) data
;
8402 *id_value
= static_event_id
;
8405 ev_pcb
= (struct kern_event_pcb
*) so
->so_pcb
;
8406 ev_pcb
->evp_vendor_code_filter
= kev_req
->vendor_code
;
8407 ev_pcb
->evp_class_filter
= kev_req
->kev_class
;
8408 ev_pcb
->evp_subclass_filter
= kev_req
->kev_subclass
;
8411 ev_pcb
= (struct kern_event_pcb
*) so
->so_pcb
;
8412 kev_req
->vendor_code
= ev_pcb
->evp_vendor_code_filter
;
8413 kev_req
->kev_class
= ev_pcb
->evp_class_filter
;
8414 kev_req
->kev_subclass
= ev_pcb
->evp_subclass_filter
;
8416 case SIOCGKEVVENDOR
:
8417 kev_vendor
= (struct kev_vendor_code
*)data
;
8418 /* Make sure string is NULL terminated */
8419 kev_vendor
->vendor_string
[KEV_VENDOR_CODE_MAX_STR_LEN
- 1] = 0;
8420 return net_str_id_find_internal(kev_vendor
->vendor_string
,
8421 &kev_vendor
->vendor_code
, NSI_VENDOR_CODE
, 0);
__private_extern__ int
kevt_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;

	lck_rw_lock_shared(kev_rwlock);

	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sizeof(struct kevtstat);
		goto done;
	}

	error = SYSCTL_OUT(req, &kevtstat,
	    MIN(sizeof(struct kevtstat), req->oldlen));
done:
	lck_rw_done(kev_rwlock);

	return error;
}
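/*
 * The handler above is attached to the "stats" node declared earlier under
 * net.systm.kevt, so user space can read the counters with sysctlbyname().
 * A hedged sketch; the MIB string is inferred from the SYSCTL_NODE/
 * SYSCTL_PROC declarations and struct kevtstat is only visible to builds
 * that see the private header, so treat both as assumptions:
 *
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *	#include <sys/kern_event.h>
 *
 *	static void
 *	print_kevt_stats(void)
 *	{
 *		struct kevtstat st;
 *		size_t len = sizeof(st);
 *
 *		if (sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0) == 0) {
 *			printf("pcbs %llu posted %llu fullsock %llu\n",
 *			    (unsigned long long)st.kes_pcbcount,
 *			    (unsigned long long)st.kes_posted,
 *			    (unsigned long long)st.kes_fullsock);
 *		}
 *	}
 */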
8454 __private_extern__
int
8455 kevt_pcblist SYSCTL_HANDLER_ARGS
8457 #pragma unused(oidp, arg1, arg2)
8460 struct xsystmgen xsg
;
8462 size_t item_size
= ROUNDUP64(sizeof(struct xkevtpcb
)) +
8463 ROUNDUP64(sizeof(struct xsocket_n
)) +
8464 2 * ROUNDUP64(sizeof(struct xsockbuf_n
)) +
8465 ROUNDUP64(sizeof(struct xsockstat_n
));
8466 struct kern_event_pcb
*ev_pcb
;
8468 buf
= _MALLOC(item_size
, M_TEMP
, M_WAITOK
| M_ZERO
);
8473 lck_rw_lock_shared(kev_rwlock
);
8475 n
= kevtstat
.kes_pcbcount
;
8477 if (req
->oldptr
== USER_ADDR_NULL
) {
8478 req
->oldidx
= (n
+ n
/ 8) * item_size
;
8481 if (req
->newptr
!= USER_ADDR_NULL
) {
8485 bzero(&xsg
, sizeof(xsg
));
8486 xsg
.xg_len
= sizeof(xsg
);
8488 xsg
.xg_gen
= kevtstat
.kes_gencnt
;
8489 xsg
.xg_sogen
= so_gencnt
;
8490 error
= SYSCTL_OUT(req
, &xsg
, sizeof(xsg
));
8495 * We are done if there is no pcb
8502 for (i
= 0, ev_pcb
= LIST_FIRST(&kern_event_head
);
8503 i
< n
&& ev_pcb
!= NULL
;
8504 i
++, ev_pcb
= LIST_NEXT(ev_pcb
, evp_link
)) {
8505 struct xkevtpcb
*xk
= (struct xkevtpcb
*)buf
;
8506 struct xsocket_n
*xso
= (struct xsocket_n
*)
8507 ADVANCE64(xk
, sizeof(*xk
));
8508 struct xsockbuf_n
*xsbrcv
= (struct xsockbuf_n
*)
8509 ADVANCE64(xso
, sizeof(*xso
));
8510 struct xsockbuf_n
*xsbsnd
= (struct xsockbuf_n
*)
8511 ADVANCE64(xsbrcv
, sizeof(*xsbrcv
));
8512 struct xsockstat_n
*xsostats
= (struct xsockstat_n
*)
8513 ADVANCE64(xsbsnd
, sizeof(*xsbsnd
));
8515 bzero(buf
, item_size
);
8517 lck_mtx_lock(&ev_pcb
->evp_mtx
);
8519 xk
->kep_len
= sizeof(struct xkevtpcb
);
8520 xk
->kep_kind
= XSO_EVT
;
8521 xk
->kep_evtpcb
= (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb
);
8522 xk
->kep_vendor_code_filter
= ev_pcb
->evp_vendor_code_filter
;
8523 xk
->kep_class_filter
= ev_pcb
->evp_class_filter
;
8524 xk
->kep_subclass_filter
= ev_pcb
->evp_subclass_filter
;
8526 sotoxsocket_n(ev_pcb
->evp_socket
, xso
);
8527 sbtoxsockbuf_n(ev_pcb
->evp_socket
?
8528 &ev_pcb
->evp_socket
->so_rcv
: NULL
, xsbrcv
);
8529 sbtoxsockbuf_n(ev_pcb
->evp_socket
?
8530 &ev_pcb
->evp_socket
->so_snd
: NULL
, xsbsnd
);
8531 sbtoxsockstat_n(ev_pcb
->evp_socket
, xsostats
);
8533 lck_mtx_unlock(&ev_pcb
->evp_mtx
);
8535 error
= SYSCTL_OUT(req
, buf
, item_size
);
8540 * Give the user an updated idea of our state.
8541 * If the generation differs from what we told
8542 * her before, she knows that something happened
8543 * while we were processing this request, and it
8544 * might be necessary to retry.
8546 bzero(&xsg
, sizeof(xsg
));
8547 xsg
.xg_len
= sizeof(xsg
);
8549 xsg
.xg_gen
= kevtstat
.kes_gencnt
;
8550 xsg
.xg_sogen
= so_gencnt
;
8551 error
= SYSCTL_OUT(req
, &xsg
, sizeof(xsg
));
8558 lck_rw_done(kev_rwlock
);
8563 #endif /* SOCKETS */
8567 fill_kqueueinfo(struct kqueue
*kq
, struct kqueue_info
* kinfo
)
8569 struct vinfo_stat
* st
;
8571 st
= &kinfo
->kq_stat
;
8573 st
->vst_size
= kq
->kq_count
;
8574 if (kq
->kq_state
& KQ_KEV_QOS
) {
8575 st
->vst_blksize
= sizeof(struct kevent_qos_s
);
8576 } else if (kq
->kq_state
& KQ_KEV64
) {
8577 st
->vst_blksize
= sizeof(struct kevent64_s
);
8579 st
->vst_blksize
= sizeof(struct kevent
);
8581 st
->vst_mode
= S_IFIFO
;
8582 st
->vst_ino
= (kq
->kq_state
& KQ_DYNAMIC
) ?
8583 ((struct kqworkloop
*)kq
)->kqwl_dynamicid
: 0;
8585 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
8586 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
8587 kinfo
->kq_state
= kq
->kq_state
& PROC_KQUEUE_MASK
;
8593 fill_kqueue_dyninfo(struct kqueue
*kq
, struct kqueue_dyninfo
*kqdi
)
8595 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
8596 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
8597 workq_threadreq_param_t trp
= {};
8600 if ((kq
->kq_state
& KQ_WORKLOOP
) == 0) {
8604 if ((err
= fill_kqueueinfo(kq
, &kqdi
->kqdi_info
))) {
8610 kqdi
->kqdi_servicer
= thread_tid(kqr
->kqr_thread
);
8611 kqdi
->kqdi_owner
= thread_tid(kqwl
->kqwl_owner
);
8612 kqdi
->kqdi_request_state
= kqr
->kqr_state
;
8613 kqdi
->kqdi_async_qos
= kqr
->kqr_qos_index
;
8614 kqdi
->kqdi_events_qos
= kqr
->kqr_override_index
;
8615 kqdi
->kqdi_sync_waiters
= kqr
->kqr_dsync_waiters
;
8616 kqdi
->kqdi_sync_waiter_qos
= 0;
8618 trp
.trp_value
= kqwl
->kqwl_params
;
8619 if (trp
.trp_flags
& TRP_PRIORITY
) {
8620 kqdi
->kqdi_pri
= trp
.trp_pri
;
8625 if (trp
.trp_flags
& TRP_POLICY
) {
8626 kqdi
->kqdi_pol
= trp
.trp_pol
;
8631 if (trp
.trp_flags
& TRP_CPUPERCENT
) {
8632 kqdi
->kqdi_cpupercent
= trp
.trp_cpupercent
;
8634 kqdi
->kqdi_cpupercent
= 0;
8637 kq_req_unlock(kqwl
);
8644 knote_markstayactive(struct knote
*kn
)
8646 struct kqueue
*kq
= knote_get_kq(kn
);
8650 kn
->kn_status
|= KN_STAYACTIVE
;
8653 * Making a knote stay active is a property of the knote that must be
8654 * established before it is fully attached.
8656 assert(kn
->kn_status
& KN_ATTACHING
);
8657 assert((kn
->kn_status
& (KN_QUEUED
| KN_SUPPRESSED
)) == 0);
8659 /* handle all stayactive knotes on the (appropriate) manager */
8660 if (kq
->kq_state
& KQ_WORKQ
) {
8661 qos
= KQWQ_QOS_MANAGER
;
8662 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
8663 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
8665 qos
= _pthread_priority_thread_qos(kn
->kn_qos
);
8666 assert(qos
&& qos
< THREAD_QOS_LAST
);
8668 kqworkloop_update_threads_qos(kqwl
, KQWL_UTQ_UPDATE_STAYACTIVE_QOS
, qos
);
8670 qos
= KQWL_BUCKET_STAYACTIVE
;
8672 qos
= THREAD_QOS_UNSPECIFIED
;
8675 kn
->kn_req_index
= qos
;
8676 kn
->kn_qos_override
= qos
;
8677 kn
->kn_qos_index
= qos
;
8684 knote_clearstayactive(struct knote
*kn
)
8686 kqlock(knote_get_kq(kn
));
8687 kn
->kn_status
&= ~KN_STAYACTIVE
;
8688 knote_deactivate(kn
);
8689 kqunlock(knote_get_kq(kn
));
8692 static unsigned long
8693 kevent_extinfo_emit(struct kqueue
*kq
, struct knote
*kn
, struct kevent_extinfo
*buf
,
8694 unsigned long buflen
, unsigned long nknotes
)
8696 for (; kn
; kn
= SLIST_NEXT(kn
, kn_link
)) {
8697 if (kq
== knote_get_kq(kn
)) {
8698 if (nknotes
< buflen
) {
8699 struct kevent_extinfo
*info
= &buf
[nknotes
];
8700 struct kevent_internal_s
*kevp
= &kn
->kn_kevent
;
8704 info
->kqext_kev
= (struct kevent_qos_s
){
8705 .ident
= kevp
->ident
,
8706 .filter
= kevp
->filter
,
8707 .flags
= kevp
->flags
,
8708 .fflags
= kevp
->fflags
,
8709 .data
= (int64_t)kevp
->data
,
8710 .udata
= kevp
->udata
,
8711 .ext
[0] = kevp
->ext
[0],
8712 .ext
[1] = kevp
->ext
[1],
8713 .ext
[2] = kevp
->ext
[2],
8714 .ext
[3] = kevp
->ext
[3],
8715 .qos
= kn
->kn_req_index
,
8717 info
->kqext_sdata
= kn
->kn_sdata
;
8718 info
->kqext_status
= kn
->kn_status
;
8719 info
->kqext_sfflags
= kn
->kn_sfflags
;
8724 /* we return total number of knotes, which may be more than requested */
8733 kevent_copyout_proc_dynkqids(void *proc
, user_addr_t ubuf
, uint32_t ubufsize
,
8734 int32_t *nkqueues_out
)
8736 proc_t p
= (proc_t
)proc
;
8737 struct filedesc
*fdp
= p
->p_fd
;
8738 unsigned int nkqueues
= 0;
8739 unsigned long ubuflen
= ubufsize
/ sizeof(kqueue_id_t
);
8740 size_t buflen
, bufsize
;
8741 kqueue_id_t
*kq_ids
= NULL
;
8746 if (ubuf
== USER_ADDR_NULL
&& ubufsize
!= 0) {
8751 buflen
= min(ubuflen
, PROC_PIDDYNKQUEUES_MAX
);
8754 if (os_mul_overflow(sizeof(kqueue_id_t
), buflen
, &bufsize
)) {
8758 kq_ids
= kalloc(bufsize
);
8763 bzero(kq_ids
, bufsize
);
8768 if (fdp
->fd_kqhashmask
> 0) {
8769 for (uint32_t i
= 0; i
< fdp
->fd_kqhashmask
+ 1; i
++) {
8770 struct kqworkloop
*kqwl
;
8772 SLIST_FOREACH(kqwl
, &fdp
->fd_kqhash
[i
], kqwl_hashlink
) {
8773 /* report the number of kqueues, even if they don't all fit */
8774 if (nkqueues
< buflen
) {
8775 kq_ids
[nkqueues
] = kqwl
->kqwl_dynamicid
;
8786 if (os_mul_overflow(sizeof(kqueue_id_t
), min(buflen
, nkqueues
), ©size
)) {
8791 assert(ubufsize
>= copysize
);
8792 err
= copyout(kq_ids
, ubuf
, copysize
);
8797 kfree(kq_ids
, bufsize
);
8801 *nkqueues_out
= (int)min(nkqueues
, PROC_PIDDYNKQUEUES_MAX
);
8807 kevent_copyout_dynkqinfo(void *proc
, kqueue_id_t kq_id
, user_addr_t ubuf
,
8808 uint32_t ubufsize
, int32_t *size_out
)
8810 proc_t p
= (proc_t
)proc
;
8813 struct kqueue_dyninfo kqdi
= { };
8817 if (ubufsize
< sizeof(struct kqueue_info
)) {
8822 kq
= kqueue_hash_lookup(p
, kq_id
);
8831 * backward compatibility: allow the argument to this call to only be
8832 * a struct kqueue_info
8834 if (ubufsize
>= sizeof(struct kqueue_dyninfo
)) {
8835 ubufsize
= sizeof(struct kqueue_dyninfo
);
8836 err
= fill_kqueue_dyninfo(kq
, &kqdi
);
8838 ubufsize
= sizeof(struct kqueue_info
);
8839 err
= fill_kqueueinfo(kq
, &kqdi
.kqdi_info
);
8841 if (err
== 0 && (err
= copyout(&kqdi
, ubuf
, ubufsize
)) == 0) {
8842 *size_out
= ubufsize
;
8844 kqueue_release_last(p
, kq
);
8849 kevent_copyout_dynkqextinfo(void *proc
, kqueue_id_t kq_id
, user_addr_t ubuf
,
8850 uint32_t ubufsize
, int32_t *nknotes_out
)
8852 proc_t p
= (proc_t
)proc
;
8859 kq
= kqueue_hash_lookup(p
, kq_id
);
8867 err
= pid_kqueue_extinfo(p
, kq
, ubuf
, ubufsize
, nknotes_out
);
8868 kqueue_release_last(p
, kq
);
8873 pid_kqueue_extinfo(proc_t p
, struct kqueue
*kq
, user_addr_t ubuf
,
8874 uint32_t bufsize
, int32_t *retval
)
8879 struct filedesc
*fdp
= p
->p_fd
;
8880 unsigned long nknotes
= 0;
8881 unsigned long buflen
= bufsize
/ sizeof(struct kevent_extinfo
);
8882 struct kevent_extinfo
*kqext
= NULL
;
8884 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
8885 buflen
= min(buflen
, PROC_PIDFDKQUEUE_KNOTES_MAX
);
8887 kqext
= kalloc(buflen
* sizeof(struct kevent_extinfo
));
8888 if (kqext
== NULL
) {
8892 bzero(kqext
, buflen
* sizeof(struct kevent_extinfo
));
8895 for (i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
8896 kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
]);
8897 nknotes
= kevent_extinfo_emit(kq
, kn
, kqext
, buflen
, nknotes
);
8901 if (fdp
->fd_knhashmask
!= 0) {
8902 for (i
= 0; i
< (int)fdp
->fd_knhashmask
+ 1; i
++) {
8904 kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
]);
8905 nknotes
= kevent_extinfo_emit(kq
, kn
, kqext
, buflen
, nknotes
);
8910 assert(bufsize
>= sizeof(struct kevent_extinfo
) * min(buflen
, nknotes
));
8911 err
= copyout(kqext
, ubuf
, sizeof(struct kevent_extinfo
) * min(buflen
, nknotes
));
8915 kfree(kqext
, buflen
* sizeof(struct kevent_extinfo
));
8920 *retval
= min(nknotes
, PROC_PIDFDKQUEUE_KNOTES_MAX
);
8926 klist_copy_udata(struct klist
*list
, uint64_t *buf
,
8927 unsigned int buflen
, unsigned int nknotes
)
8929 struct kevent_internal_s
*kev
;
8931 SLIST_FOREACH(kn
, list
, kn_link
) {
8932 if (nknotes
< buflen
) {
8933 struct kqueue
*kq
= knote_get_kq(kn
);
8935 kev
= &(kn
->kn_kevent
);
8936 buf
[nknotes
] = kev
->udata
;
8939 /* we return total number of knotes, which may be more than requested */
8947 kqlist_copy_dynamicids(__assert_only proc_t p
, struct kqlist
*list
,
8948 uint64_t *buf
, unsigned int buflen
, unsigned int nids
)
8950 kqhash_lock_held(p
);
8951 struct kqworkloop
*kqwl
;
8952 SLIST_FOREACH(kqwl
, list
, kqwl_hashlink
) {
8953 if (nids
< buflen
) {
8954 buf
[nids
] = kqwl
->kqwl_dynamicid
;
8962 kevent_proc_copy_uptrs(void *proc
, uint64_t *buf
, int bufsize
)
8964 proc_t p
= (proc_t
)proc
;
8965 struct filedesc
*fdp
= p
->p_fd
;
8966 unsigned int nuptrs
= 0;
8967 unsigned long buflen
= bufsize
/ sizeof(uint64_t);
8970 assert(buf
!= NULL
);
8974 for (int i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
8975 nuptrs
= klist_copy_udata(&fdp
->fd_knlist
[i
], buf
, buflen
, nuptrs
);
8979 if (fdp
->fd_knhashmask
!= 0) {
8980 for (int i
= 0; i
< (int)fdp
->fd_knhashmask
+ 1; i
++) {
8981 nuptrs
= klist_copy_udata(&fdp
->fd_knhash
[i
], buf
, buflen
, nuptrs
);
8987 if (fdp
->fd_kqhashmask
!= 0) {
8988 for (int i
= 0; i
< (int)fdp
->fd_kqhashmask
+ 1; i
++) {
8989 nuptrs
= kqlist_copy_dynamicids(p
, &fdp
->fd_kqhash
[i
], buf
, buflen
,
8999 kevent_set_return_to_kernel_user_tsd(proc_t p
, thread_t thread
)
9002 bool proc_is_64bit
= !!(p
->p_flag
& P_LP64
);
9003 size_t user_addr_size
= proc_is_64bit
? 8 : 4;
9004 uint32_t ast_flags32
= 0;
9005 uint64_t ast_flags64
= 0;
9006 struct uthread
*ut
= get_bsdthread_info(thread
);
9008 if (ut
->uu_kqr_bound
!= NULL
) {
9009 ast_flags64
|= R2K_WORKLOOP_PENDING_EVENTS
;
9012 if (ast_flags64
== 0) {
9016 if (!(p
->p_flag
& P_LP64
)) {
9017 ast_flags32
= (uint32_t)ast_flags64
;
9018 assert(ast_flags64
< 0x100000000ull
);
9021 ast_addr
= thread_rettokern_addr(thread
);
9022 if (ast_addr
== 0) {
9026 if (copyout((proc_is_64bit
? (void *)&ast_flags64
: (void *)&ast_flags32
),
9027 (user_addr_t
)ast_addr
,
9028 user_addr_size
) != 0) {
9029 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9030 "ast_addr = %llu\n", p
->p_pid
, thread_tid(current_thread()), ast_addr
);
9035 kevent_ast(thread_t thread
, uint16_t bits
)
9037 proc_t p
= current_proc();
9039 if (bits
& AST_KEVENT_REDRIVE_THREADREQ
) {
9040 workq_kern_threadreq_redrive(p
, WORKQ_THREADREQ_CAN_CREATE_THREADS
);
9042 if (bits
& AST_KEVENT_RETURN_TO_KERNEL
) {
9043 kevent_set_return_to_kernel_user_tsd(p
, thread
);
#if DEVELOPMENT || DEBUG

#define KEVENT_SYSCTL_BOUND_ID 1

static int
kevent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	uintptr_t type = (uintptr_t)arg1;
	uint64_t bound_id = 0;

	if (type != KEVENT_SYSCTL_BOUND_ID) {
		return EINVAL;
	}

	if (req->newptr) {
		return EINVAL;
	}

	struct uthread *ut = get_bsdthread_info(current_thread());
	if (!ut) {
		return EFAULT;
	}

	struct kqrequest *kqr = ut->uu_kqr_bound;
	if (kqr) {
		if (kqr->kqr_state & KQR_WORKLOOP) {
			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
		} else {
			bound_id = -1;
		}
	}

	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
}

SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
    "kevent information");

SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    (void *)KEVENT_SYSCTL_BOUND_ID,
    sizeof(kqueue_id_t), kevent_sysctl, "Q",
    "get the ID of the bound kqueue");

#endif /* DEVELOPMENT || DEBUG */
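/*
 * On DEVELOPMENT/DEBUG kernels the servicer thread can query which dynamic
 * kqueue it is currently bound to by reading the sysctl registered above.
 * A hedged user-space sketch; the MIB string "kern.kevent.bound_id" is
 * inferred from the SYSCTL_NODE/SYSCTL_PROC names:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *
 *	static void
 *	print_bound_kqueue_id(void)
 *	{
 *		uint64_t bound_id = 0;
 *		size_t len = sizeof(bound_id);
 *
 *		if (sysctlbyname("kern.kevent.bound_id", &bound_id, &len,
 *		        NULL, 0) == 0) {
 *			// 0 when the thread is not bound; otherwise the
 *			// workloop's dynamic identifier (per the handler above)
 *			printf("bound kqueue id: 0x%llx\n",
 *			    (unsigned long long)bound_id);
 *		}
 *	}
 */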