apple/xnu (xnu-7195.81.3): bsd/kern/kern_event.c
1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91
92 #include <kern/locks.h>
93 #include <kern/clock.h>
94 #include <kern/cpu_data.h>
95 #include <kern/policy_internal.h>
96 #include <kern/thread_call.h>
97 #include <kern/sched_prim.h>
98 #include <kern/waitq.h>
99 #include <kern/zalloc.h>
100 #include <kern/kalloc.h>
101 #include <kern/assert.h>
102 #include <kern/ast.h>
103 #include <kern/thread.h>
104 #include <kern/kcdata.h>
105
106 #include <pthread/priority_private.h>
107 #include <pthread/workqueue_syscalls.h>
108 #include <pthread/workqueue_internal.h>
109 #include <libkern/libkern.h>
110
111 #include "net/net_str_id.h"
112
113 #include <mach/task.h>
114 #include <libkern/section_keywords.h>
115
116 #if CONFIG_MEMORYSTATUS
117 #include <sys/kern_memorystatus.h>
118 #endif
119
120 #if DEVELOPMENT || DEBUG
121 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
122 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
123 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
124 #endif
125
126 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
127 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
128 VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
129
130 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
131 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
132
133 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
134
135 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
136
137 #define KQ_EVENT NO_EVENT64
138
139 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
140 vfs_context_t ctx);
141 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
142 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
143 struct kevent_qos_s *kev);
144 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
145
146 static const struct fileops kqueueops = {
147 .fo_type = DTYPE_KQUEUE,
148 .fo_read = fo_no_read,
149 .fo_write = fo_no_write,
150 .fo_ioctl = fo_no_ioctl,
151 .fo_select = kqueue_select,
152 .fo_close = kqueue_close,
153 .fo_drain = kqueue_drain,
154 .fo_kqfilter = kqueue_kqfilter,
155 };
156
157 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
158 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
159 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
160 thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
161 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
162 static void kevent_register_wait_cleanup(struct knote *kn);
163
164 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
165 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
166
167 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
168 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
169 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
170
171 static void kqworkloop_unbind(struct kqworkloop *kwql);
172
173 enum kqwl_unbind_locked_mode {
174 KQWL_OVERRIDE_DROP_IMMEDIATELY,
175 KQWL_OVERRIDE_DROP_DELAYED,
176 };
177 static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
178 enum kqwl_unbind_locked_mode how);
179 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
180 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
181 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
182 enum {
183 KQWL_UTQ_NONE,
184 /*
185 * The wakeup qos is the qos of QUEUED knotes.
186 *
187 * This QoS is accounted for with the events override in the
188 * kqr_override_index field. It is raised each time a new knote is queued at
189 * a given QoS. The kqwl_wakeup_indexes field is a superset of the non-empty
190 * knote buckets and is recomputed after each event delivery.
191 */
192 KQWL_UTQ_UPDATE_WAKEUP_QOS,
193 KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
194 KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
195 KQWL_UTQ_UNBINDING, /* attempt to rebind */
196 KQWL_UTQ_PARKING,
197 /*
198 * The wakeup override is for suppressed knotes that have fired again at
199 * a higher QoS than the one for which they are suppressed already.
200 * This override is cleared when the knote suppressed list becomes empty.
201 */
202 KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
203 KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
204 /*
205 * The QoS is the maximum QoS of an event enqueued on this workloop in
206 * userland. It is copied from the only EVFILT_WORKLOOP knote with
207 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
208 * such knote, this QoS is 0.
209 */
210 KQWL_UTQ_SET_QOS_INDEX,
211 KQWL_UTQ_REDRIVE_EVENTS,
212 };
213 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
214 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
215
216 static struct knote *knote_alloc(void);
217 static void knote_free(struct knote *kn);
218 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
219 struct knote_lock_ctx *knlc, struct proc *p);
220 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
221 struct kevent_qos_s *kev, bool is_fd, struct proc *p);
222
223 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
224 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
225
226 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
227 struct kevent_qos_s *kev, int result);
228 static void knote_suppress(kqueue_t kqu, struct knote *kn);
229 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
230 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
231
232 // both of these functions may dequeue the knote; it is up to the caller
233 // to enqueue the knote again
234 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
235 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
236
237 static ZONE_DECLARE(knote_zone, "knote zone",
238 sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
239 static ZONE_DECLARE(kqfile_zone, "kqueue file zone",
240 sizeof(struct kqfile), ZC_ZFREE_CLEARMEM);
241 static ZONE_DECLARE(kqworkq_zone, "kqueue workq zone",
242 sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM);
243 static ZONE_DECLARE(kqworkloop_zone, "kqueue workloop zone",
244 sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM);
245
246 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
247
248 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
249 static void filt_no_detach(struct knote *kn);
250 static int filt_bad_event(struct knote *kn, long hint);
251 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
252 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
253
254 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
255 .f_attach = filt_no_attach,
256 .f_detach = filt_no_detach,
257 .f_event = filt_bad_event,
258 .f_touch = filt_bad_touch,
259 .f_process = filt_bad_process,
260 };
261
262 #if CONFIG_MEMORYSTATUS
263 extern const struct filterops memorystatus_filtops;
264 #endif /* CONFIG_MEMORYSTATUS */
265 extern const struct filterops fs_filtops;
266 extern const struct filterops sig_filtops;
267 extern const struct filterops machport_filtops;
268 extern const struct filterops pipe_nfiltops;
269 extern const struct filterops pipe_rfiltops;
270 extern const struct filterops pipe_wfiltops;
271 extern const struct filterops ptsd_kqops;
272 extern const struct filterops ptmx_kqops;
273 extern const struct filterops soread_filtops;
274 extern const struct filterops sowrite_filtops;
275 extern const struct filterops sock_filtops;
276 extern const struct filterops soexcept_filtops;
277 extern const struct filterops spec_filtops;
278 extern const struct filterops bpfread_filtops;
279 extern const struct filterops necp_fd_rfiltops;
280 extern const struct filterops fsevent_filtops;
281 extern const struct filterops vnode_filtops;
282 extern const struct filterops tty_filtops;
283
284 const static struct filterops file_filtops;
285 const static struct filterops kqread_filtops;
286 const static struct filterops proc_filtops;
287 const static struct filterops timer_filtops;
288 const static struct filterops user_filtops;
289 const static struct filterops workloop_filtops;
290
291 /*
292 *
293 * Rules for adding new filters to the system:
294 * Public filters:
295 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
296 * in the exported section of the header
297 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
298 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
299 * of the Public Filters section in the array.
300 * Private filters:
301 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
302 * in the XNU_KERNEL_PRIVATE section of the header
303 * - Update the EVFILTID_MAX value to reflect the new addition
304 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
305 * the Private filters section of the array.
306 */
307 static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
308 static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
309 /* Public Filters */
310 [~EVFILT_READ] = &file_filtops,
311 [~EVFILT_WRITE] = &file_filtops,
312 [~EVFILT_AIO] = &bad_filtops,
313 [~EVFILT_VNODE] = &file_filtops,
314 [~EVFILT_PROC] = &proc_filtops,
315 [~EVFILT_SIGNAL] = &sig_filtops,
316 [~EVFILT_TIMER] = &timer_filtops,
317 [~EVFILT_MACHPORT] = &machport_filtops,
318 [~EVFILT_FS] = &fs_filtops,
319 [~EVFILT_USER] = &user_filtops,
320 [~EVFILT_UNUSED_11] = &bad_filtops,
321 [~EVFILT_VM] = &bad_filtops,
322 [~EVFILT_SOCK] = &file_filtops,
323 #if CONFIG_MEMORYSTATUS
324 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
325 #else
326 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
327 #endif
328 [~EVFILT_EXCEPT] = &file_filtops,
329 [~EVFILT_WORKLOOP] = &workloop_filtops,
330
331 /* Private filters */
332 [EVFILTID_KQREAD] = &kqread_filtops,
333 [EVFILTID_PIPE_N] = &pipe_nfiltops,
334 [EVFILTID_PIPE_R] = &pipe_rfiltops,
335 [EVFILTID_PIPE_W] = &pipe_wfiltops,
336 [EVFILTID_PTSD] = &ptsd_kqops,
337 [EVFILTID_SOREAD] = &soread_filtops,
338 [EVFILTID_SOWRITE] = &sowrite_filtops,
339 [EVFILTID_SCK] = &sock_filtops,
340 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
341 [EVFILTID_SPEC] = &spec_filtops,
342 [EVFILTID_BPFREAD] = &bpfread_filtops,
343 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
344 [EVFILTID_FSEVENT] = &fsevent_filtops,
345 [EVFILTID_VN] = &vnode_filtops,
346 [EVFILTID_TTY] = &tty_filtops,
347 [EVFILTID_PTMX] = &ptmx_kqops,
348
349 /* fake filter for detached knotes, keep last */
350 [EVFILTID_DETACHED] = &bad_filtops,
351 };
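/*
 * Illustrative sketch (hypothetical helper, not used by this file): how the
 * index expressions above work out. Public EVFILT_* values are small negative
 * numbers, so bitwise NOT folds them onto the low slots of sysfilt_ops[],
 * while private EVFILTID_* values are positive and index directly.
 */
#if 0
static const struct filterops *
sysfilt_lookup_public(int16_t filter)
{
	/* EVFILT_READ  == -1  ->  ~(-1) == 0  ->  sysfilt_ops[0]
	 * EVFILT_WRITE == -2  ->  ~(-2) == 1  ->  sysfilt_ops[1], and so on */
	return sysfilt_ops[~filter];
}
#endif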
352
353 /* waitq prepost callback */
354 void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook);
355
356 static inline bool
357 kqr_thread_bound(workq_threadreq_t kqr)
358 {
359 return kqr->tr_state == WORKQ_TR_STATE_BOUND;
360 }
361
362 static inline bool
363 kqr_thread_requested_pending(workq_threadreq_t kqr)
364 {
365 workq_tr_state_t tr_state = kqr->tr_state;
366 return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
367 }
368
369 static inline bool
370 kqr_thread_requested(workq_threadreq_t kqr)
371 {
372 return kqr->tr_state != WORKQ_TR_STATE_IDLE;
373 }
374
375 static inline thread_t
376 kqr_thread_fast(workq_threadreq_t kqr)
377 {
378 assert(kqr_thread_bound(kqr));
379 return kqr->tr_thread;
380 }
381
382 static inline thread_t
383 kqr_thread(workq_threadreq_t kqr)
384 {
385 return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
386 }
387
388 static inline struct kqworkloop *
389 kqr_kqworkloop(workq_threadreq_t kqr)
390 {
391 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
392 return __container_of(kqr, struct kqworkloop, kqwl_request);
393 }
394 return NULL;
395 }
396
397 static inline kqueue_t
398 kqr_kqueue(proc_t p, workq_threadreq_t kqr)
399 {
400 kqueue_t kqu;
401 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
402 kqu.kqwl = kqr_kqworkloop(kqr);
403 } else {
404 kqu.kqwq = p->p_fd->fd_wqkqueue;
405 assert(kqr >= kqu.kqwq->kqwq_request &&
406 kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
407 }
408 return kqu;
409 }
410
411 /*
412 * kqueue/note lock implementations
413 *
414 * The kqueue lock guards the kq state, the state of its queues,
415 * and the kqueue-aware status and locks of individual knotes.
416 *
417 * The kqueue workq lock is used to protect state guarding the
418 * interaction of the kqueue with the workq. This state cannot
419 * be guarded by the kq lock - as it needs to be taken when we
420 * already have the waitq set lock held (during the waitq hook
421 * callback). It might be better to use the waitq lock itself
422 * for this, but the IRQ requirements make that difficult.
423 *
424 * Knote flags, filter flags, and associated data are protected
425 * by the underlying object lock - and are only ever looked at
426 * by calling the filter to get a [consistent] snapshot of that
427 * data.
428 */
429
430 static inline void
431 kqlock(kqueue_t kqu)
432 {
433 lck_spin_lock(&kqu.kq->kq_lock);
434 }
435
436 static inline void
437 kqlock_held(__assert_only kqueue_t kqu)
438 {
439 LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
440 }
441
442 static inline void
443 kqunlock(kqueue_t kqu)
444 {
445 lck_spin_unlock(&kqu.kq->kq_lock);
446 }
447
448 static inline void
449 knhash_lock(struct filedesc *fdp)
450 {
451 lck_mtx_lock(&fdp->fd_knhashlock);
452 }
453
454 static inline void
455 knhash_unlock(struct filedesc *fdp)
456 {
457 lck_mtx_unlock(&fdp->fd_knhashlock);
458 }
459
460 /* wait event for knote locks */
461 static inline event_t
462 knote_lock_wev(struct knote *kn)
463 {
464 return (event_t)(&kn->kn_hook);
465 }
466
467 /* wait event for kevent_register_wait_* */
468 static inline event64_t
469 knote_filt_wev64(struct knote *kn)
470 {
471 /* kdp_workloop_sync_wait_find_owner knows about this */
472 return CAST_EVENT64_T(kn);
473 }
474
475 /* wait event for knote_post/knote_drop */
476 static inline event64_t
477 knote_post_wev64(struct knote *kn)
478 {
479 return CAST_EVENT64_T(&kn->kn_kevent);
480 }
481
482 /*!
483 * @function knote_has_qos
484 *
485 * @brief
486 * Whether the knote has a regular QoS.
487 *
488 * @discussion
489 * kn_qos_override is:
490 * - 0 on kqfiles
491 * - THREAD_QOS_LAST for special buckets (stayactive, manager)
492 *
493 * Other values mean the knote participates in QoS propagation.
494 */
495 static inline bool
496 knote_has_qos(struct knote *kn)
497 {
498 return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
499 }
500
501 #pragma mark knote locks
502
503 /*
504 * Enum used by the knote_lock_* functions.
505 *
506 * KNOTE_KQ_LOCK_ALWAYS
507 * The function will always return with the kq lock held.
508 *
509 * KNOTE_KQ_LOCK_ON_SUCCESS
510 * The function will return with the kq lock held if it was successful
511 * (knote_lock() is the only function that can fail).
512 *
513 * KNOTE_KQ_LOCK_ON_FAILURE
514 * The function will return with the kq lock held if it was unsuccessful
515 * (knote_lock() is the only function that can fail).
516 *
517 * KNOTE_KQ_UNLOCK
518 * The function returns with the kq unlocked.
519 */
520 enum kqlocking {
521 KNOTE_KQ_LOCK_ALWAYS,
522 KNOTE_KQ_LOCK_ON_SUCCESS,
523 KNOTE_KQ_LOCK_ON_FAILURE,
524 KNOTE_KQ_UNLOCK,
525 };
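/*
 * Illustrative sketch (hypothetical caller): how the KNOTE_KQ_* modes compose
 * with knote_lock()/knote_unlock() below. Only knote_lock() can fail, and the
 * mode argument states which lock the caller wants back in each outcome.
 */
#if 0
static void
knote_lock_usage_sketch(struct kqueue *kq, struct knote *kn)
{
	/* simplified declaration; on DEBUG builds the real callers also
	 * initialize knlc_state to KNOTE_LOCK_CTX_UNLOCKED */
	struct knote_lock_ctx knlc;

	kqlock(kq);
	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/* failed: the knote was dropped concurrently, and because we
		 * asked for LOCK_ON_SUCCESS the kq lock is already released */
		return;
	}

	/* success: both the kq lock and the knote lock are held here,
	 * serializing us against concurrent register/drop of this knote */

	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	/* knote lock released, kq left unlocked as requested */
}
#endif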
526
527 static struct knote_lock_ctx *
528 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
529 {
530 struct knote_lock_ctx *ctx;
531 LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
532 if (ctx->knlc_knote == kn) {
533 return ctx;
534 }
535 }
536 panic("knote lock context not found: %p", kn);
537 __builtin_trap();
538 }
539
540 /* slowpath of knote_lock() */
541 __attribute__((noinline))
542 static bool __result_use_check
543 knote_lock_slow(kqueue_t kqu, struct knote *kn,
544 struct knote_lock_ctx *knlc, int kqlocking)
545 {
546 struct knote_lock_ctx *owner_lc;
547 struct uthread *uth = current_uthread();
548 wait_result_t wr;
549
550 kqlock_held(kqu);
551
552 owner_lc = knote_lock_ctx_find(kqu, kn);
553 #if DEBUG || DEVELOPMENT
554 knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
555 #endif
556 owner_lc->knlc_waiters++;
557
558 /*
559 * Make our lock context visible to knote_unlock()
560 */
561 uth->uu_knlock = knlc;
562
563 wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
564 knote_lock_wev(kn), owner_lc->knlc_thread,
565 THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
566
567 if (wr == THREAD_RESTART) {
568 /*
569 * We haven't been woken up by knote_unlock() but by knote_unlock_cancel().
570 * We need to clean up the state since no one else did.
571 */
572 uth->uu_knlock = NULL;
573 #if DEBUG || DEVELOPMENT
574 assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
575 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
576 #endif
577
578 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
579 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
580 kqlock(kqu);
581 }
582 return false;
583 } else {
584 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
585 kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
586 kqlock(kqu);
587 #if DEBUG || DEVELOPMENT
588 /*
589 * This state is set under the lock so we can't
590 * really assert this unless we hold the lock.
591 */
592 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
593 #endif
594 }
595 return true;
596 }
597 }
598
599 /*
600 * Attempts to take the "knote" lock.
601 *
602 * Called with the kqueue lock held.
603 *
604 * Returns true if the knote lock is acquired, false if it has been dropped
605 */
606 static bool __result_use_check
607 knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
608 enum kqlocking kqlocking)
609 {
610 kqlock_held(kqu);
611
612 #if DEBUG || DEVELOPMENT
613 assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
614 #endif
615 knlc->knlc_knote = kn;
616 knlc->knlc_thread = current_thread();
617 knlc->knlc_waiters = 0;
618
619 if (__improbable(kn->kn_status & KN_LOCKED)) {
620 return knote_lock_slow(kqu, kn, knlc, kqlocking);
621 }
622
623 /*
624 * When a knote is dropped, the knote lock is taken before
625 * KN_DROPPING is set, and the knote is then removed from any
626 * hash table that references it before the lock is canceled.
627 */
628 assert((kn->kn_status & KN_DROPPING) == 0);
629 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
630 kn->kn_status |= KN_LOCKED;
631 #if DEBUG || DEVELOPMENT
632 knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
633 #endif
634
635 if (kqlocking == KNOTE_KQ_UNLOCK ||
636 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
637 kqunlock(kqu);
638 }
639 return true;
640 }
641
642 /*
643 * Unlocks a knote successfully locked with knote_lock().
644 *
645 * Called with the kqueue lock held.
646 *
647 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
648 */
649 static void
650 knote_unlock(kqueue_t kqu, struct knote *kn,
651 struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
652 {
653 kqlock_held(kqu);
654
655 assert(knlc->knlc_knote == kn);
656 assert(kn->kn_status & KN_LOCKED);
657 #if DEBUG || DEVELOPMENT
658 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
659 #endif
660
661 LIST_REMOVE(knlc, knlc_link);
662
663 if (knlc->knlc_waiters) {
664 thread_t thread = THREAD_NULL;
665
666 wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
667 LCK_WAKE_DEFAULT, &thread);
668
669 /*
670 * knote_lock_slow() publishes the lock context of waiters
671 * in uthread::uu_knlock.
672 *
673 * Reach out and make this context the new owner.
674 */
675 struct uthread *ut = get_bsdthread_info(thread);
676 struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
677
678 assert(next_owner_lc->knlc_knote == kn);
679 next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
680 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
681 #if DEBUG || DEVELOPMENT
682 next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
683 #endif
684 ut->uu_knlock = NULL;
685 thread_deallocate_safe(thread);
686 } else {
687 kn->kn_status &= ~KN_LOCKED;
688 }
689
690 if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
691 /*
692 * No f_event() in flight anymore, we can leave QoS "Merge" mode
693 *
694 * See knote_adjust_qos()
695 */
696 kn->kn_status &= ~KN_MERGE_QOS;
697 }
698 if (kqlocking == KNOTE_KQ_UNLOCK) {
699 kqunlock(kqu);
700 }
701 #if DEBUG || DEVELOPMENT
702 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
703 #endif
704 }
705
706 /*
707 * Aborts all waiters for a knote lock, and unlocks the knote.
708 *
709 * Called with the kqueue lock held.
710 *
711 * Returns with the kqueue unlocked.
712 */
713 static void
714 knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
715 struct knote_lock_ctx *knlc)
716 {
717 kqlock_held(kq);
718
719 assert(knlc->knlc_knote == kn);
720 assert(kn->kn_status & KN_LOCKED);
721 assert(kn->kn_status & KN_DROPPING);
722
723 LIST_REMOVE(knlc, knlc_link);
724 kn->kn_status &= ~KN_LOCKED;
725 kqunlock(kq);
726
727 if (knlc->knlc_waiters) {
728 wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
729 }
730 #if DEBUG || DEVELOPMENT
731 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
732 #endif
733 }
734
735 /*
736 * Call the f_event hook of a given filter.
737 *
738 * Takes a use count to protect against concurrent drops.
739 */
740 static void
741 knote_post(struct knote *kn, long hint)
742 {
743 struct kqueue *kq = knote_get_kq(kn);
744 int dropping, result;
745
746 kqlock(kq);
747
748 if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
749 return kqunlock(kq);
750 }
751
752 if (__improbable(kn->kn_status & KN_POSTING)) {
753 panic("KNOTE() called concurrently on knote %p", kn);
754 }
755
756 kn->kn_status |= KN_POSTING;
757
758 kqunlock(kq);
759 result = filter_call(knote_fops(kn), f_event(kn, hint));
760 kqlock(kq);
761
762 dropping = (kn->kn_status & KN_DROPPING);
763
764 if (!dropping && (result & FILTER_ACTIVE)) {
765 knote_activate(kq, kn, result);
766 }
767
768 if ((kn->kn_status & KN_LOCKED) == 0) {
769 /*
770 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
771 *
772 * See knote_adjust_qos()
773 */
774 kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
775 } else {
776 kn->kn_status &= ~KN_POSTING;
777 }
778
779 if (__improbable(dropping)) {
780 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, knote_post_wev64(kn),
781 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
782 }
783
784 kqunlock(kq);
785 }
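/*
 * Illustrative sketch (hypothetical driver, not part of this subsystem):
 * kernel code does not call knote_post() directly. Subsystems keep attached
 * knotes on a struct klist and publish events through the KNOTE() macro,
 * which walks the list and funnels each knote through knote_post() with the
 * subsystem's hint.
 */
#if 0
struct my_device_softc {		/* hypothetical driver state */
	struct klist	sc_note;	/* knotes added by the filter's f_attach */
};

static void
my_device_data_ready(struct my_device_softc *sc, long bytes_ready)
{
	/* each attached knote ends up in knote_post() -> f_event(kn, hint) */
	KNOTE(&sc->sc_note, bytes_ready);
}
#endif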
786
787 /*
788 * Called by knote_drop() to wait for the last f_event() caller to be done.
789 *
790 * - kq locked at entry
791 * - kq unlocked at exit
792 */
793 static void
794 knote_wait_for_post(struct kqueue *kq, struct knote *kn)
795 {
796 wait_result_t wr = THREAD_NOT_WAITING;
797
798 kqlock_held(kq);
799
800 assert(kn->kn_status & KN_DROPPING);
801
802 if (kn->kn_status & KN_POSTING) {
803 wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
804 knote_post_wev64(kn), THREAD_UNINT | THREAD_WAIT_NOREPORT,
805 TIMEOUT_WAIT_FOREVER);
806 }
807 kqunlock(kq);
808 if (wr == THREAD_WAITING) {
809 thread_block(THREAD_CONTINUE_NULL);
810 }
811 }
812
813 #pragma mark knote helpers for filters
814
815 OS_ALWAYS_INLINE
816 void
817 knote_set_error(struct knote *kn, int error)
818 {
819 kn->kn_flags |= EV_ERROR;
820 kn->kn_sdata = error;
821 }
822
823 OS_ALWAYS_INLINE
824 int64_t
825 knote_low_watermark(const struct knote *kn)
826 {
827 return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
828 }
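/*
 * Illustrative sketch (hypothetical userspace client): NOTE_LOWAT is how the
 * low watermark consumed by knote_low_watermark() gets set. The flag goes in
 * fflags and the byte threshold rides in the kevent's data field.
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
watch_socket_lowat(int kq, int sock_fd)
{
	struct kevent kev;

	/* only report the socket readable once at least 512 bytes are queued */
	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD | EV_CLEAR, NOTE_LOWAT, 512, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent");
	}
}
#endif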
829
830 /*!
831 * @function knote_fill_kevent_with_sdata
832 *
833 * @brief
834 * Fills in a kevent from the current content of a knote.
835 *
836 * @discussion
837 * This is meant to be called from filter's f_event hooks.
838 * The kevent data is filled with kn->kn_sdata.
839 *
840 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
841 *
842 * Using knote_fill_kevent is typically preferred.
843 */
844 OS_ALWAYS_INLINE
845 void
846 knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
847 {
848 #define knote_assert_aliases(name1, offs1, name2) \
849 static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
850 offsetof(struct kevent_internal_s, name2), \
851 "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
852 /*
853 * All the code makes assumptions on these aliasing,
854 * so make sure we fail the build if we ever ever ever break them.
855 */
856 knote_assert_aliases(ident, 0, kei_ident);
857 #ifdef __LITTLE_ENDIAN__
858 knote_assert_aliases(filter, 0, kei_filter); // non trivial overlap
859 knote_assert_aliases(filter, 1, kei_filtid); // non trivial overlap
860 #else
861 knote_assert_aliases(filter, 0, kei_filtid); // non trivial overlap
862 knote_assert_aliases(filter, 1, kei_filter); // non trivial overlap
863 #endif
864 knote_assert_aliases(flags, 0, kei_flags);
865 knote_assert_aliases(qos, 0, kei_qos);
866 knote_assert_aliases(udata, 0, kei_udata);
867 knote_assert_aliases(fflags, 0, kei_fflags);
868 knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
869 knote_assert_aliases(data, 0, kei_sdata); // non trivial overlap
870 knote_assert_aliases(ext, 0, kei_ext);
871 #undef knote_assert_aliases
872
873 /*
874 * Fix the differences between kevent_qos_s and kevent_internal_s:
875 * - xflags is where kn_sfflags lives, we need to zero it
876 * - fixup the high bits of `filter` where kn_filtid lives
877 */
878 *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
879 kev->xflags = 0;
880 kev->filter |= 0xff00;
881 if (kn->kn_flags & EV_CLEAR) {
882 kn->kn_fflags = 0;
883 }
884 }
885
886 /*!
887 * @function knote_fill_kevent
888 *
889 * @brief
890 * Fills in a kevent from the current content of a knote.
891 *
892 * @discussion
893 * This is meant to be called from filter's f_event hooks.
894 * The kevent data is filled with the passed in data.
895 *
896 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
897 */
898 OS_ALWAYS_INLINE
899 void
900 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
901 {
902 knote_fill_kevent_with_sdata(kn, kev);
903 kev->filter = kn->kn_filter;
904 kev->data = data;
905 }
906
907
908 #pragma mark file_filtops
909
910 static int
911 filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
912 {
913 return fo_kqfilter(kn->kn_fp, kn, kev);
914 }
915
916 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
917 .f_isfd = 1,
918 .f_attach = filt_fileattach,
919 };
920
921 #pragma mark kqread_filtops
922
923 #define f_flag fp_glob->fg_flag
924 #define f_ops fp_glob->fg_ops
925 #define f_data fp_glob->fg_data
926 #define f_lflags fp_glob->fg_lflags
927
928 static void
929 filt_kqdetach(struct knote *kn)
930 {
931 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
932 struct kqueue *kq = &kqf->kqf_kqueue;
933
934 kqlock(kq);
935 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
936 kqunlock(kq);
937 }
938
939 static int
940 filt_kqueue(struct knote *kn, __unused long hint)
941 {
942 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
943
944 return kq->kq_count > 0;
945 }
946
947 static int
948 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
949 {
950 #pragma unused(kev)
951 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
952 int res;
953
954 kqlock(kq);
955 res = (kq->kq_count > 0);
956 kqunlock(kq);
957
958 return res;
959 }
960
961 static int
962 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
963 {
964 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
965 int res = 0;
966
967 kqlock(kq);
968 if (kq->kq_count) {
969 knote_fill_kevent(kn, kev, kq->kq_count);
970 res = 1;
971 }
972 kqunlock(kq);
973
974 return res;
975 }
976
977 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
978 .f_isfd = 1,
979 .f_detach = filt_kqdetach,
980 .f_event = filt_kqueue,
981 .f_touch = filt_kqtouch,
982 .f_process = filt_kqprocess,
983 };
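/*
 * Illustrative sketch (hypothetical userspace client): kqread_filtops backs
 * EVFILT_READ on a kqueue file descriptor itself, so one kqueue can watch
 * another become readable; the returned data is the inner queue's pending
 * event count (kq_count).
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
watch_inner_kqueue(void)
{
	int outer = kqueue();
	int inner = kqueue();
	struct kevent kev;

	if (outer == -1 || inner == -1) {
		err(1, "kqueue");
	}

	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent(register)");
	}

	/* blocks until at least one event is pending on `inner`;
	 * kev.data then holds the number of pending events */
	(void)kevent(outer, NULL, 0, &kev, 1, NULL);
}
#endif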
984
985 #pragma mark proc_filtops
986
987 static int
988 filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
989 {
990 struct proc *p;
991
992 assert(PID_MAX < NOTE_PDATAMASK);
993
994 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
995 knote_set_error(kn, ENOTSUP);
996 return 0;
997 }
998
999 p = proc_find((int)kn->kn_id);
1000 if (p == NULL) {
1001 knote_set_error(kn, ESRCH);
1002 return 0;
1003 }
1004
1005 const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
1006
1007 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1008 do {
1009 pid_t selfpid = proc_selfpid();
1010
1011 if (p->p_ppid == selfpid) {
1012 break; /* parent => ok */
1013 }
1014 if ((p->p_lflag & P_LTRACED) != 0 &&
1015 (p->p_oppid == selfpid)) {
1016 break; /* parent-in-waiting => ok */
1017 }
1018 if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1019 break; /* allowed to signal => ok */
1020 }
1021 proc_rele(p);
1022 knote_set_error(kn, EACCES);
1023 return 0;
1024 } while (0);
1025 }
1026
1027 kn->kn_proc = p;
1028 kn->kn_flags |= EV_CLEAR; /* automatically set */
1029 kn->kn_sdata = 0; /* incoming data is ignored */
1030
1031 proc_klist_lock();
1032
1033 KNOTE_ATTACH(&p->p_klist, kn);
1034
1035 proc_klist_unlock();
1036
1037 proc_rele(p);
1038
1039 /*
1040 * only captures edge-triggered events after this point
1041 * so it can't already be fired.
1042 */
1043 return 0;
1044 }
1045
1046
1047 /*
1048 * The knote may be attached to a different process, which may exit,
1049 * leaving nothing for the knote to be attached to. In that case,
1050 * the pointer to the process will have already been nulled out.
1051 */
1052 static void
1053 filt_procdetach(struct knote *kn)
1054 {
1055 struct proc *p;
1056
1057 proc_klist_lock();
1058
1059 p = kn->kn_proc;
1060 if (p != PROC_NULL) {
1061 kn->kn_proc = PROC_NULL;
1062 KNOTE_DETACH(&p->p_klist, kn);
1063 }
1064
1065 proc_klist_unlock();
1066 }
1067
1068 static int
1069 filt_procevent(struct knote *kn, long hint)
1070 {
1071 u_int event;
1072
1073 /* ALWAYS CALLED WITH proc_klist_lock */
1074
1075 /*
1076 * Note: a lot of bits in hint may be obtained from the knote
1077 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1078 * bits in hint for filt_procevent
1079 *
1080 * mask off extra data
1081 */
1082 event = (u_int)hint & NOTE_PCTRLMASK;
1083
1084 /*
1085 * termination lifecycle events can happen while a debugger
1086 * has reparented a process, in which case notifications
1087 * should be quashed except to the tracing parent. When
1088 * the debugger reaps the child (either via wait4(2) or
1089 * process exit), the child will be reparented to the original
1090 * parent and these knotes re-fired.
1091 */
1092 if (event & NOTE_EXIT) {
1093 if ((kn->kn_proc->p_oppid != 0)
1094 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_proc->p_ppid)) {
1095 /*
1096 * This knote is not for the current ptrace(2) parent, ignore.
1097 */
1098 return 0;
1099 }
1100 }
1101
1102 /*
1103 * if the user is interested in this event, record it.
1104 */
1105 if (kn->kn_sfflags & event) {
1106 kn->kn_fflags |= event;
1107 }
1108
1109 #pragma clang diagnostic push
1110 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1111 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1112 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1113 }
1114 #pragma clang diagnostic pop
1115
1116
1117 /*
1118 * The kernel has a wrapper in place that returns the same data
1119 * as is collected here, in kn_hook32. Any changes to how
1120 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1121 * should also be reflected in the proc_pidnoteexit() wrapper.
1122 */
1123 if (event == NOTE_EXIT) {
1124 kn->kn_hook32 = 0;
1125 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1126 kn->kn_fflags |= NOTE_EXITSTATUS;
1127 kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
1128 }
1129 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1130 kn->kn_fflags |= NOTE_EXIT_DETAIL;
1131 if ((kn->kn_proc->p_lflag &
1132 P_LTERM_DECRYPTFAIL) != 0) {
1133 kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
1134 }
1135 if ((kn->kn_proc->p_lflag &
1136 P_LTERM_JETSAM) != 0) {
1137 kn->kn_hook32 |= NOTE_EXIT_MEMORY;
1138 switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1139 case P_JETSAM_VMPAGESHORTAGE:
1140 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1141 break;
1142 case P_JETSAM_VMTHRASHING:
1143 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
1144 break;
1145 case P_JETSAM_FCTHRASHING:
1146 kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
1147 break;
1148 case P_JETSAM_VNODE:
1149 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
1150 break;
1151 case P_JETSAM_HIWAT:
1152 kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
1153 break;
1154 case P_JETSAM_PID:
1155 kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
1156 break;
1157 case P_JETSAM_IDLEEXIT:
1158 kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
1159 break;
1160 }
1161 }
1162 if ((kn->kn_proc->p_csflags &
1163 CS_KILLED) != 0) {
1164 kn->kn_hook32 |= NOTE_EXIT_CSERROR;
1165 }
1166 }
1167 }
1168
1169 /* if we have any matching state, activate the knote */
1170 return kn->kn_fflags != 0;
1171 }
1172
1173 static int
1174 filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1175 {
1176 int res;
1177
1178 proc_klist_lock();
1179
1180 /* accept new filter flags and mask off output events no longer interesting */
1181 kn->kn_sfflags = kev->fflags;
1182
1183 /* restrict the current results to the (smaller?) set of new interest */
1184 /*
1185 * For compatibility with previous implementations, we leave kn_fflags
1186 * as they were before.
1187 */
1188 //kn->kn_fflags &= kn->kn_sfflags;
1189
1190 res = (kn->kn_fflags != 0);
1191
1192 proc_klist_unlock();
1193
1194 return res;
1195 }
1196
1197 static int
1198 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1199 {
1200 int res = 0;
1201
1202 proc_klist_lock();
1203 if (kn->kn_fflags) {
1204 knote_fill_kevent(kn, kev, kn->kn_hook32);
1205 kn->kn_hook32 = 0;
1206 res = 1;
1207 }
1208 proc_klist_unlock();
1209 return res;
1210 }
1211
1212 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1213 .f_attach = filt_procattach,
1214 .f_detach = filt_procdetach,
1215 .f_event = filt_procevent,
1216 .f_touch = filt_proctouch,
1217 .f_process = filt_procprocess,
1218 };
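/*
 * Illustrative sketch (hypothetical userspace client): watching a child with
 * the proc filter above. NOTE_EXITSTATUS makes filt_procevent() stash the
 * wait(2)-style status in the hook so it is returned in kev.data, and the
 * attach-time checks in filt_procattach() require the caller to be the
 * parent (or otherwise allowed to signal the target).
 */
#if 0
#include <sys/event.h>
#include <sys/wait.h>
#include <unistd.h>
#include <err.h>

static void
watch_child_exit(int kq, pid_t child)
{
	struct kevent kev;

	EV_SET(&kev, child, EVFILT_PROC, EV_ADD | EV_ONESHOT,
	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent(register)");
	}

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1 && (kev.fflags & NOTE_EXIT)) {
		int status = (int)kev.data;	/* wait(2)-style status word */
		if (WIFEXITED(status)) {
			/* WEXITSTATUS(status) is the child's exit code */
		}
	}
}
#endif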
1219
1220 #pragma mark timer_filtops
1221
1222 struct filt_timer_params {
1223 uint64_t deadline; /* deadline in abs/cont time
1224 * (or 0 if NOTE_ABSOLUTE and deadline is in past) */
1225 uint64_t leeway; /* leeway in abstime, or 0 if none */
1226 uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
1227 };
1228
1229 /*
1230 * Values stored in the knote at rest (using Mach absolute time units)
1231 *
1232 * kn->kn_thcall where the thread_call object is stored
1233 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1234 * kn->kn_ext[1] leeway value
1235 * kn->kn_sdata interval timer: the interval
1236 * absolute/deadline timer: 0
1237 * kn->kn_hook32 timer state (with gencount)
1238 *
1239 * TIMER_IDLE:
1240 * The timer has either never been scheduled or been cancelled.
1241 * It is safe to schedule a new one in this state.
1242 *
1243 * TIMER_ARMED:
1244 * The timer has been scheduled
1245 *
1246 * TIMER_FIRED
1247 * The timer has fired and an event needs to be delivered.
1248 * When in this state, the callout may still be running.
1249 *
1250 * TIMER_IMMEDIATE
1251 * The timer has fired at registration time, and the callout was never
1252 * dispatched.
1253 */
1254 #define TIMER_IDLE 0x0
1255 #define TIMER_ARMED 0x1
1256 #define TIMER_FIRED 0x2
1257 #define TIMER_IMMEDIATE 0x3
1258 #define TIMER_STATE_MASK 0x3
1259 #define TIMER_GEN_INC 0x4
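/*
 * Illustrative sketch of the kn_hook32 encoding above: the low two bits carry
 * the TIMER_* state, and everything above them is a generation counter bumped
 * by TIMER_GEN_INC on every re-arm. A late filt_timerexpire() callout armed
 * with an older value fails its compare-and-swap and is ignored.
 */
#if 0
static uint32_t
timer_rearm_state(uint32_t cur)
{
	/* the same transition filt_timerarm() performs: keep the generation
	 * bits, bump the generation, and set the state bits to ARMED */
	return (cur & ~TIMER_STATE_MASK) + TIMER_GEN_INC + TIMER_ARMED;
}

/* e.g. 0x0 (gen 0, IDLE)  -> 0x5 (gen 1, ARMED)
 *      0x6 (gen 1, FIRED) -> 0x9 (gen 2, ARMED) */
#endif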
1260
1261 static void
1262 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1263 {
1264 kn->kn_ext[0] = params->deadline;
1265 kn->kn_ext[1] = params->leeway;
1266 kn->kn_sdata = params->interval;
1267 }
1268
1269 /*
1270 * filt_timervalidate - process data from user
1271 *
1272 * Sets up the deadline, interval, and leeway from the provided user data
1273 *
1274 * Input:
1275 * kn_sdata timer deadline or interval time
1276 * kn_sfflags style of timer, unit of measurement
1277 *
1278 * Output:
1279 * struct filter_timer_params to apply to the filter with
1280 * filt_timer_set_params when changes are ready to be committed.
1281 *
1282 * Returns:
1283 * EINVAL Invalid user data parameters
1284 * ERANGE Various overflows with the parameters
1285 *
1286 * Called with timer filter lock held.
1287 */
1288 static int
1289 filt_timervalidate(const struct kevent_qos_s *kev,
1290 struct filt_timer_params *params)
1291 {
1292 /*
1293 * There are 5 knobs that need to be chosen for a timer registration:
1294 *
1295 * A) Units of time (what is the time duration of the specified number)
1296 * Absolute and interval take:
1297 * NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1298 * Defaults to milliseconds if not specified
1299 *
1300 * B) Clock epoch (what is the zero point of the specified number)
1301 * For interval, there is none
1302 * For absolute, defaults to the gettimeofday/calendar epoch
1303 * With NOTE_MACHTIME, uses mach_absolute_time()
1304 * With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1305 *
1306 * C) The knote's behavior on delivery
1307 * Interval timer causes the knote to arm for the next interval unless one-shot is set
1308 * Absolute is a forced one-shot timer which deletes on delivery
1309 * TODO: Add a way for absolute to be not forced one-shot
1310 *
1311 * D) Whether the time duration is relative to now or absolute
1312 * Interval fires at now + duration when it is set up
1313 * Absolute fires at now + difference between now walltime and passed in walltime
1314 * With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1315 *
1316 * E) Whether the timer continues to tick across sleep
1317 * By default all three do not.
1318 * For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1319 * With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1320 * expires when mach_continuous_time() is > the passed in value.
1321 */
1322
1323 uint64_t multiplier;
1324
1325 boolean_t use_abstime = FALSE;
1326
1327 switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
1328 case NOTE_SECONDS:
1329 multiplier = NSEC_PER_SEC;
1330 break;
1331 case NOTE_USECONDS:
1332 multiplier = NSEC_PER_USEC;
1333 break;
1334 case NOTE_NSECONDS:
1335 multiplier = 1;
1336 break;
1337 case NOTE_MACHTIME:
1338 multiplier = 0;
1339 use_abstime = TRUE;
1340 break;
1341 case 0: /* milliseconds (default) */
1342 multiplier = NSEC_PER_SEC / 1000;
1343 break;
1344 default:
1345 return EINVAL;
1346 }
1347
1348 /* transform the leeway in kn_ext[1] to same time scale */
1349 if (kev->fflags & NOTE_LEEWAY) {
1350 uint64_t leeway_abs;
1351
1352 if (use_abstime) {
1353 leeway_abs = (uint64_t)kev->ext[1];
1354 } else {
1355 uint64_t leeway_ns;
1356 if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
1357 return ERANGE;
1358 }
1359
1360 nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1361 }
1362
1363 params->leeway = leeway_abs;
1364 } else {
1365 params->leeway = 0;
1366 }
1367
1368 if (kev->fflags & NOTE_ABSOLUTE) {
1369 uint64_t deadline_abs;
1370
1371 if (use_abstime) {
1372 deadline_abs = (uint64_t)kev->data;
1373 } else {
1374 uint64_t calendar_deadline_ns;
1375
1376 if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1377 return ERANGE;
1378 }
1379
1380 /* calendar_deadline_ns is in nanoseconds since the epoch */
1381
1382 clock_sec_t seconds;
1383 clock_nsec_t nanoseconds;
1384
1385 /*
1386 * Note that the conversion through wall-time is only done once.
1387 *
1388 * If the relationship between MAT and gettimeofday changes,
1389 * the underlying timer does not update.
1390 *
1391 * TODO: build a wall-time denominated timer_call queue
1392 * and a flag to request DTRTing with wall-time timers
1393 */
1394 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1395
1396 uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1397
1398 /* if deadline is in the future */
1399 if (calendar_now_ns < calendar_deadline_ns) {
1400 uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1401 uint64_t interval_abs;
1402
1403 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1404
1405 /*
1406 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1407 * causes the timer to keep ticking across sleep, but
1408 * it does not change the calendar timebase.
1409 */
1410
1411 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1412 clock_continuoustime_interval_to_deadline(interval_abs,
1413 &deadline_abs);
1414 } else {
1415 clock_absolutetime_interval_to_deadline(interval_abs,
1416 &deadline_abs);
1417 }
1418 } else {
1419 deadline_abs = 0; /* cause immediate expiration */
1420 }
1421 }
1422
1423 params->deadline = deadline_abs;
1424 params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
1425 } else if (kev->data < 0) {
1426 /*
1427 * Negative interval timers fire immediately, once.
1428 *
1429 * Ideally a negative interval would be an error, but certain clients
1430 * pass negative values by accident, and expect an event back.
1431 *
1432 * In the old implementation the timer would repeat with no delay
1433 * N times until mach_absolute_time() + (N * interval) underflowed,
1434 * then it would wait ~forever by accidentally arming a timer for the far future.
1435 *
1436 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1437 */
1438
1439 params->deadline = 0; /* expire immediately */
1440 params->interval = 0; /* non-repeating */
1441 } else {
1442 uint64_t interval_abs = 0;
1443
1444 if (use_abstime) {
1445 interval_abs = (uint64_t)kev->data;
1446 } else {
1447 uint64_t interval_ns;
1448 if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1449 return ERANGE;
1450 }
1451
1452 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1453 }
1454
1455 uint64_t deadline = 0;
1456
1457 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1458 clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1459 } else {
1460 clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1461 }
1462
1463 params->deadline = deadline;
1464 params->interval = interval_abs;
1465 }
1466
1467 return 0;
1468 }
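/*
 * Illustrative sketch (hypothetical userspace client) of the NOTE_ABSOLUTE
 * path validated above: the deadline is expressed against the calendar
 * (gettimeofday) epoch in the unit selected by fflags, and is converted once
 * into a Mach-absolute deadline, so later wall-clock changes do not move it.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>

static void
arm_absolute_timer(int kq)
{
	struct timeval now;
	struct kevent kev;

	gettimeofday(&now, NULL);

	/* fire 30 wall-clock seconds from now; NOTE_ABSOLUTE implies
	 * EV_ONESHOT (see filt_timerattach()) */
	EV_SET(&kev, 1 /* arbitrary ident */, EVFILT_TIMER, EV_ADD,
	    NOTE_ABSOLUTE | NOTE_SECONDS, now.tv_sec + 30, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent");
	}
}
#endif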
1469
1470 /*
1471 * filt_timerexpire - the timer callout routine
1472 */
1473 static void
1474 filt_timerexpire(void *knx, void *state_on_arm)
1475 {
1476 struct knote *kn = knx;
1477
1478 uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1479 uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1480
1481 if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1482 // our f_event always would say FILTER_ACTIVE,
1483 // so be leaner and just do it.
1484 struct kqueue *kq = knote_get_kq(kn);
1485 kqlock(kq);
1486 knote_activate(kq, kn, FILTER_ACTIVE);
1487 kqunlock(kq);
1488 } else {
1489 /*
1490 * The timer has been reprogrammed or canceled since it was armed,
1491 * and this is a late firing for the timer, just ignore it.
1492 */
1493 }
1494 }
1495
1496 /*
1497 * Does this deadline need a timer armed for it, or has it expired?
1498 */
1499 static bool
1500 filt_timer_is_ready(struct knote *kn)
1501 {
1502 uint64_t now, deadline = kn->kn_ext[0];
1503
1504 if (deadline == 0) {
1505 return true;
1506 }
1507
1508 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1509 now = mach_continuous_time();
1510 } else {
1511 now = mach_absolute_time();
1512 }
1513 return deadline <= now;
1514 }
1515
1516 /*
1517 * Arm a timer
1518 *
1519 * It is the responsibility of the caller to make sure the timer call
1520 * has completed or been cancelled properly prior to arming it.
1521 */
1522 static void
1523 filt_timerarm(struct knote *kn)
1524 {
1525 uint64_t deadline = kn->kn_ext[0];
1526 uint64_t leeway = kn->kn_ext[1];
1527 uint32_t state;
1528
1529 int filter_flags = kn->kn_sfflags;
1530 unsigned int timer_flags = 0;
1531
1532 if (filter_flags & NOTE_CRITICAL) {
1533 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1534 } else if (filter_flags & NOTE_BACKGROUND) {
1535 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1536 } else {
1537 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1538 }
1539
1540 if (filter_flags & NOTE_LEEWAY) {
1541 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1542 }
1543
1544 if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1545 timer_flags |= THREAD_CALL_CONTINUOUS;
1546 }
1547
1548 /*
1549 * Move to ARMED.
1550 *
1551 * We increase the gencount, and setup the thread call with this expected
1552 * state. It means that if there was a previous generation of the timer in
1553 * flight that needs to be ignored, then 3 things are possible:
1554 *
1555 * - the timer fires first, filt_timerexpire() runs and sets the state to FIRED
1556 * but we clobber it with ARMED and a new gencount. The knote will still
1557 * be activated, but filt_timerprocess() which is serialized with this
1558 * call will not see the FIRED bit set and will not deliver an event.
1559 *
1560 * - this code runs first, but filt_timerexpire() comes second. Because it
1561 * knows an old gencount, it will debounce and not activate the knote.
1562 *
1563 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1564 * will just cancel it properly.
1565 *
1566 * This is important as userspace expects to never be woken up for past
1567 * timers after filt_timertouch ran.
1568 */
1569 state = os_atomic_load(&kn->kn_hook32, relaxed);
1570 state &= ~TIMER_STATE_MASK;
1571 state += TIMER_GEN_INC + TIMER_ARMED;
1572 os_atomic_store(&kn->kn_hook32, state, relaxed);
1573
1574 thread_call_enter_delayed_with_leeway(kn->kn_thcall,
1575 (void *)(uintptr_t)state, deadline, leeway, timer_flags);
1576 }
1577
1578 /*
1579 * Mark a timer as "already fired" when it is being reprogrammed
1580 *
1581 * If there is a timer in flight, this will do a best effort at canceling it,
1582 * but will not wait. If the thread call was in flight, having set the
1583 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1584 * cancelation.
1585 */
1586 static void
1587 filt_timerfire_immediate(struct knote *kn)
1588 {
1589 uint32_t state;
1590
1591 static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1592 "validate that this atomic or will transition to IMMEDIATE");
1593 state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1594
1595 if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1596 thread_call_cancel(kn->kn_thcall);
1597 }
1598 }
1599
1600 /*
1601 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1602 */
1603 static int
1604 filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
1605 {
1606 thread_call_t callout;
1607 struct filt_timer_params params;
1608 int error;
1609
1610 if ((error = filt_timervalidate(kev, &params)) != 0) {
1611 knote_set_error(kn, error);
1612 return 0;
1613 }
1614
1615 callout = thread_call_allocate_with_options(filt_timerexpire,
1616 (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1617 THREAD_CALL_OPTIONS_ONCE);
1618
1619 if (NULL == callout) {
1620 knote_set_error(kn, ENOMEM);
1621 return 0;
1622 }
1623
1624 filt_timer_set_params(kn, &params);
1625 kn->kn_thcall = callout;
1626 kn->kn_flags |= EV_CLEAR;
1627 os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1628
1629 /* NOTE_ABSOLUTE implies EV_ONESHOT */
1630 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1631 kn->kn_flags |= EV_ONESHOT;
1632 }
1633
1634 if (filt_timer_is_ready(kn)) {
1635 os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1636 return FILTER_ACTIVE;
1637 } else {
1638 filt_timerarm(kn);
1639 return 0;
1640 }
1641 }
1642
1643 /*
1644 * Shut down the timer if it's running, and free the callout.
1645 */
1646 static void
1647 filt_timerdetach(struct knote *kn)
1648 {
1649 __assert_only boolean_t freed;
1650
1651 /*
1652 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1653 * running anymore.
1654 */
1655 thread_call_cancel_wait(kn->kn_thcall);
1656 freed = thread_call_free(kn->kn_thcall);
1657 assert(freed);
1658 }
1659
1660 /*
1661 * filt_timertouch - update timer knote with new user input
1662 *
1663 * Cancel and restart the timer based on new user data. When
1664 * the user picks up a knote, clear the count of how many timer
1665 * pops have gone off (in kn_data).
1666 */
1667 static int
1668 filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
1669 {
1670 struct filt_timer_params params;
1671 uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1672 int error;
1673
1674 if (changed_flags & NOTE_ABSOLUTE) {
1675 kev->flags |= EV_ERROR;
1676 kev->data = EINVAL;
1677 return 0;
1678 }
1679
1680 if ((error = filt_timervalidate(kev, &params)) != 0) {
1681 kev->flags |= EV_ERROR;
1682 kev->data = error;
1683 return 0;
1684 }
1685
1686 /* capture the new values used to compute deadline */
1687 filt_timer_set_params(kn, &params);
1688 kn->kn_sfflags = kev->fflags;
1689
1690 if (filt_timer_is_ready(kn)) {
1691 filt_timerfire_immediate(kn);
1692 return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
1693 } else {
1694 filt_timerarm(kn);
1695 return FILTER_UPDATE_REQ_QOS;
1696 }
1697 }
1698
1699 /*
1700 * filt_timerprocess - query state of knote and snapshot event data
1701 *
1702 * Determine if the timer has fired in the past, snapshot the state
1703 * of the kevent for returning to user-space, and clear pending event
1704 * counters for the next time.
1705 */
1706 static int
1707 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1708 {
1709 uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1710
1711 /*
1712 * filt_timerprocess is serialized with any filter routine except for
1713 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1714 * transition, and on success, activates the knote.
1715 *
1716 * Hence, we don't need atomic modifications of the state, only to peek at
1717 * whether we see any of the "FIRED" state, and if we do, it is safe to
1718 * do simple state machine transitions.
1719 */
1720 switch (state & TIMER_STATE_MASK) {
1721 case TIMER_IDLE:
1722 case TIMER_ARMED:
1723 /*
1724 * This can happen if a touch resets a timer that had fired
1725 * without being processed
1726 */
1727 return 0;
1728 }
1729
1730 os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1731
1732 /*
1733 * Copy out the interesting kevent state,
1734 * but don't leak out the raw time calculations.
1735 *
1736 * TODO: potential enhancements - tell the user about:
1737 * - deadline to which this timer thought it was expiring
1738 * - return kn_sfflags in the fflags field so the client can know
1739 * under what flags the timer fired
1740 */
1741 knote_fill_kevent(kn, kev, 1);
1742 kev->ext[0] = 0;
1743 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1744
1745 if (kn->kn_sdata != 0) {
1746 /*
1747 * This is a 'repeating' timer, so we have to emit
1748 * how many intervals expired between the arm
1749 * and the process.
1750 *
1751 * A very strange style of interface, because
1752 * this could easily be done in the client...
1753 */
1754
1755 uint64_t now;
1756
1757 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1758 now = mach_continuous_time();
1759 } else {
1760 now = mach_absolute_time();
1761 }
1762
1763 uint64_t first_deadline = kn->kn_ext[0];
1764 uint64_t interval_abs = kn->kn_sdata;
1765 uint64_t orig_arm_time = first_deadline - interval_abs;
1766
1767 assert(now > orig_arm_time);
1768 assert(now > first_deadline);
1769
1770 uint64_t elapsed = now - orig_arm_time;
1771
1772 uint64_t num_fired = elapsed / interval_abs;
1773
1774 /*
1775 * To reach this code, we must have seen the timer pop
1776 * and be in repeating mode, so therefore it must have been
1777 * more than 'interval' time since the attach or last
1778 * successful touch.
1779 */
1780 assert(num_fired > 0);
1781
1782 /* report how many intervals have elapsed to the user */
1783 kev->data = (int64_t)num_fired;
1784
1785 /* We only need to re-arm the timer if it's not about to be destroyed */
1786 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1787 /* fire at the end of the next interval */
1788 uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1789
1790 assert(new_deadline > now);
1791
1792 kn->kn_ext[0] = new_deadline;
1793
1794 /*
1795 * This can't shortcut setting up the thread call, because
1796 * knote_process deactivates EV_CLEAR knotes unconditionally.
1797 */
1798 filt_timerarm(kn);
1799 }
1800 }
1801
1802 return FILTER_ACTIVE;
1803 }
1804
1805 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1806 .f_extended_codes = true,
1807 .f_attach = filt_timerattach,
1808 .f_detach = filt_timerdetach,
1809 .f_event = filt_bad_event,
1810 .f_touch = filt_timertouch,
1811 .f_process = filt_timerprocess,
1812 };
1813
1814 #pragma mark user_filtops
1815
1816 static int
1817 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1818 {
1819 if (kn->kn_sfflags & NOTE_TRIGGER) {
1820 kn->kn_hook32 = FILTER_ACTIVE;
1821 } else {
1822 kn->kn_hook32 = 0;
1823 }
1824 return kn->kn_hook32;
1825 }
1826
1827 static int
1828 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1829 {
1830 uint32_t ffctrl;
1831 int fflags;
1832
1833 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1834 fflags = kev->fflags & NOTE_FFLAGSMASK;
1835 switch (ffctrl) {
1836 case NOTE_FFNOP:
1837 break;
1838 case NOTE_FFAND:
1839 kn->kn_sfflags &= fflags;
1840 break;
1841 case NOTE_FFOR:
1842 kn->kn_sfflags |= fflags;
1843 break;
1844 case NOTE_FFCOPY:
1845 kn->kn_sfflags = fflags;
1846 break;
1847 }
1848 kn->kn_sdata = kev->data;
1849
1850 if (kev->fflags & NOTE_TRIGGER) {
1851 kn->kn_hook32 = FILTER_ACTIVE;
1852 }
1853 return (int)kn->kn_hook32;
1854 }
1855
1856 static int
1857 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1858 {
1859 int result = (int)kn->kn_hook32;
1860
1861 if (result) {
1862 /* EVFILT_USER returns the data that was passed in */
1863 knote_fill_kevent_with_sdata(kn, kev);
1864 kev->fflags = kn->kn_sfflags;
1865 if (kn->kn_flags & EV_CLEAR) {
1866 /* knote_fill_kevent cleared kn_fflags */
1867 kn->kn_hook32 = 0;
1868 }
1869 }
1870
1871 return result;
1872 }
1873
1874 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1875 .f_extended_codes = true,
1876 .f_attach = filt_userattach,
1877 .f_detach = filt_no_detach,
1878 .f_event = filt_bad_event,
1879 .f_touch = filt_usertouch,
1880 .f_process = filt_userprocess,
1881 };
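/*
 * Illustrative userspace sketch (editorial addition, not part of this file):
 * an EVFILT_USER round trip.  filt_userattach() leaves the knote inactive,
 * filt_usertouch() latches FILTER_ACTIVE when NOTE_TRIGGER comes in, and
 * filt_userprocess() delivers the event with the accumulated fflags.
 */
#if 0
#include <sys/event.h>
#include <assert.h>
#include <unistd.h>

static void
user_event_example(void)
{
	int kq = kqueue();
	struct kevent kev;

	/* attach an inactive user event */
	EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* trigger it, OR-ing an application-defined flag bit into fflags */
	EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* the event is now deliverable, carrying the accumulated flag bit */
	assert(kevent(kq, NULL, 0, &kev, 1, NULL) == 1);
	assert(kev.fflags & 0x1);

	close(kq);
}
#endif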
1882
1883 #pragma mark workloop_filtops
1884
1885 #define EPREEMPTDISABLED (-1)
1886
1887 static inline void
1888 filt_wllock(struct kqworkloop *kqwl)
1889 {
1890 lck_spin_lock(&kqwl->kqwl_statelock);
1891 }
1892
1893 static inline void
1894 filt_wlunlock(struct kqworkloop *kqwl)
1895 {
1896 lck_spin_unlock(&kqwl->kqwl_statelock);
1897 }
1898
1899 /*
1900 * Returns true when the interlock for the turnstile is the workqueue lock
1901 *
1902 * When this is the case, all turnstiles operations are delegated
1903 * to the workqueue subsystem.
1904 *
1905 * This is required because kqueue_threadreq_bind_prepost only holds the
1906 * workqueue lock but needs to move the inheritor from the workloop turnstile
1907 * away from the creator thread, so that this now fulfilled request cannot be
1908 * picked anymore by other threads.
1909 */
1910 static inline bool
1911 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
1912 {
1913 return kqr_thread_requested_pending(&kqwl->kqwl_request);
1914 }
1915
1916 static void
1917 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
1918 turnstile_update_flags_t flags)
1919 {
1920 turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
1921 workq_threadreq_t kqr = &kqwl->kqwl_request;
1922
1923 /*
1924 * binding to the workq should always happen through
1925 * workq_kern_threadreq_update_inheritor()
1926 */
1927 assert(!filt_wlturnstile_interlock_is_workq(kqwl));
1928
1929 if ((inheritor = kqwl->kqwl_owner)) {
1930 flags |= TURNSTILE_INHERITOR_THREAD;
1931 } else if ((inheritor = kqr_thread(kqr))) {
1932 flags |= TURNSTILE_INHERITOR_THREAD;
1933 }
1934
1935 turnstile_update_inheritor(ts, inheritor, flags);
1936 }
1937
1938 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
1939 #define FILT_WLATTACH 0
1940 #define FILT_WLTOUCH 1
1941 #define FILT_WLDROP 2
1942
1943 __result_use_check
1944 static int
1945 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
1946 struct kevent_qos_s *kev, kq_index_t qos_index, int op)
1947 {
1948 user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
1949 workq_threadreq_t kqr = &kqwl->kqwl_request;
1950 thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
1951 kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
1952 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
1953 int action = KQWL_UTQ_NONE, error = 0;
1954 bool wl_inheritor_updated = false, needs_wake = false;
1955 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
1956 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
1957 uint64_t udata = 0;
1958 struct turnstile *ts = TURNSTILE_NULL;
1959
1960 filt_wllock(kqwl);
1961
1962 again:
1963 new_owner = cur_owner = kqwl->kqwl_owner;
1964
1965 /*
1966 * Phase 1:
1967 *
1968 * If asked, load the uint64 value at the user provided address and compare
1969 * it against the passed in mask and expected value.
1970 *
1971 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
1972 * a thread reference.
1973 *
1974 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
1975 * the current thread, then end ownership.
1976 *
1977 * Lastly decide whether we need to perform a QoS update.
1978 */
1979 if (uaddr) {
1980 /*
1981 * Until <rdar://problem/24999882> exists,
1982 * copyin with preemption disabled forces any
1983 * vm_fault we encounter to fail.
1984 */
1985 error = copyin_atomic64(uaddr, &udata);
1986
1987 /*
1988 * If we get EFAULT, drop locks, and retry.
1989 * If we still get an error report it,
1990 * else assume the memory has been faulted
1991 * and attempt to copyin under lock again.
1992 */
1993 switch (error) {
1994 case 0:
1995 break;
1996 case EFAULT:
1997 if (efault_retry-- > 0) {
1998 filt_wlunlock(kqwl);
1999 error = copyin_atomic64(uaddr, &udata);
2000 filt_wllock(kqwl);
2001 if (error == 0) {
2002 goto again;
2003 }
2004 }
2005 OS_FALLTHROUGH;
2006 default:
2007 goto out;
2008 }
2009
2010 /* Update state as copied in. */
2011 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2012
2013 if ((udata & mask) != (kdata & mask)) {
2014 error = ESTALE;
2015 } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2016 /*
2017 * Decipher the owner port name, and translate accordingly.
2018 * The low 2 bits were borrowed for other flags, so mask them off.
2019 *
2020 * Then attempt translation to a thread reference or fail.
2021 */
2022 mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
2023 if (name != MACH_PORT_NULL) {
2024 name = ipc_entry_name_mask(name);
2025 extra_thread_ref = port_name_to_thread(name,
2026 PORT_TO_THREAD_IN_CURRENT_TASK);
2027 if (extra_thread_ref == THREAD_NULL) {
2028 error = EOWNERDEAD;
2029 goto out;
2030 }
2031 new_owner = extra_thread_ref;
2032 }
2033 }
2034 }
2035
2036 if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2037 new_owner = THREAD_NULL;
2038 }
2039
2040 if (error == 0) {
2041 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2042 action = KQWL_UTQ_SET_QOS_INDEX;
2043 } else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2044 action = KQWL_UTQ_SET_QOS_INDEX;
2045 }
2046
2047 if (op == FILT_WLTOUCH) {
2048 /*
2049 * Save off any additional fflags/data we just accepted,
2050 * but only keep the last round of "update" bits we acted on,
2051 * which helps a lot when debugging.
2052 */
2053 kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2054 kn->kn_sfflags |= kev->fflags;
2055 if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2056 needs_wake = (kn->kn_thread != THREAD_NULL);
2057 }
2058 } else if (op == FILT_WLDROP) {
2059 if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2060 NOTE_WL_SYNC_WAIT) {
2061 /*
2062 * When deleting a SYNC_WAIT knote that hasn't been woken up
2063 * explicitly, issue a wake up.
2064 */
2065 kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2066 needs_wake = (kn->kn_thread != THREAD_NULL);
2067 }
2068 }
2069 }
2070
2071 /*
2072 * Phase 2:
2073 *
2074 * Commit ownership and QoS changes if any, possibly wake up waiters
2075 */
2076
2077 if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2078 goto out;
2079 }
2080
2081 kqlock(kqwl);
2082
2083 /* If already tracked as servicer, don't track as owner */
2084 if (new_owner == kqr_thread(kqr)) {
2085 new_owner = THREAD_NULL;
2086 }
2087
2088 if (cur_owner != new_owner) {
2089 kqwl->kqwl_owner = new_owner;
2090 if (new_owner == extra_thread_ref) {
2091 /* we just transferred this ref to kqwl_owner */
2092 extra_thread_ref = THREAD_NULL;
2093 }
2094 cur_override = kqworkloop_override(kqwl);
2095
2096 if (new_owner) {
2097 /* override it before we drop the old */
2098 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2099 thread_add_kevent_override(new_owner, cur_override);
2100 }
2101 if (kqr_thread_requested_pending(kqr)) {
2102 if (action == KQWL_UTQ_NONE) {
2103 action = KQWL_UTQ_REDRIVE_EVENTS;
2104 }
2105 }
2106 } else {
2107 if (!kqr_thread_requested(kqr) && kqr->tr_kq_wakeup) {
2108 if (action == KQWL_UTQ_NONE) {
2109 action = KQWL_UTQ_REDRIVE_EVENTS;
2110 }
2111 }
2112 }
2113 }
2114
2115 if (action != KQWL_UTQ_NONE) {
2116 kqworkloop_update_threads_qos(kqwl, action, qos_index);
2117 }
2118
2119 ts = kqwl->kqwl_turnstile;
2120 if (cur_owner != new_owner && ts) {
2121 if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2122 /*
2123 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2124 * the code went through workq_kern_threadreq_initiate()
2125 * and the workqueue has set the inheritor already
2126 */
2127 assert(filt_wlturnstile_interlock_is_workq(kqwl));
2128 } else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2129 workq_kern_threadreq_lock(kqwl->kqwl_p);
2130 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
2131 ts, TURNSTILE_IMMEDIATE_UPDATE);
2132 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2133 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2134 /*
2135 * If the workq is no longer the interlock, then
2136 * workq_kern_threadreq_update_inheritor() has finished a bind
2137 * and we need to fallback to the regular path.
2138 */
2139 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2140 }
2141 wl_inheritor_updated = true;
2142 } else {
2143 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2144 wl_inheritor_updated = true;
2145 }
2146
2147 /*
2148 * We need a turnstile reference because we are dropping the interlock
2149 * and the caller has not called turnstile_prepare.
2150 */
2151 if (wl_inheritor_updated) {
2152 turnstile_reference(ts);
2153 }
2154 }
2155
2156 if (needs_wake && ts) {
2157 waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
2158 kn->kn_thread, THREAD_AWAKENED);
2159 if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
2160 disable_preemption();
2161 error = EPREEMPTDISABLED;
2162 }
2163 }
2164
2165 kqunlock(kqwl);
2166
2167 out:
2168 /*
2169 * Phase 3:
2170 *
2171 * Unlock and cleanup various lingering references and things.
2172 */
2173 filt_wlunlock(kqwl);
2174
2175 #if CONFIG_WORKLOOP_DEBUG
2176 KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2177 .updater = current_thread(),
2178 .servicer = kqr_thread(kqr), /* Note: racy */
2179 .old_owner = cur_owner,
2180 .new_owner = new_owner,
2181
2182 .kev_ident = kev->ident,
2183 .error = (int16_t)error,
2184 .kev_flags = kev->flags,
2185 .kev_fflags = kev->fflags,
2186
2187 .kev_mask = mask,
2188 .kev_value = kdata,
2189 .in_value = udata,
2190 });
2191 #endif // CONFIG_WORKLOOP_DEBUG
2192
2193 if (wl_inheritor_updated) {
2194 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2195 turnstile_deallocate_safe(ts);
2196 }
2197
2198 if (cur_owner && new_owner != cur_owner) {
2199 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2200 thread_drop_kevent_override(cur_owner);
2201 }
2202 thread_deallocate_safe(cur_owner);
2203 }
2204 if (extra_thread_ref) {
2205 thread_deallocate_safe(extra_thread_ref);
2206 }
2207 return error;
2208 }
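/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the copyin-under-spinlock pattern used by filt_wlupdate() above and
 * filt_wlupdate_sync_ipc() below.  copyin may not fault while the spinlock
 * is held, so on EFAULT the lock is dropped, the copy is retried to fault
 * the page in, and the whole sequence restarts.  lock_t, lock(), unlock()
 * and copy_from_user() are hypothetical stand-ins.
 */
#if 0
#include <errno.h>
#include <stdint.h>

typedef struct lock lock_t;
extern void lock(lock_t *);
extern void unlock(lock_t *);
extern int copy_from_user(const uint64_t *uaddr, uint64_t *out);

static int
copy_with_retry(lock_t *lck, const uint64_t *uaddr, uint64_t *out, int retries)
{
	int error;

	lock(lck);
again:
	error = copy_from_user(uaddr, out);	/* may fail: cannot fault under the lock */
	if (error == EFAULT && retries-- > 0) {
		unlock(lck);			/* let the fault be serviced */
		error = copy_from_user(uaddr, out);
		lock(lck);
		if (error == 0) {
			goto again;		/* state may have changed: re-validate */
		}
	}
	unlock(lck);
	return error;
}
#endif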
2209
2210 /*
2211 * Remembers the last update that came in from userspace for debugging reasons.
2212 * - fflags is mirrored from the userspace kevent
2213 * - ext[i, i != VALUE] is mirrored from the userspace kevent
2214 * - ext[VALUE] is set to what the kernel loaded atomically
2215 * - data is set to the error if any
2216 */
2217 static inline void
2218 filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
2219 int error)
2220 {
2221 kn->kn_fflags = kev->fflags;
2222 kn->kn_sdata = error;
2223 memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2224 }
2225
2226 static int
2227 filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
2228 struct kevent_qos_s *kev, int op)
2229 {
2230 user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2231 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2232 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2233 uint64_t udata = 0;
2234 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2235 int error = 0;
2236
2237 if (op == FILT_WLATTACH) {
2238 (void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2239 } else if (uaddr == 0) {
2240 return 0;
2241 }
2242
2243 filt_wllock(kqwl);
2244
2245 again:
2246
2247 /*
2248 * Do the debounce thing, the lock serializing the state is the knote lock.
2249 */
2250 if (uaddr) {
2251 /*
2252 * Until <rdar://problem/24999882> exists,
2253 * copyin with preemption disabled forces any
2254 * vm_fault we encounter to fail.
2255 */
2256 error = copyin_atomic64(uaddr, &udata);
2257
2258 /*
2259 * If we get EFAULT, drop locks, and retry.
2260 * If we still get an error report it,
2261 * else assume the memory has been faulted
2262 * and attempt to copyin under lock again.
2263 */
2264 switch (error) {
2265 case 0:
2266 break;
2267 case EFAULT:
2268 if (efault_retry-- > 0) {
2269 filt_wlunlock(kqwl);
2270 error = copyin_atomic64(uaddr, &udata);
2271 filt_wllock(kqwl);
2272 if (error == 0) {
2273 goto again;
2274 }
2275 }
2276 OS_FALLTHROUGH;
2277 default:
2278 goto out;
2279 }
2280
2281 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2282 kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2283
2284 if ((udata & mask) != (kdata & mask)) {
2285 error = ESTALE;
2286 goto out;
2287 }
2288 }
2289
2290 if (op == FILT_WLATTACH) {
2291 error = filt_wlattach_sync_ipc(kn);
2292 if (error == 0) {
2293 disable_preemption();
2294 error = EPREEMPTDISABLED;
2295 }
2296 }
2297
2298 out:
2299 filt_wlunlock(kqwl);
2300 return error;
2301 }
2302
2303 static int
2304 filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
2305 {
2306 struct kqueue *kq = knote_get_kq(kn);
2307 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2308 int error = 0, result = 0;
2309 kq_index_t qos_index = 0;
2310
2311 if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
2312 error = ENOTSUP;
2313 goto out;
2314 }
2315
2316 uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2317 switch (command) {
2318 case NOTE_WL_THREAD_REQUEST:
2319 if (kn->kn_id != kqwl->kqwl_dynamicid) {
2320 error = EINVAL;
2321 goto out;
2322 }
2323 qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2324 if (qos_index == THREAD_QOS_UNSPECIFIED) {
2325 error = ERANGE;
2326 goto out;
2327 }
2328 if (kqwl->kqwl_request.tr_kq_qos_index) {
2329 /*
2330 * There already is a thread request, and well, you're only allowed
2331 * one per workloop, so fail the attach.
2332 */
2333 error = EALREADY;
2334 goto out;
2335 }
2336 break;
2337 case NOTE_WL_SYNC_WAIT:
2338 case NOTE_WL_SYNC_WAKE:
2339 if (kn->kn_id == kqwl->kqwl_dynamicid) {
2340 error = EINVAL;
2341 goto out;
2342 }
2343 if ((kn->kn_flags & EV_DISABLE) == 0) {
2344 error = EINVAL;
2345 goto out;
2346 }
2347 if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2348 error = EINVAL;
2349 goto out;
2350 }
2351 break;
2352
2353 case NOTE_WL_SYNC_IPC:
2354 if ((kn->kn_flags & EV_DISABLE) == 0) {
2355 error = EINVAL;
2356 goto out;
2357 }
2358 if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
2359 error = EINVAL;
2360 goto out;
2361 }
2362 break;
2363 default:
2364 error = EINVAL;
2365 goto out;
2366 }
2367
2368 if (command == NOTE_WL_SYNC_IPC) {
2369 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2370 } else {
2371 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2372 }
2373
2374 if (error == EPREEMPTDISABLED) {
2375 error = 0;
2376 result = FILTER_THREADREQ_NODEFEER;
2377 }
2378 out:
2379 if (error) {
2380 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2381 if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2382 error = 0;
2383 }
2384 knote_set_error(kn, error);
2385 return result;
2386 }
2387 if (command == NOTE_WL_SYNC_WAIT) {
2388 return kevent_register_wait_prepare(kn, kev, result);
2389 }
2390 /* Just attaching the thread request successfully will fire it */
2391 if (command == NOTE_WL_THREAD_REQUEST) {
2392 /*
2393 * Thread Request knotes need an explicit touch to be active again,
2394 * so delivering an event needs to also consume it.
2395 */
2396 kn->kn_flags |= EV_CLEAR;
2397 return result | FILTER_ACTIVE;
2398 }
2399 return result;
2400 }
2401
2402 static void __dead2
2403 filt_wlwait_continue(void *parameter, wait_result_t wr)
2404 {
2405 struct _kevent_register *cont_args = parameter;
2406 struct kqworkloop *kqwl = cont_args->kqwl;
2407
2408 kqlock(kqwl);
2409 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2410 workq_kern_threadreq_lock(kqwl->kqwl_p);
2411 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2412 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2413 } else {
2414 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2415 }
2416 kqunlock(kqwl);
2417
2418 turnstile_cleanup();
2419
2420 if (wr == THREAD_INTERRUPTED) {
2421 cont_args->kev.flags |= EV_ERROR;
2422 cont_args->kev.data = EINTR;
2423 } else if (wr != THREAD_AWAKENED) {
2424 panic("Unexpected wait result: %d", wr);
2425 }
2426
2427 kevent_register_wait_return(cont_args);
2428 }
2429
2430 /*
2431 * Called with the workloop mutex held. Most of the time this never returns,
2432 * as it calls filt_wlwait_continue through a continuation.
2433 */
2434 static void __dead2
2435 filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
2436 struct _kevent_register *cont_args)
2437 {
2438 struct kqworkloop *kqwl = cont_args->kqwl;
2439 workq_threadreq_t kqr = &kqwl->kqwl_request;
2440 struct turnstile *ts;
2441 bool workq_locked = false;
2442
2443 kqlock_held(kqwl);
2444
2445 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2446 workq_kern_threadreq_lock(kqwl->kqwl_p);
2447 workq_locked = true;
2448 }
2449
2450 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2451 TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2452
2453 if (workq_locked) {
2454 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2455 &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2456 TURNSTILE_DELAYED_UPDATE);
2457 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2458 /*
2459 * if the interlock is no longer the workqueue lock,
2460 * then we don't need to hold it anymore.
2461 */
2462 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2463 workq_locked = false;
2464 }
2465 }
2466 if (!workq_locked) {
2467 /*
2468 * If the interlock is the workloop's, then it's our responsibility to
2469 * call update_inheritor, so just do it.
2470 */
2471 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2472 }
2473
2474 thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
2475 waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
2476 THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2477
2478 if (workq_locked) {
2479 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2480 }
2481
2482 thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2483 if (thread) {
2484 thread_reference(thread);
2485 }
2486
2487 kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
2488 }
2489
2490 /* called in stackshot context to report the thread responsible for blocking this thread */
2491 void
2492 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2493 event64_t event, thread_waitinfo_t *waitinfo)
2494 {
2495 extern zone_t thread_zone;
2496 struct knote *kn = (struct knote *)event;
2497
2498 zone_require(knote_zone, kn);
2499
2500 assert(kn->kn_thread == thread);
2501
2502 struct kqueue *kq = knote_get_kq(kn);
2503
2504 zone_require(kqworkloop_zone, kq);
2505 assert(kq->kq_state & KQ_WORKLOOP);
2506
2507 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2508 workq_threadreq_t kqr = &kqwl->kqwl_request;
2509
2510 thread_t kqwl_owner = kqwl->kqwl_owner;
2511
2512 if (kqwl_owner != THREAD_NULL) {
2513 zone_require(thread_zone, kqwl_owner);
2514 waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2515 } else if (kqr_thread_requested_pending(kqr)) {
2516 waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2517 } else if (kqr->tr_state >= WORKQ_TR_STATE_BINDING) {
2518 zone_require(thread_zone, kqr->tr_thread);
2519 waitinfo->owner = thread_tid(kqr->tr_thread);
2520 } else {
2521 waitinfo->owner = 0;
2522 }
2523
2524 waitinfo->context = kqwl->kqwl_dynamicid;
2525 }
2526
2527 static void
2528 filt_wldetach(struct knote *kn)
2529 {
2530 if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2531 filt_wldetach_sync_ipc(kn);
2532 } else if (kn->kn_thread) {
2533 kevent_register_wait_cleanup(kn);
2534 }
2535 }
2536
2537 static int
2538 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2539 thread_qos_t *qos_index)
2540 {
2541 uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2542 uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2543
2544 if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2545 return EINVAL;
2546 }
2547 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2548 if (kev->flags & EV_DELETE) {
2549 return EINVAL;
2550 }
2551 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2552 return EINVAL;
2553 }
2554 if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2555 return ERANGE;
2556 }
2557 }
2558
2559 switch (new_commands) {
2560 case NOTE_WL_THREAD_REQUEST:
2561 /* thread requests can only update themselves */
2562 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2563 return EINVAL;
2564 }
2565 break;
2566
2567 case NOTE_WL_SYNC_WAIT:
2568 if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2569 return EINVAL;
2570 }
2571 goto sync_checks;
2572
2573 case NOTE_WL_SYNC_WAKE:
2574 sync_checks:
2575 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2576 return EINVAL;
2577 }
2578 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2579 return EINVAL;
2580 }
2581 break;
2582
2583 case NOTE_WL_SYNC_IPC:
2584 if (sav_commands != NOTE_WL_SYNC_IPC) {
2585 return EINVAL;
2586 }
2587 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2588 return EINVAL;
2589 }
2590 break;
2591
2592 default:
2593 return EINVAL;
2594 }
2595 return 0;
2596 }
2597
2598 static int
2599 filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
2600 {
2601 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2602 thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2603 int result = 0;
2604
2605 int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2606 if (error) {
2607 goto out;
2608 }
2609
2610 uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2611 if (command == NOTE_WL_SYNC_IPC) {
2612 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2613 } else {
2614 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2615 filt_wlremember_last_update(kn, kev, error);
2616 }
2617 if (error == EPREEMPTDISABLED) {
2618 error = 0;
2619 result = FILTER_THREADREQ_NODEFEER;
2620 }
2621
2622 out:
2623 if (error) {
2624 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2625 /* If userland wants ESTALE to be hidden, do not activate */
2626 return result;
2627 }
2628 kev->flags |= EV_ERROR;
2629 kev->data = error;
2630 return result;
2631 }
2632 if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2633 return kevent_register_wait_prepare(kn, kev, result);
2634 }
2635 /* Just touching the thread request successfully will fire it */
2636 if (command == NOTE_WL_THREAD_REQUEST) {
2637 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2638 result |= FILTER_UPDATE_REQ_QOS;
2639 }
2640 result |= FILTER_ACTIVE;
2641 }
2642 return result;
2643 }
2644
2645 static bool
2646 filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
2647 {
2648 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2649
2650 int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2651 if (error) {
2652 goto out;
2653 }
2654
2655 uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2656 if (command == NOTE_WL_SYNC_IPC) {
2657 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2658 } else {
2659 error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
2660 filt_wlremember_last_update(kn, kev, error);
2661 }
2662 assert(error != EPREEMPTDISABLED);
2663
2664 out:
2665 if (error) {
2666 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2667 return false;
2668 }
2669 kev->flags |= EV_ERROR;
2670 kev->data = error;
2671 return false;
2672 }
2673 return true;
2674 }
2675
2676 static int
2677 filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
2678 {
2679 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2680 int rc = 0;
2681
2682 assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2683
2684 kqlock(kqwl);
2685
2686 if (kqwl->kqwl_owner) {
2687 /*
2688 * <rdar://problem/33584321> userspace can sometimes cause the thread
2689 * request knote to be processed when events are delivered without
2690 * triggering a drain session.
2691 *
2692 * When that happens, the automatic deactivation due to process
2693 * would swallow the event, so we have to activate the knote again.
2694 */
2695 knote_activate(kqwl, kn, FILTER_ACTIVE);
2696 } else {
2697 #if DEBUG || DEVELOPMENT
2698 if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2699 /*
2700 * see src/queue_internal.h in libdispatch
2701 */
2702 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2703 user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2704 task_t t = current_task();
2705 uint64_t val;
2706 if (addr && task_is_active(t) && !task_is_halting(t) &&
2707 copyin_atomic64(addr, &val) == 0 &&
2708 val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
2709 (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
2710 panic("kevent: workloop %#016llx is not enqueued "
2711 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2712 kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2713 }
2714 }
2715 #endif
2716 knote_fill_kevent(kn, kev, 0);
2717 kev->fflags = kn->kn_sfflags;
2718 rc |= FILTER_ACTIVE;
2719 }
2720
2721 kqunlock(kqwl);
2722
2723 if (rc & FILTER_ACTIVE) {
2724 workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2725 }
2726 return rc;
2727 }
2728
2729 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2730 .f_extended_codes = true,
2731 .f_attach = filt_wlattach,
2732 .f_detach = filt_wldetach,
2733 .f_event = filt_bad_event,
2734 .f_touch = filt_wltouch,
2735 .f_process = filt_wlprocess,
2736 .f_allow_drop = filt_wlallow_drop,
2737 .f_post_register_wait = filt_wlpost_register_wait,
2738 };
2739
2740 #pragma mark - kqueues allocation and deallocation
2741
2742 /*!
2743 * @enum kqworkloop_dealloc_flags_t
2744 *
2745 * @brief
2746 * Flags that alter kqworkloop_dealloc() behavior.
2747 *
2748 * @const KQWL_DEALLOC_NONE
2749 * Convenient name for "no flags".
2750 *
2751 * @const KQWL_DEALLOC_SKIP_HASH_REMOVE
2752 * Do not remove the workloop from the hash table.
2753 * This is used for process tear-down codepaths as the workloops have been
2754 * removed by the caller already.
2755 */
2756 OS_OPTIONS(kqworkloop_dealloc_flags, unsigned,
2757 KQWL_DEALLOC_NONE = 0x0000,
2758 KQWL_DEALLOC_SKIP_HASH_REMOVE = 0x0001,
2759 );
2760
2761 static void
2762 kqworkloop_dealloc(struct kqworkloop *, kqworkloop_dealloc_flags_t, uint32_t);
2763
2764 OS_NOINLINE OS_COLD OS_NORETURN
2765 static void
2766 kqworkloop_retain_panic(struct kqworkloop *kqwl, uint32_t previous)
2767 {
2768 if (previous == 0) {
2769 panic("kq(%p) resurrection", kqwl);
2770 } else {
2771 panic("kq(%p) retain overflow", kqwl);
2772 }
2773 }
2774
2775 OS_NOINLINE OS_COLD OS_NORETURN
2776 static void
2777 kqworkloop_release_panic(struct kqworkloop *kqwl)
2778 {
2779 panic("kq(%p) over-release", kqwl);
2780 }
2781
2782 OS_ALWAYS_INLINE
2783 static inline bool
2784 kqworkloop_try_retain(struct kqworkloop *kqwl)
2785 {
2786 uint32_t old_ref, new_ref;
2787 os_atomic_rmw_loop(&kqwl->kqwl_retains, old_ref, new_ref, relaxed, {
2788 if (__improbable(old_ref == 0)) {
2789 os_atomic_rmw_loop_give_up(return false);
2790 }
2791 if (__improbable(old_ref >= KQ_WORKLOOP_RETAINS_MAX)) {
2792 kqworkloop_retain_panic(kqwl, old_ref);
2793 }
2794 new_ref = old_ref + 1;
2795 });
2796 return true;
2797 }
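/*
 * Illustrative sketch (editorial addition, not part of this file): a plain
 * C11 rendering of the try-retain loop above.  The compare-exchange refuses
 * to resurrect an object whose refcount already reached zero, which is what
 * lets hash lookups race safely with the final release.  REF_MAX is a
 * hypothetical saturation bound standing in for KQ_WORKLOOP_RETAINS_MAX.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define REF_MAX 0x00ffffffu

static bool
try_retain(_Atomic uint32_t *refs)
{
	uint32_t old = atomic_load_explicit(refs, memory_order_relaxed);

	do {
		if (old == 0) {
			return false;		/* object is already being destroyed */
		}
		if (old >= REF_MAX) {
			abort();		/* refcount overflow / corruption */
		}
	} while (!atomic_compare_exchange_weak_explicit(refs, &old, old + 1,
	    memory_order_relaxed, memory_order_relaxed));
	return true;
}
#endif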
2798
2799 OS_ALWAYS_INLINE
2800 static inline void
2801 kqworkloop_retain(struct kqworkloop *kqwl)
2802 {
2803 uint32_t previous = os_atomic_inc_orig(&kqwl->kqwl_retains, relaxed);
2804 if (__improbable(previous == 0 || previous >= KQ_WORKLOOP_RETAINS_MAX)) {
2805 kqworkloop_retain_panic(kqwl, previous);
2806 }
2807 }
2808
2809 OS_ALWAYS_INLINE
2810 static inline void
2811 kqueue_retain(kqueue_t kqu)
2812 {
2813 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2814 kqworkloop_retain(kqu.kqwl);
2815 }
2816 }
2817
2818 OS_ALWAYS_INLINE
2819 static inline void
2820 kqworkloop_release_live(struct kqworkloop *kqwl)
2821 {
2822 uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
2823 if (__improbable(refs <= 1)) {
2824 kqworkloop_release_panic(kqwl);
2825 }
2826 }
2827
2828 OS_ALWAYS_INLINE
2829 static inline void
2830 kqueue_release_live(kqueue_t kqu)
2831 {
2832 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2833 kqworkloop_release_live(kqu.kqwl);
2834 }
2835 }
2836
2837 OS_ALWAYS_INLINE
2838 static inline void
2839 kqworkloop_release(struct kqworkloop *kqwl)
2840 {
2841 uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
2842
2843 if (__improbable(refs <= 1)) {
2844 kqworkloop_dealloc(kqwl, KQWL_DEALLOC_NONE, refs - 1);
2845 }
2846 }
2847
2848 OS_ALWAYS_INLINE
2849 static inline void
2850 kqueue_release(kqueue_t kqu)
2851 {
2852 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2853 kqworkloop_release(kqu.kqwl);
2854 }
2855 }
2856
2857 /*!
2858 * @function kqueue_destroy
2859 *
2860 * @brief
2861 * Common part to all kqueue dealloc functions.
2862 */
2863 OS_NOINLINE
2864 static void
2865 kqueue_destroy(kqueue_t kqu, zone_t zone)
2866 {
2867 /*
2868 * waitq_set_deinit() removes the KQ's waitq set from
2869 * any select sets to which it may belong.
2870 *
2871 * The order of these deinits matters: before waitq_set_deinit() returns,
2872 * waitq_set__CALLING_PREPOST_HOOK__ may be called and it will take the
2873 * kq_lock.
2874 */
2875 waitq_set_deinit(&kqu.kq->kq_wqs);
2876 lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2877
2878 zfree(zone, kqu.kq);
2879 }
2880
2881 /*!
2882 * @function kqueue_init
2883 *
2884 * @brief
2885 * Common part to all kqueue alloc functions.
2886 */
2887 static kqueue_t
2888 kqueue_init(kqueue_t kqu, waitq_set_prepost_hook_t *hook, int policy)
2889 {
2890 waitq_set_init(&kqu.kq->kq_wqs, policy, NULL, hook);
2891 lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2892 return kqu;
2893 }
2894
2895 #pragma mark kqfile allocation and deallocation
2896
2897 /*!
2898 * @function kqueue_dealloc
2899 *
2900 * @brief
2901 * Detach all knotes from a kqfile and free it.
2902 *
2903 * @discussion
2904 * We walk each list looking for knotes referencing
2905 * this kqueue. If we find one, we try to drop it. But
2906 * if we fail to get a drop reference, that will wait
2907 * until it is dropped. So, we can just restart,
2908 * safe in the assumption that the list will eventually
2909 * not contain any more references to this kqueue (either
2910 * we dropped them all, or someone else did).
2911 *
2912 * Assumes no new events are being added to the kqueue.
2913 * Nothing locked on entry or exit.
2914 */
2915 void
2916 kqueue_dealloc(struct kqueue *kq)
2917 {
2918 KNOTE_LOCK_CTX(knlc);
2919 struct proc *p = kq->kq_p;
2920 struct filedesc *fdp = p->p_fd;
2921 struct knote *kn;
2922
2923 assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2924
2925 proc_fdlock(p);
2926 for (int i = 0; i < fdp->fd_knlistsize; i++) {
2927 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2928 while (kn != NULL) {
2929 if (kq == knote_get_kq(kn)) {
2930 kqlock(kq);
2931 proc_fdunlock(p);
2932 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2933 knote_drop(kq, kn, &knlc);
2934 }
2935 proc_fdlock(p);
2936 /* start over at beginning of list */
2937 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2938 continue;
2939 }
2940 kn = SLIST_NEXT(kn, kn_link);
2941 }
2942 }
2943
2944 knhash_lock(fdp);
2945 proc_fdunlock(p);
2946
2947 if (fdp->fd_knhashmask != 0) {
2948 for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2949 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2950 while (kn != NULL) {
2951 if (kq == knote_get_kq(kn)) {
2952 kqlock(kq);
2953 knhash_unlock(fdp);
2954 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2955 knote_drop(kq, kn, &knlc);
2956 }
2957 knhash_lock(fdp);
2958 /* start over at beginning of list */
2959 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2960 continue;
2961 }
2962 kn = SLIST_NEXT(kn, kn_link);
2963 }
2964 }
2965 }
2966 knhash_unlock(fdp);
2967
2968 kqueue_destroy(kq, kqfile_zone);
2969 }
2970
2971 /*!
2972 * @function kqueue_alloc
2973 *
2974 * @brief
2975 * Allocate a kqfile.
2976 */
2977 struct kqueue *
2978 kqueue_alloc(struct proc *p)
2979 {
2980 struct kqfile *kqf;
2981
2982 /*
2983 * kqfiles are created with kqueue() so we need to wait for
2984 * the first kevent syscall to know which bit among
2985 * KQ_KEV_{32,64,QOS} will be set in kqf_state
2986 */
2987 kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
2988 kqf->kqf_p = p;
2989 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
2990 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
2991
2992 return kqueue_init(kqf, NULL, SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST).kq;
2993 }
2994
2995 /*!
2996 * @function kqueue_internal
2997 *
2998 * @brief
2999 * Core implementation for kqueue() and guarded_kqueue_np()
3000 */
3001 int
3002 kqueue_internal(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
3003 {
3004 struct kqueue *kq;
3005 struct fileproc *fp;
3006 int fd, error;
3007
3008 error = falloc_withalloc(p, &fp, &fd, vfs_context_current(), fp_zalloc, cra);
3009 if (error) {
3010 return error;
3011 }
3012
3013 kq = kqueue_alloc(p);
3014 if (kq == NULL) {
3015 fp_free(p, fd, fp);
3016 return ENOMEM;
3017 }
3018
3019 fp->f_flag = FREAD | FWRITE;
3020 fp->f_ops = &kqueueops;
3021 fp->f_data = kq;
3022 fp->f_lflags |= FG_CONFINED;
3023
3024 proc_fdlock(p);
3025 *fdflags(p, fd) |= UF_EXCLOSE | UF_FORKCLOSE;
3026 procfdtbl_releasefd(p, fd, NULL);
3027 fp_drop(p, fd, fp, 1);
3028 proc_fdunlock(p);
3029
3030 *retval = fd;
3031 return error;
3032 }
3033
3034 /*!
3035 * @function kqueue
3036 *
3037 * @brief
3038 * The kqueue syscall.
3039 */
3040 int
3041 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3042 {
3043 return kqueue_internal(p, fileproc_alloc_init, NULL, retval);
3044 }
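/*
 * Illustrative userspace sketch (editorial addition, not part of this file):
 * minimal use of the kqueue() syscall implemented above.  The returned
 * descriptor is close-on-exec and not inherited across fork, reflecting the
 * UF_EXCLOSE | UF_FORKCLOSE flags set in kqueue_internal(), and is then used
 * to wait for readability on a pipe.
 */
#if 0
#include <sys/event.h>
#include <assert.h>
#include <unistd.h>

static void
kqueue_example(void)
{
	int fds[2], kq = kqueue();
	struct kevent kev;

	assert(kq >= 0 && pipe(fds) == 0);
	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* register interest */

	(void)write(fds[1], "x", 1);
	assert(kevent(kq, NULL, 0, &kev, 1, NULL) == 1);
	assert((int)kev.ident == fds[0] && kev.data == 1);	/* 1 byte readable */

	close(fds[0]);
	close(fds[1]);
	close(kq);
}
#endif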
3045
3046 #pragma mark kqworkq allocation and deallocation
3047
3048 /*!
3049 * @function kqworkq_dealloc
3050 *
3051 * @brief
3052 * Deallocates a workqueue kqueue.
3053 *
3054 * @discussion
3055 * This only happens at process death, or for races with concurrent
3056 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3057 * this kqueue: either there are none, or someone else took care of them.
3058 */
3059 void
3060 kqworkq_dealloc(struct kqworkq *kqwq)
3061 {
3062 kqueue_destroy(kqwq, kqworkq_zone);
3063 }
3064
3065 /*!
3066 * @function kqworkq_alloc
3067 *
3068 * @brief
3069 * Allocates a workqueue kqueue.
3070 *
3071 * @discussion
3072 * This is the slow path of kevent_get_kqwq.
3073 * This takes care of making sure procs have a single workq kqueue.
3074 */
3075 OS_NOINLINE
3076 static struct kqworkq *
3077 kqworkq_alloc(struct proc *p, unsigned int flags)
3078 {
3079 struct kqworkq *kqwq, *tmp;
3080
3081 kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3082
3083 assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3084 if (flags & KEVENT_FLAG_LEGACY64) {
3085 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3086 } else {
3087 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3088 }
3089 kqwq->kqwq_p = p;
3090
3091 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3092 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3093 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3094 }
3095 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3096 /*
3097 * Because of how the bucketized system works, we mix overcommit
3098 * sources with non-overcommit ones: each time we move a knote from
3099 * one bucket to the next due to overrides, we would have to track
3100 * overcommitness, and that's really not worth it in a world where
3101 * workloops track this faithfully.
3102 *
3103 * Incidentally, this behaves like the original manager-based
3104 * kqwq where event delivery always happened (hence is
3105 * "overcommit").
3106 */
3107 kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3108 kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3109 if (i != KQWQ_QOS_MANAGER) {
3110 kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3111 }
3112 kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i;
3113 }
3114
3115 kqueue_init(kqwq, &kqwq->kqwq_waitq_hook, SYNC_POLICY_FIFO);
3116
3117 if (!os_atomic_cmpxchgv(&p->p_fd->fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3118 kqworkq_dealloc(kqwq);
3119 return tmp;
3120 }
3121
3122 return kqwq;
3123 }
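/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the single-winner publication at the end of kqworkq_alloc().  Racing
 * threads each allocate, one publishes its pointer with a compare-exchange
 * against NULL, and the loser frees its copy and adopts the winner's.
 * make_obj()/free_obj() are hypothetical stand-ins for zalloc/zfree.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct obj;
extern struct obj *make_obj(void);
extern void free_obj(struct obj *);

static struct obj *
get_or_publish(_Atomic(struct obj *) *slot)
{
	struct obj *cur = atomic_load_explicit(slot, memory_order_acquire);
	if (cur != NULL) {
		return cur;			/* fast path: already published */
	}

	struct obj *mine = make_obj();
	struct obj *expected = NULL;
	if (atomic_compare_exchange_strong_explicit(slot, &expected, mine,
	    memory_order_acq_rel, memory_order_acquire)) {
		return mine;			/* we won the race */
	}
	free_obj(mine);				/* lost: adopt the winner's object */
	return expected;
}
#endif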
3124
3125 #pragma mark kqworkloop allocation and deallocation
3126
3127 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3128 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
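/*
 * Worked example (editorial addition, not part of this file): with a
 * 256-bucket hash, hashinit() hands back mask = 255, and KQ_HASH() folds the
 * second byte of the workloop id into the low byte before masking, so a
 * dynamic id of 0x12345 lands in bucket 0x66.
 */
#if 0
#include <assert.h>

static void
kq_hash_example(void)
{
	unsigned long id = 0x12345, mask = 255;

	/* (0x12345 ^ 0x123) & 0xff == 0x66 */
	assert(KQ_HASH(id, mask) == 0x66);
}
#endif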
3129
3130 OS_ALWAYS_INLINE
3131 static inline void
3132 kqhash_lock(struct filedesc *fdp)
3133 {
3134 lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
3135 }
3136
3137 OS_ALWAYS_INLINE
3138 static inline void
3139 kqhash_unlock(struct filedesc *fdp)
3140 {
3141 lck_mtx_unlock(&fdp->fd_kqhashlock);
3142 }
3143
3144 OS_ALWAYS_INLINE
3145 static inline void
3146 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3147 struct kqworkloop *kqwl)
3148 {
3149 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3150 LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3151 }
3152
3153 OS_ALWAYS_INLINE
3154 static inline struct kqworkloop *
3155 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3156 {
3157 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3158 struct kqworkloop *kqwl;
3159
3160 LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3161 if (kqwl->kqwl_dynamicid == id) {
3162 return kqwl;
3163 }
3164 }
3165 return NULL;
3166 }
3167
3168 static struct kqworkloop *
3169 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3170 {
3171 struct kqworkloop *kqwl = NULL;
3172
3173 kqhash_lock(fdp);
3174 if (__probable(fdp->fd_kqhash)) {
3175 kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3176 if (kqwl && !kqworkloop_try_retain(kqwl)) {
3177 kqwl = NULL;
3178 }
3179 }
3180 kqhash_unlock(fdp);
3181 return kqwl;
3182 }
3183
3184 OS_NOINLINE
3185 static void
3186 kqworkloop_hash_init(struct filedesc *fdp)
3187 {
3188 struct kqwllist *alloc_hash;
3189 u_long alloc_mask;
3190
3191 kqhash_unlock(fdp);
3192 alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3193 kqhash_lock(fdp);
3194
3195 /* See if we won the race */
3196 if (__probable(fdp->fd_kqhashmask == 0)) {
3197 fdp->fd_kqhash = alloc_hash;
3198 fdp->fd_kqhashmask = alloc_mask;
3199 } else {
3200 kqhash_unlock(fdp);
3201 hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
3202 kqhash_lock(fdp);
3203 }
3204 }
3205
3206 /*!
3207 * @function kqworkloop_dealloc
3208 *
3209 * @brief
3210 * Deallocates a workloop kqueue.
3211 *
3212 * @discussion
3213 * Knotes hold references on the workloop, so we can't really reach this
3214 * function unless all of these are already gone.
3215 *
3216 * Nothing locked on entry or exit.
3217 *
3218 * @param flags
3219 * Unless KQWL_DEALLOC_SKIP_HASH_REMOVE is set, the workloop is removed
3220 * from its hash table.
3221 *
3222 * @param current_ref
3223 * This function is also called to undo a kqworkloop_alloc in case of
3224 * allocation races; current_ref is the refcount that is expected
3225 * on the workloop object, usually 0, and 1 when a dealloc race is resolved.
3226 */
3227 static void
3228 kqworkloop_dealloc(struct kqworkloop *kqwl, kqworkloop_dealloc_flags_t flags,
3229 uint32_t current_ref)
3230 {
3231 thread_t cur_owner;
3232
3233 if (__improbable(current_ref > 1)) {
3234 kqworkloop_release_panic(kqwl);
3235 }
3236 assert(kqwl->kqwl_retains == current_ref);
3237
3238 /* pair with kqunlock() and other kq locks */
3239 os_atomic_thread_fence(acquire);
3240
3241 cur_owner = kqwl->kqwl_owner;
3242 if (cur_owner) {
3243 if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3244 thread_drop_kevent_override(cur_owner);
3245 }
3246 thread_deallocate(cur_owner);
3247 kqwl->kqwl_owner = THREAD_NULL;
3248 }
3249
3250 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3251 struct turnstile *ts;
3252 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3253 &ts, TURNSTILE_WORKLOOPS);
3254 turnstile_cleanup();
3255 turnstile_deallocate(ts);
3256 }
3257
3258 if ((flags & KQWL_DEALLOC_SKIP_HASH_REMOVE) == 0) {
3259 struct filedesc *fdp = kqwl->kqwl_p->p_fd;
3260
3261 kqhash_lock(fdp);
3262 LIST_REMOVE(kqwl, kqwl_hashlink);
3263 kqhash_unlock(fdp);
3264 }
3265
3266 assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3267 assert(kqwl->kqwl_owner == THREAD_NULL);
3268 assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3269
3270 lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3271 kqueue_destroy(kqwl, kqworkloop_zone);
3272 }
3273
3274 /*!
3275 * @function kqworkloop_init
3276 *
3277 * @brief
3278 * Initializes a freshly allocated workloop kqueue.
3279 */
3280 static void
3281 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3282 kqueue_id_t id, workq_threadreq_param_t *trp)
3283 {
3284 kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3285 kqwl->kqwl_retains = 1; /* donate a retain to creator */
3286 kqwl->kqwl_dynamicid = id;
3287 kqwl->kqwl_p = p;
3288 if (trp) {
3289 kqwl->kqwl_params = trp->trp_value;
3290 }
3291
3292 workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3293 if (trp) {
3294 if (trp->trp_flags & TRP_PRIORITY) {
3295 tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3296 }
3297 if (trp->trp_flags) {
3298 tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3299 }
3300 }
3301 kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3302 kqwl->kqwl_request.tr_flags = tr_flags;
3303
3304 for (int i = 0; i < KQWL_NBUCKETS; i++) {
3305 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3306 }
3307 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3308
3309 lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3310
3311 kqueue_init(kqwl, &kqwl->kqwl_waitq_hook, SYNC_POLICY_FIFO);
3312 }
3313
3314 /*!
3315 * @function kqworkloop_get_or_create
3316 *
3317 * @brief
3318 * Wrapper around kqworkloop_alloc that handles the uniquing of workloops.
3319 *
3320 * @returns
3321 * 0: success
3322 * EINVAL: invalid parameters
3323 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3324 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3325 * ENOMEM: allocation failed
3326 */
3327 static int
3328 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3329 workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
3330 {
3331 struct filedesc *fdp = p->p_fd;
3332 struct kqworkloop *alloc_kqwl = NULL;
3333 struct kqworkloop *kqwl = NULL;
3334 int error = 0;
3335
3336 assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3337
3338 if (id == 0 || id == (kqueue_id_t)-1) {
3339 return EINVAL;
3340 }
3341
3342 for (;;) {
3343 kqhash_lock(fdp);
3344 if (__improbable(fdp->fd_kqhash == NULL)) {
3345 kqworkloop_hash_init(fdp);
3346 }
3347
3348 kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3349 if (kqwl) {
3350 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3351 /*
3352 * If MUST_NOT_EXIST was passed, even if we would have failed
3353 * the try_retain, it could have gone the other way, and
3354 * userspace can't tell. Let'em fix their race.
3355 */
3356 error = EEXIST;
3357 break;
3358 }
3359
3360 if (__probable(kqworkloop_try_retain(kqwl))) {
3361 /*
3362 * This is a valid live workloop !
3363 */
3364 *kqwlp = kqwl;
3365 error = 0;
3366 break;
3367 }
3368 }
3369
3370 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3371 error = ENOENT;
3372 break;
3373 }
3374
3375 /*
3376 * We didn't find what we were looking for.
3377 *
3378 * If this is the second time we reach this point (alloc_kqwl != NULL),
3379 * then we're done.
3380 *
3381 * If this is the first time we reach this point (alloc_kqwl == NULL),
3382 * then try to allocate one without blocking.
3383 */
3384 if (__probable(alloc_kqwl == NULL)) {
3385 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3386 }
3387 if (__probable(alloc_kqwl)) {
3388 kqworkloop_init(alloc_kqwl, p, id, trp);
3389 kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3390 kqhash_unlock(fdp);
3391 *kqwlp = alloc_kqwl;
3392 return 0;
3393 }
3394
3395 /*
3396 * We have to block to allocate a workloop: drop the lock,
3397 * allocate one, but then retry the lookup as someone
3398 * else could race with us.
3399 */
3400 kqhash_unlock(fdp);
3401
3402 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3403 }
3404
3405 kqhash_unlock(fdp);
3406
3407 if (__improbable(alloc_kqwl)) {
3408 zfree(kqworkloop_zone, alloc_kqwl);
3409 }
3410
3411 return error;
3412 }
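/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the lookup-or-create loop in kqworkloop_get_or_create() above.  A
 * non-blocking allocation is attempted while the hash lock is held; if that
 * fails, the lock is dropped, a blocking allocation is performed, and the
 * lookup is retried because another thread may have inserted the entry in
 * the meantime.  The table/entry types and helpers are hypothetical.
 */
#if 0
#include <stdint.h>
#include <stddef.h>

struct table;
struct entry;
extern void table_lock(struct table *);
extern void table_unlock(struct table *);
extern struct entry *table_lookup(struct table *, uint64_t id);
extern void table_insert(struct table *, uint64_t id, struct entry *);
extern struct entry *entry_alloc_nowait(void);	/* may return NULL */
extern struct entry *entry_alloc_blocking(void);
extern void entry_free(struct entry *);

static struct entry *
get_or_create(struct table *t, uint64_t id)
{
	struct entry *e, *fresh = NULL;

	for (;;) {
		table_lock(t);
		e = table_lookup(t, id);
		if (e != NULL) {
			table_unlock(t);
			if (fresh != NULL) {
				entry_free(fresh);	/* someone else won the race */
			}
			return e;
		}
		if (fresh == NULL) {
			fresh = entry_alloc_nowait();	/* opportunistic, may fail */
		}
		if (fresh != NULL) {
			table_insert(t, id, fresh);
			table_unlock(t);
			return fresh;
		}
		table_unlock(t);
		fresh = entry_alloc_blocking();		/* then retry the lookup */
	}
}
#endif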
3413
3414 #pragma mark - knotes
3415
3416 static int
3417 filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
3418 {
3419 knote_set_error(kn, ENOTSUP);
3420 return 0;
3421 }
3422
3423 static void
3424 filt_no_detach(__unused struct knote *kn)
3425 {
3426 }
3427
3428 static int __dead2
3429 filt_bad_event(struct knote *kn, long hint)
3430 {
3431 panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3432 }
3433
3434 static int __dead2
3435 filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
3436 {
3437 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3438 }
3439
3440 static int __dead2
3441 filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
3442 {
3443 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3444 }
3445
3446 /*
3447 * knotes_dealloc - detach all knotes for the process and drop them
3448 *
3449 * Called with proc_fdlock held.
3450 * Returns with it locked.
3451 * May drop it temporarily.
3452 * Process is in such a state that it will not try to allocate
3453 * any more knotes during this process (stopped for exit or exec).
3454 */
3455 void
3456 knotes_dealloc(proc_t p)
3457 {
3458 struct filedesc *fdp = p->p_fd;
3459 struct kqueue *kq;
3460 struct knote *kn;
3461 struct klist *kn_hash = NULL;
3462 u_long kn_hashmask;
3463 int i;
3464
3465 /* Close all the fd-indexed knotes up front */
3466 if (fdp->fd_knlistsize > 0) {
3467 for (i = 0; i < fdp->fd_knlistsize; i++) {
3468 while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3469 kq = knote_get_kq(kn);
3470 kqlock(kq);
3471 proc_fdunlock(p);
3472 knote_drop(kq, kn, NULL);
3473 proc_fdlock(p);
3474 }
3475 }
3476 /* free the table */
3477 FREE(fdp->fd_knlist, M_KQUEUE);
3478 fdp->fd_knlist = NULL;
3479 }
3480 fdp->fd_knlistsize = 0;
3481
3482 knhash_lock(fdp);
3483 proc_fdunlock(p);
3484
3485 /* Clean out all the hashed knotes as well */
3486 if (fdp->fd_knhashmask != 0) {
3487 for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3488 while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3489 kq = knote_get_kq(kn);
3490 kqlock(kq);
3491 knhash_unlock(fdp);
3492 knote_drop(kq, kn, NULL);
3493 knhash_lock(fdp);
3494 }
3495 }
3496 kn_hash = fdp->fd_knhash;
3497 kn_hashmask = fdp->fd_knhashmask;
3498 fdp->fd_knhashmask = 0;
3499 fdp->fd_knhash = NULL;
3500 }
3501
3502 knhash_unlock(fdp);
3503
3504 if (kn_hash) {
3505 hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3506 }
3507
3508 proc_fdlock(p);
3509 }
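/*
 * Illustrative sketch (editorial addition, not part of this file): the
 * restart-from-head shape of the teardown loops in kqueue_dealloc() and
 * knotes_dealloc() above.  Because the list lock must be dropped to drop
 * each element, the scan starts over at the head after every drop rather
 * than trusting a saved iterator.  (The real code additionally takes the
 * kqueue lock before dropping the fd lock so the knote stays pinned; that
 * detail is omitted here.)  The list/node types and helpers are hypothetical.
 */
#if 0
#include <stddef.h>

struct list;
struct node;
extern void list_lock(struct list *);
extern void list_unlock(struct list *);
extern struct node *list_first(struct list *);
extern void node_drop(struct node *);	/* unlinks and frees the node */

static void
drain_list(struct list *l)
{
	struct node *n;

	list_lock(l);
	while ((n = list_first(l)) != NULL) {
		list_unlock(l);		/* dropping may block or take other locks */
		node_drop(n);
		list_lock(l);		/* any saved iterator is stale: restart at the head */
	}
	list_unlock(l);
}
#endif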
3510
3511 /*
3512 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3513 * scheduling parameters
3514 *
3515 * Called with proc_fdlock held.
3516 * Returns with it locked.
3517 * Process is in such a state that it will not try to allocate
3518 * any more knotes during this process (stopped for exit or exec).
3519 */
3520 void
3521 kqworkloops_dealloc(proc_t p)
3522 {
3523 struct filedesc *fdp = p->p_fd;
3524 struct kqworkloop *kqwl, *kqwln;
3525 struct kqwllist tofree;
3526
3527 if (!(fdp->fd_flags & FD_WORKLOOP)) {
3528 return;
3529 }
3530
3531 kqhash_lock(fdp);
3532
3533 if (fdp->fd_kqhashmask == 0) {
3534 kqhash_unlock(fdp);
3535 return;
3536 }
3537
3538 LIST_INIT(&tofree);
3539
3540 for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3541 LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3542 /*
3543 * kqworkloops that have scheduling parameters have an
3544 * implicit retain from kqueue_workloop_ctl that needs
3545 * to be balanced on process exit.
3546 */
3547 assert(kqwl->kqwl_params);
3548 LIST_REMOVE(kqwl, kqwl_hashlink);
3549 LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3550 }
3551 }
3552
3553 kqhash_unlock(fdp);
3554
3555 LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3556 kqworkloop_dealloc(kqwl, KQWL_DEALLOC_SKIP_HASH_REMOVE, 1);
3557 }
3558 }
3559
3560 static int
3561 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3562 struct kevent_qos_s *kev)
3563 {
3564 /* We don't care about the priority of a disabled or deleted knote */
3565 if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3566 return 0;
3567 }
3568
3569 if (kq->kq_state & KQ_WORKLOOP) {
3570 /*
3571 * Workloops need valid priorities with a QOS (excluding manager) for
3572 * any enabled knote.
3573 *
3574 * When it is pre-existing, just make sure it has a valid QoS as
3575 * kevent_register() will not use the incoming priority (filters that do
3576 * use it are responsible for validating it again, see filt_wltouch).
3577 *
3578 * If the knote is being made, validate the incoming priority.
3579 */
3580 if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3581 return ERANGE;
3582 }
3583 }
3584
3585 return 0;
3586 }
3587
3588 /*
3589 * Prepare a filter for waiting after register.
3590 *
3591 * The f_post_register_wait hook will be called later by kevent_register()
3592 * and should call kevent_register_wait_block()
3593 */
3594 static int
3595 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3596 {
3597 thread_t thread = current_thread();
3598
3599 assert(knote_fops(kn)->f_extended_codes);
3600
3601 if (kn->kn_thread == NULL) {
3602 thread_reference(thread);
3603 kn->kn_thread = thread;
3604 } else if (kn->kn_thread != thread) {
3605 /*
3606 * kn_thread may be set from a previous aborted wait.
3607 * However, it has to be from the same thread.
3608 */
3609 kev->flags |= EV_ERROR;
3610 kev->data = EXDEV;
3611 return 0;
3612 }
3613
3614 return FILTER_REGISTER_WAIT | rc;
3615 }
3616
3617 /*
3618 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3619 * aborted instead of properly woken up with thread_wakeup_thread().
3620 */
3621 static void
3622 kevent_register_wait_cleanup(struct knote *kn)
3623 {
3624 thread_t thread = kn->kn_thread;
3625 kn->kn_thread = NULL;
3626 thread_deallocate(thread);
3627 }
3628
3629 /*
3630 * Must be called at the end of a f_post_register_wait call from a filter.
3631 */
3632 static void
3633 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3634 thread_continue_t cont, struct _kevent_register *cont_args)
3635 {
3636 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3637 kqunlock(cont_args->kqwl);
3638 cont_args->handoff_thread = thread;
3639 thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3640 }
3641
3642 /*
3643 * Called by Filters using a f_post_register_wait to return from their wait.
3644 */
3645 static void
3646 kevent_register_wait_return(struct _kevent_register *cont_args)
3647 {
3648 struct kqworkloop *kqwl = cont_args->kqwl;
3649 struct kevent_qos_s *kev = &cont_args->kev;
3650 int error = 0;
3651
3652 if (cont_args->handoff_thread) {
3653 thread_deallocate(cont_args->handoff_thread);
3654 }
3655
3656 if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3657 if ((kev->flags & EV_ERROR) == 0) {
3658 kev->flags |= EV_ERROR;
3659 kev->data = 0;
3660 }
3661 error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3662 if (error == 0) {
3663 cont_args->eventout++;
3664 }
3665 }
3666
3667 kqworkloop_release(kqwl);
3668 if (error == 0) {
3669 *(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3670 }
3671 unix_syscall_return(error);
3672 }
3673
3674 /*
3675 * kevent_register - add a new event to a kqueue
3676 *
3677 * Creates a mapping between the event source and
3678 * the kqueue via a knote data structure.
3679 *
3680 * Because many/most of the event sources are file
3681 * descriptor related, the knote is linked off
3682 * the filedescriptor table for quick access.
3683 *
3684 * called with nothing locked
3685 * caller holds a reference on the kqueue
3686 */
3687
3688 int
3689 kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
3690 struct knote **kn_out)
3691 {
3692 struct proc *p = kq->kq_p;
3693 const struct filterops *fops;
3694 struct knote *kn = NULL;
3695 int result = 0, error = 0;
3696 unsigned short kev_flags = kev->flags;
3697 KNOTE_LOCK_CTX(knlc);
3698
3699 if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
3700 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
3701 } else {
3702 error = EINVAL;
3703 goto out;
3704 }
3705
3706 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
3707 if (__improbable((kev->flags & EV_VANISHED) &&
3708 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
3709 error = EINVAL;
3710 goto out;
3711 }
3712
3713 /* Simplify the flags - delete and disable overrule */
3714 if (kev->flags & EV_DELETE) {
3715 kev->flags &= ~EV_ADD;
3716 }
3717 if (kev->flags & EV_DISABLE) {
3718 kev->flags &= ~EV_ENABLE;
3719 }
3720
3721 if (kq->kq_state & KQ_WORKLOOP) {
3722 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
3723 ((struct kqworkloop *)kq)->kqwl_dynamicid,
3724 kev->udata, kev->flags, kev->filter);
3725 } else if (kq->kq_state & KQ_WORKQ) {
3726 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
3727 0, kev->udata, kev->flags, kev->filter);
3728 } else {
3729 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
3730 VM_KERNEL_UNSLIDE_OR_PERM(kq),
3731 kev->udata, kev->flags, kev->filter);
3732 }
3733
3734 restart:
3735 /* find the matching knote from the fd tables/hashes */
3736 kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
3737 error = kevent_register_validate_priority(kq, kn, kev);
3738 result = 0;
3739 if (error) {
3740 goto out;
3741 }
3742
3743 if (kn == NULL && (kev->flags & EV_ADD) == 0) {
3744 /*
3745 * No knote found, EV_ADD wasn't specified
3746 */
3747
3748 if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
3749 (kq->kq_state & KQ_WORKLOOP)) {
3750 /*
3751 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
3752 * that doesn't care about ENOENT, so just pretend the deletion
3753 * happened.
3754 */
3755 } else {
3756 error = ENOENT;
3757 }
3758 goto out;
3759 } else if (kn == NULL) {
3760 /*
3761 * No knote found, need to attach a new one (attach)
3762 */
3763
3764 struct fileproc *knote_fp = NULL;
3765
3766 /* grab a file reference for the new knote */
3767 if (fops->f_isfd) {
3768 if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
3769 goto out;
3770 }
3771 }
3772
3773 kn = knote_alloc();
3774 if (kn == NULL) {
3775 error = ENOMEM;
3776 if (knote_fp != NULL) {
3777 fp_drop(p, (int)kev->ident, knote_fp, 0);
3778 }
3779 goto out;
3780 }
3781
3782 kn->kn_fp = knote_fp;
3783 kn->kn_is_fd = fops->f_isfd;
3784 kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
3785 kn->kn_status = 0;
3786
3787 /* was vanish support requested */
3788 if (kev->flags & EV_VANISHED) {
3789 kev->flags &= ~EV_VANISHED;
3790 kn->kn_status |= KN_REQVANISH;
3791 }
3792
3793 /* snapshot matching/dispatching protocol flags into knote */
3794 if (kev->flags & EV_DISABLE) {
3795 kn->kn_status |= KN_DISABLED;
3796 }
3797
3798 /*
3799 * Copy the kevent state into the knote.
3800 * The protocol is that fflags and data
3801 * are saved off, and cleared before
3802 * calling the attach routine.
3803 *
3804 * - kn->kn_sfflags aliases with kev->xflags
3805 * - kn->kn_sdata aliases with kev->data
3806 * - kn->kn_filter is the top 8 bits of kev->filter
3807 */
3808 kn->kn_kevent = *(struct kevent_internal_s *)kev;
3809 kn->kn_sfflags = kev->fflags;
3810 kn->kn_filtid = (uint8_t)~kev->filter;
3811 kn->kn_fflags = 0;
3812 knote_reset_priority(kq, kn, kev->qos);
3813
3814 /* Add the knote for lookup thru the fd table */
3815 error = kq_add_knote(kq, kn, &knlc, p);
3816 if (error) {
3817 knote_free(kn);
3818 if (knote_fp != NULL) {
3819 fp_drop(p, (int)kev->ident, knote_fp, 0);
3820 }
3821
3822 if (error == ERESTART) {
3823 goto restart;
3824 }
3825 goto out;
3826 }
3827
3828 /* fp reference count now applies to knote */
3829
3830 /*
3831 * we can't use filter_call() because f_attach can change the filter ops
3832 * for a filter that supports f_extended_codes, so we need to reload
3833 * knote_fops() and not use `fops`.
3834 */
3835 result = fops->f_attach(kn, kev);
3836 if (result && !knote_fops(kn)->f_extended_codes) {
3837 result = FILTER_ACTIVE;
3838 }
3839
3840 kqlock(kq);
3841
3842 if (result & FILTER_THREADREQ_NODEFEER) {
3843 enable_preemption();
3844 }
3845
3846 if (kn->kn_flags & EV_ERROR) {
3847 /*
3848 * Failed to attach correctly, so drop.
3849 */
3850 kn->kn_filtid = EVFILTID_DETACHED;
3851 error = (int)kn->kn_sdata;
3852 knote_drop(kq, kn, &knlc);
3853 result = 0;
3854 goto out;
3855 }
3856
3857 /*
3858 * end "attaching" phase - now just attached
3859 *
3860 * Mark the thread request overcommit, if appropriate
3861 *
3862 * If the attach routine indicated that an
3863 * event is already fired, activate the knote.
3864 */
3865 if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
3866 (kq->kq_state & KQ_WORKLOOP)) {
3867 kqworkloop_set_overcommit((struct kqworkloop *)kq);
3868 }
3869 } else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
3870 /*
3871 * The knote was dropped while we were waiting for the lock,
3872 * we need to re-evaluate entirely
3873 */
3874
3875 goto restart;
3876 } else if (kev->flags & EV_DELETE) {
3877 /*
3878 * Deletion of a knote (drop)
3879 *
3880 * If the filter wants to filter drop events, let it do so.
3881 *
3882 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
3883 * we must wait for the knote to be re-enabled (unless it is being
3884 * re-enabled atomically here).
3885 */
3886
3887 if (knote_fops(kn)->f_allow_drop) {
3888 bool drop;
3889
3890 kqunlock(kq);
3891 drop = knote_fops(kn)->f_allow_drop(kn, kev);
3892 kqlock(kq);
3893
3894 if (!drop) {
3895 goto out_unlock;
3896 }
3897 }
3898
3899 if ((kev->flags & EV_ENABLE) == 0 &&
3900 (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
3901 (kn->kn_status & KN_DISABLED) != 0) {
3902 kn->kn_status |= KN_DEFERDELETE;
3903 error = EINPROGRESS;
3904 goto out_unlock;
3905 }
3906
3907 knote_drop(kq, kn, &knlc);
3908 goto out;
3909 } else {
3910 /*
3911 * Regular update of a knote (touch)
3912 *
3913 * Call touch routine to notify filter of changes in filter values
3914 * (and to re-determine if any events are fired).
3915 *
3916 * If the knote is in defer-delete, avoid calling the filter touch
3917 * routine (it has delivered its last event already).
3918 *
3919 * If the touch routine had no failure,
3920 * apply the requested side effects to the knote.
3921 */
3922
3923 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
3924 if (kev->flags & EV_ENABLE) {
3925 result = FILTER_ACTIVE;
3926 }
3927 } else {
3928 kqunlock(kq);
3929 result = filter_call(knote_fops(kn), f_touch(kn, kev));
3930 kqlock(kq);
3931 if (result & FILTER_THREADREQ_NODEFEER) {
3932 enable_preemption();
3933 }
3934 }
3935
3936 if (kev->flags & EV_ERROR) {
3937 result = 0;
3938 goto out_unlock;
3939 }
3940
3941 if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
3942 kn->kn_udata != kev->udata) {
3943 // this allows klist_copy_udata() not to take locks
3944 os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
3945 }
3946 if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
3947 kn->kn_status |= KN_DISABLED;
3948 knote_dequeue(kq, kn);
3949 }
3950 }
3951
3952 /* accept new kevent state */
3953 knote_apply_touch(kq, kn, kev, result);
3954
3955 out_unlock:
3956 /*
3957 * When the filter asked for a post-register wait,
3958 * we leave the kqueue locked for kevent_register()
3959 * to call the filter's f_post_register_wait hook.
3960 */
3961 if (result & FILTER_REGISTER_WAIT) {
3962 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
3963 *kn_out = kn;
3964 } else {
3965 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
3966 }
3967
3968 out:
3969 /* output local errors through the kevent */
3970 if (error) {
3971 kev->flags |= EV_ERROR;
3972 kev->data = error;
3973 }
3974 return result;
3975 }
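/*
 * Illustrative userspace sketch (not part of this file): how the
 * add/touch/delete protocol handled above is typically exercised.
 * `kq`, `sockfd` and `ctx` are hypothetical.
 *
 *	struct kevent kev;
 *
 *	// attach: EV_ADD creates the knote; EV_DISPATCH auto-disables it
 *	// after each delivery until re-enabled with EV_ENABLE.
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// touch: the same ident/filter with EV_ADD updates fflags/data/udata.
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// delete: for a disabled EV_DISPATCH2 knote this may be deferred,
 *	// surfacing as EV_ERROR with data == EINPROGRESS (see above).
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_DELETE, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */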
3976
3977 /*
3978 * knote_process - process a triggered event
3979 *
3980 * Validate that it is really still a triggered event
3981 * by calling the filter routines (if necessary). Hold
3982 * a use reference on the knote to avoid it being detached.
3983 *
3984 * If it is still considered triggered, we will have taken
3985 * a copy of the state under the filter lock. We use that
3986 * snapshot to dispatch the knote for future processing (or
3987 * not, if this was a lost event).
3988 *
3989 * Our caller assures us that nobody else can be processing
3990 * events from this knote during the whole operation. But
3991 * others can be touching or posting events to the knote
3992 * interspersed with our processing it.
3993 *
3994 * caller holds a reference on the kqueue.
3995 * kqueue locked on entry and exit - but may be dropped
3996 */
3997 static int
3998 knote_process(struct knote *kn, kevent_ctx_t kectx,
3999 kevent_callback_t callback)
4000 {
4001 struct kevent_qos_s kev;
4002 struct kqueue *kq = knote_get_kq(kn);
4003 KNOTE_LOCK_CTX(knlc);
4004 int result = FILTER_ACTIVE;
4005 int error = 0;
4006 bool drop = false;
4007
4008 /*
4009 * Must be active or stayactive
4010 * Must be queued and not disabled/suppressed or dropping
4011 */
4012 assert(kn->kn_status & KN_QUEUED);
4013 assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
4014 assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
4015
4016 if (kq->kq_state & KQ_WORKLOOP) {
4017 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4018 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4019 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4020 kn->kn_filtid);
4021 } else if (kq->kq_state & KQ_WORKQ) {
4022 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4023 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4024 kn->kn_filtid);
4025 } else {
4026 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4027 VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4028 kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4029 }
4030
4031 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4032 /*
4033 * When the knote is dropping or has dropped,
4034 * then there's nothing we want to process.
4035 */
4036 return EJUSTRETURN;
4037 }
4038
4039 /*
4040 * While waiting for the knote lock, we may have dropped the kq lock,
4041 * and a touch may have disabled and dequeued the knote.
4042 */
4043 if (!(kn->kn_status & KN_QUEUED)) {
4044 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4045 return EJUSTRETURN;
4046 }
4047
4048 /*
4049 * For deferred-drop or vanished events, we just create a fake
4050 * event to acknowledge end-of-life. Otherwise, we call the
4051 * filter's process routine to snapshot the kevent state under
4052 * the filter's locking protocol.
4053 *
4054 * suppress knotes to avoid returning the same event multiple times in
4055 * a single call.
4056 */
4057 knote_suppress(kq, kn);
4058
4059 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4060 uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
4061 if (kn->kn_status & KN_DEFERDELETE) {
4062 kev_flags |= EV_DELETE;
4063 } else {
4064 kev_flags |= EV_VANISHED;
4065 }
4066
4067 /* create fake event */
4068 kev = (struct kevent_qos_s){
4069 .filter = kn->kn_filter,
4070 .ident = kn->kn_id,
4071 .flags = kev_flags,
4072 .udata = kn->kn_udata,
4073 };
4074 } else {
4075 kqunlock(kq);
4076 kev = (struct kevent_qos_s) { };
4077 result = filter_call(knote_fops(kn), f_process(kn, &kev));
4078 kqlock(kq);
4079 }
4080
4081 /*
4082 * Determine how to dispatch the knote for future event handling.
4083 * not-fired: just return (do not callout, leave deactivated).
4084 * One-shot: If dispatch2, enter deferred-delete mode (unless this is
4085 * the deferred delete event delivery itself). Otherwise,
4086 * drop it.
4087 * Dispatch: don't clear state, just mark it disabled.
4088 * Cleared: just leave it deactivated.
4089 * Others: re-activate as there may be more events to handle.
4090 * This will not wake up more handlers right now, but
4091 * at the completion of handling events it may trigger
4092 * more handler threads (TODO: optimize based on more than
4093 * just this one event being detected by the filter).
4094 */
4095 if ((result & FILTER_ACTIVE) == 0) {
4096 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
4097 /*
4098 * Stay active knotes should not be unsuppressed or we'd create an
4099 * infinite loop.
4100 *
4101 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4102 * within f_process() but that doesn't necessarily make them
4103 * ready to process, so we should leave them be.
4104 *
4105 * For other knotes, since we will not return an event,
4106 * there's no point keeping the knote suppressed.
4107 */
4108 knote_unsuppress(kq, kn);
4109 }
4110 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4111 return EJUSTRETURN;
4112 }
4113
4114 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4115 knote_adjust_qos(kq, kn, result);
4116 }
4117 kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4118
4119 if (kev.flags & EV_ONESHOT) {
4120 if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4121 (kn->kn_status & KN_DEFERDELETE) == 0) {
4122 /* defer dropping non-delete oneshot dispatch2 events */
4123 kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
4124 } else {
4125 drop = true;
4126 }
4127 } else if (kn->kn_flags & EV_DISPATCH) {
4128 /* disable all dispatch knotes */
4129 kn->kn_status |= KN_DISABLED;
4130 } else if ((kn->kn_flags & EV_CLEAR) == 0) {
4131 /* re-activate in case there are more events */
4132 knote_activate(kq, kn, FILTER_ACTIVE);
4133 }
4134
4135 /*
4136 * callback to handle each event as we find it.
4137 * If we have to detach and drop the knote, do
4138 * it while we have the kq unlocked.
4139 */
4140 if (drop) {
4141 knote_drop(kq, kn, &knlc);
4142 } else {
4143 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4144 }
4145
4146 if (kev.flags & EV_VANISHED) {
4147 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4148 kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4149 kn->kn_filtid);
4150 }
4151
4152 error = (callback)(&kev, kectx);
4153 kqlock(kq);
4154 return error;
4155 }
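/*
 * Illustrative userspace sketch (not part of this file): the dispositions
 * chosen above as seen by a consumer. `kq`, `fd` and `ctx` are hypothetical.
 *
 *	// Without EV_CLEAR the knote is re-activated above, so a level-
 *	// triggered event keeps being returned until the condition clears.
 *	// With EV_DISPATCH the knote is disabled after delivery and must
 *	// be re-enabled explicitly:
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// With EV_ONESHOT the knote is dropped (or defer-deleted for
 *	// EV_DISPATCH2) right after the single delivery.
 */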
4156
4157 /*
4158 * Returns -1 if the kqueue was unbound and processing should not happen
4159 */
4160 #define KQWQAE_BEGIN_PROCESSING 1
4161 #define KQWQAE_END_PROCESSING 2
4162 #define KQWQAE_UNBIND 3
4163 static int
4164 kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4165 int kevent_flags, int kqwqae_op)
4166 {
4167 thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
4168 thread_t thread = kqr_thread_fast(kqr);
4169 struct knote *kn;
4170 int rc = 0;
4171 bool unbind;
4172 struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index];
4173
4174 kqlock_held(&kqwq->kqwq_kqueue);
4175
4176 if (!TAILQ_EMPTY(suppressq)) {
4177 /*
4178 * Return suppressed knotes to their original state.
4179 * For workq kqueues, suppressed ones that are still
4180 * truly active (not just forced into the queue) will
4181 * set flags we check below to see if anything got
4182 * woken up.
4183 */
4184 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4185 assert(kn->kn_status & KN_SUPPRESSED);
4186 knote_unsuppress(kqwq, kn);
4187 }
4188 }
4189
4190 #if DEBUG || DEVELOPMENT
4191 thread_t self = current_thread();
4192 struct uthread *ut = get_bsdthread_info(self);
4193
4194 assert(thread == self);
4195 assert(ut->uu_kqr_bound == kqr);
4196 #endif // DEBUG || DEVELOPMENT
4197
4198 if (kqwqae_op == KQWQAE_UNBIND) {
4199 unbind = true;
4200 } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
4201 unbind = false;
4202 } else {
4203 unbind = !kqr->tr_kq_wakeup;
4204 }
4205 if (unbind) {
4206 old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4207 rc = -1;
4208 /*
4209 * request a new thread if we didn't process the whole queue or real events
4210 * have happened (not just putting stay-active events back).
4211 */
4212 if (kqr->tr_kq_wakeup) {
4213 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4214 kqr->tr_kq_qos_index, 0);
4215 }
4216 }
4217
4218 if (rc == 0) {
4219 /*
4220 * Reset wakeup bit to notice events firing while we are processing,
4221 * as we cannot rely on the bucket queue emptiness because of stay
4222 * active knotes.
4223 */
4224 kqr->tr_kq_wakeup = false;
4225 }
4226
4227 if (old_override) {
4228 thread_drop_kevent_override(thread);
4229 }
4230
4231 return rc;
4232 }
4233
4234 /*
4235 * Return 0 to indicate that processing should proceed,
4236 * -1 if there is nothing to process.
4237 *
4238 * Called with kqueue locked and returns the same way,
4239 * but may drop lock temporarily.
4240 */
4241 static int
4242 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4243 int kevent_flags)
4244 {
4245 int rc = 0;
4246
4247 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4248 0, kqr->tr_kq_qos_index);
4249
4250 rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4251 KQWQAE_BEGIN_PROCESSING);
4252
4253 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4254 thread_tid(kqr_thread(kqr)), kqr->tr_kq_wakeup);
4255
4256 return rc;
4257 }
4258
4259 static thread_qos_t
4260 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4261 {
4262 kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4263 struct knote *kn, *tmp;
4264
4265 kqlock_held(kqwl);
4266
4267 TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4268 /*
4269 * If a knote that can adjust QoS is disabled because of the automatic
4270 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4271 * further overrides keep pushing.
4272 */
4273 if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
4274 (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
4275 (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4276 qos = MAX(qos, kn->kn_qos_override);
4277 continue;
4278 }
4279 knote_unsuppress(kqwl, kn);
4280 }
4281
4282 return qos;
4283 }
4284
4285 static int
4286 kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
4287 {
4288 workq_threadreq_t kqr = &kqwl->kqwl_request;
4289 struct kqueue *kq = &kqwl->kqwl_kqueue;
4290 thread_qos_t qos_override;
4291 thread_t thread = kqr_thread_fast(kqr);
4292 int rc = 0, op = KQWL_UTQ_NONE;
4293
4294 kqlock_held(kq);
4295
4296 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
4297 kqwl->kqwl_dynamicid, 0, 0);
4298
4299 /* nobody else should still be processing */
4300 assert((kq->kq_state & KQ_PROCESSING) == 0);
4301
4302 kq->kq_state |= KQ_PROCESSING;
4303
4304 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4305 op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4306 }
4307
4308 if (kevent_flags & KEVENT_FLAG_PARKING) {
4309 /*
4310 * When "parking" we want to process events and, if no events are
4311 * found, unbind.
4312 *
4313 * However, non-overcommit threads sometimes park even when they have
4314 * more work so that the pool can narrow. For these, we need to unbind
4315 * early, so that calling kqworkloop_update_threads_qos() can ask the
4316 * workqueue subsystem whether the thread should park despite having
4317 * pending events.
4318 */
4319 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
4320 op = KQWL_UTQ_PARKING;
4321 } else {
4322 op = KQWL_UTQ_UNBINDING;
4323 }
4324 }
4325 if (op == KQWL_UTQ_NONE) {
4326 goto done;
4327 }
4328
4329 qos_override = kqworkloop_acknowledge_events(kqwl);
4330
4331 if (op == KQWL_UTQ_UNBINDING) {
4332 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_IMMEDIATELY);
4333 kqworkloop_release_live(kqwl);
4334 }
4335 kqworkloop_update_threads_qos(kqwl, op, qos_override);
4336 if (op == KQWL_UTQ_PARKING) {
4337 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
4338 /*
4339 * We cannot trust tr_kq_wakeup when looking at stay active knotes.
4340 * We need to process once, and kqworkloop_end_processing will
4341 * handle the unbind.
4342 */
4343 } else if (!kqr->tr_kq_wakeup || kqwl->kqwl_owner) {
4344 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
4345 kqworkloop_release_live(kqwl);
4346 rc = -1;
4347 }
4348 } else if (op == KQWL_UTQ_UNBINDING) {
4349 if (kqr_thread(kqr) == thread) {
4350 /*
4351 * The thread request fired again, passed the admission check and
4352 * got bound to the current thread again.
4353 */
4354 } else {
4355 rc = -1;
4356 }
4357 }
4358
4359 if (rc == 0) {
4360 /*
4361 * Reset wakeup bit to notice stay active events firing while we are
4362 * processing, as we cannot rely on the stayactive bucket emptiness.
4363 */
4364 kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
4365 } else {
4366 kq->kq_state &= ~KQ_PROCESSING;
4367 }
4368
4369 if (rc == -1) {
4370 kqworkloop_unbind_delayed_override_drop(thread);
4371 }
4372
4373 done:
4374 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
4375 kqwl->kqwl_dynamicid, 0, 0);
4376
4377 return rc;
4378 }
4379
4380 /*
4381 * Return 0 to indicate that processing should proceed,
4382 * -1 if there is nothing to process.
4383 * EBADF if the kqueue is draining
4384 *
4385 * Called with kqueue locked and returns the same way,
4386 * but may drop lock temporarily.
4387 * May block.
4388 */
4389 static int
4390 kqfile_begin_processing(struct kqfile *kq)
4391 {
4392 struct kqtailq *suppressq;
4393
4394 kqlock_held(kq);
4395
4396 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4397 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
4398 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4399
4400 /* wait to become the exclusive processing thread */
4401 for (;;) {
4402 if (kq->kqf_state & KQ_DRAIN) {
4403 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4404 VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
4405 return EBADF;
4406 }
4407
4408 if ((kq->kqf_state & KQ_PROCESSING) == 0) {
4409 break;
4410 }
4411
4412 /* if someone else is processing the queue, wait */
4413 kq->kqf_state |= KQ_PROCWAIT;
4414 suppressq = &kq->kqf_suppressed;
4415 waitq_assert_wait64((struct waitq *)&kq->kqf_wqs,
4416 CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
4417 TIMEOUT_WAIT_FOREVER);
4418
4419 kqunlock(kq);
4420 thread_block(THREAD_CONTINUE_NULL);
4421 kqlock(kq);
4422 }
4423
4424 /* Nobody else processing */
4425
4426 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
4427 waitq_set_clear_preposts(&kq->kqf_wqs);
4428 kq->kqf_state &= ~KQ_WAKEUP;
4429
4430 /* anything left to process? */
4431 if (TAILQ_EMPTY(&kq->kqf_queue)) {
4432 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4433 VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
4434 return -1;
4435 }
4436
4437 /* convert to processing mode */
4438 kq->kqf_state |= KQ_PROCESSING;
4439
4440 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4441 VM_KERNEL_UNSLIDE_OR_PERM(kq));
4442
4443 return 0;
4444 }
4445
4446 /*
4447 * Try to end processing; only called when a workq thread is attempting to
4448 * park (KEVENT_FLAG_PARKING is set).
4449 *
4450 * When returning -1, the kqworkq is set up again so that it is ready to be
4451 * processed.
4452 */
4453 static int
4454 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4455 int kevent_flags)
4456 {
4457 if (!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index])) {
4458 /* remember we didn't process everything */
4459 kqr->tr_kq_wakeup = true;
4460 }
4461
4462 if (kevent_flags & KEVENT_FLAG_PARKING) {
4463 /*
4464 * If acknowledging events "succeeds" (returns 0), it means there are still events,
4465 * which is a failure condition for end_processing.
4466 */
4467 int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4468 KQWQAE_END_PROCESSING);
4469 if (rc == 0) {
4470 return -1;
4471 }
4472 }
4473
4474 return 0;
4475 }
4476
4477 /*
4478 * Try to end processing; only called when a workq thread is attempting to
4479 * park (KEVENT_FLAG_PARKING is set).
4480 *
4481 * When returning -1, the kqworkloop is set up again so that it is ready to be
4482 * processed (as if kqworkloop_begin_processing had just been called).
4483 *
4484 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4485 * the kqworkloop is unbound from its servicer as a side effect.
4486 */
4487 static int
4488 kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
4489 {
4490 struct kqueue *kq = &kqwl->kqwl_kqueue;
4491 workq_threadreq_t kqr = &kqwl->kqwl_request;
4492 thread_qos_t qos_override;
4493 thread_t thread = kqr_thread_fast(kqr);
4494 int rc = 0;
4495
4496 kqlock_held(kq);
4497
4498 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
4499 kqwl->kqwl_dynamicid, 0, 0);
4500
4501 if (flags & KQ_PROCESSING) {
4502 assert(kq->kq_state & KQ_PROCESSING);
4503
4504 /*
4505 * If we still have queued stayactive knotes, remember we didn't finish
4506 * processing all of them. This should be extremely rare and would
4507 * require having a lot of them registered and fired.
4508 */
4509 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
4510 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
4511 KQWL_BUCKET_STAYACTIVE);
4512 }
4513
4514 /*
4515 * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
4516 * still under the lock.
4517 *
4518 * So we do everything kqworkloop_unbind() would do, but because we're
4519 * inside kqueue_process(), if the workloop actually received events
4520 * while our locks were dropped, we have the opportunity to fail the end
4521 * processing and loop again.
4522 *
4523 * This avoids going through the process-wide workqueue lock, and hence
4524 * scales better.
4525 */
4526 if (kevent_flags & KEVENT_FLAG_PARKING) {
4527 qos_override = kqworkloop_acknowledge_events(kqwl);
4528 }
4529 }
4530
4531 if (kevent_flags & KEVENT_FLAG_PARKING) {
4532 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
4533 if (kqr->tr_kq_wakeup && !kqwl->kqwl_owner) {
4534 /*
4535 * Reset wakeup bit to notice stay active events firing while we are
4536 * processing, as we cannot rely on the stayactive bucket emptiness.
4537 */
4538 kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
4539 rc = -1;
4540 } else {
4541 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
4542 kqworkloop_release_live(kqwl);
4543 kq->kq_state &= ~flags;
4544 }
4545 } else {
4546 kq->kq_state &= ~flags;
4547 kq->kq_state |= KQ_R2K_ARMED;
4548 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
4549 }
4550
4551 if ((kevent_flags & KEVENT_FLAG_PARKING) && rc == 0) {
4552 kqworkloop_unbind_delayed_override_drop(thread);
4553 }
4554
4555 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
4556 kqwl->kqwl_dynamicid, 0, 0);
4557
4558 return rc;
4559 }
4560
4561 /*
4562 * Called with kqueue lock held.
4563 *
4564 * 0: no more events
4565 * -1: has more events
4566 * EBADF: kqueue is in draining mode
4567 */
4568 static int
4569 kqfile_end_processing(struct kqfile *kq)
4570 {
4571 struct kqtailq *suppressq = &kq->kqf_suppressed;
4572 struct knote *kn;
4573 int procwait;
4574
4575 kqlock_held(kq);
4576
4577 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4578
4579 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4580 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4581
4582 /*
4583 * Return suppressed knotes to their original state.
4584 */
4585 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4586 assert(kn->kn_status & KN_SUPPRESSED);
4587 knote_unsuppress(kq, kn);
4588 }
4589
4590 procwait = (kq->kqf_state & KQ_PROCWAIT);
4591 kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4592
4593 if (procwait) {
4594 /* first wake up any thread already waiting to process */
4595 waitq_wakeup64_all((struct waitq *)&kq->kqf_wqs,
4596 CAST_EVENT64_T(suppressq), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
4597 }
4598
4599 if (kq->kqf_state & KQ_DRAIN) {
4600 return EBADF;
4601 }
4602 return (kq->kqf_state & KQ_WAKEUP) ? -1 : 0;
4603 }
4604
4605 static int
4606 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4607 struct kqueue_workloop_params *params, int *retval)
4608 {
4609 int error = 0;
4610 struct kqworkloop *kqwl;
4611 struct filedesc *fdp = p->p_fd;
4612 workq_threadreq_param_t trp = { };
4613
4614 switch (cmd) {
4615 case KQ_WORKLOOP_CREATE:
4616 if (!params->kqwlp_flags) {
4617 error = EINVAL;
4618 break;
4619 }
4620
4621 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4622 (params->kqwlp_sched_pri < 1 ||
4623 params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4624 error = EINVAL;
4625 break;
4626 }
4627
4628 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4629 invalid_policy(params->kqwlp_sched_pol)) {
4630 error = EINVAL;
4631 break;
4632 }
4633
4634 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4635 (params->kqwlp_cpu_percent <= 0 ||
4636 params->kqwlp_cpu_percent > 100 ||
4637 params->kqwlp_cpu_refillms <= 0 ||
4638 params->kqwlp_cpu_refillms > 0x00ffffff)) {
4639 error = EINVAL;
4640 break;
4641 }
4642
4643 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4644 trp.trp_flags |= TRP_PRIORITY;
4645 trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4646 }
4647 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4648 trp.trp_flags |= TRP_POLICY;
4649 trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4650 }
4651 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4652 trp.trp_flags |= TRP_CPUPERCENT;
4653 trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4654 trp.trp_refillms = params->kqwlp_cpu_refillms;
4655 }
4656
4657 error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
4658 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4659 KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
4660 if (error) {
4661 break;
4662 }
4663
4664 if (!(fdp->fd_flags & FD_WORKLOOP)) {
4665 /* FD_WORKLOOP indicates we've created a workloop via this
4666 * syscall at some point; it's only ever added to a process, never
4667 * removed.
4668 */
4669 proc_fdlock(p);
4670 fdp->fd_flags |= FD_WORKLOOP;
4671 proc_fdunlock(p);
4672 }
4673 break;
4674 case KQ_WORKLOOP_DESTROY:
4675 error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
4676 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4677 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
4678 if (error) {
4679 break;
4680 }
4681 kqlock(kqwl);
4682 trp.trp_value = kqwl->kqwl_params;
4683 if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
4684 trp.trp_flags |= TRP_RELEASED;
4685 kqwl->kqwl_params = trp.trp_value;
4686 kqworkloop_release_live(kqwl);
4687 } else {
4688 error = EINVAL;
4689 }
4690 kqunlock(kqwl);
4691 kqworkloop_release(kqwl);
4692 break;
4693 }
4694 *retval = 0;
4695 return error;
4696 }
4697
4698 int
4699 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
4700 {
4701 struct kqueue_workloop_params params = {
4702 .kqwlp_id = 0,
4703 };
4704 if (uap->sz < sizeof(params.kqwlp_version)) {
4705 return EINVAL;
4706 }
4707
4708 size_t copyin_sz = MIN(sizeof(params), uap->sz);
4709 int rv = copyin(uap->addr, &params, copyin_sz);
4710 if (rv) {
4711 return rv;
4712 }
4713
4714 if (params.kqwlp_version != (int)uap->sz) {
4715 return EINVAL;
4716 }
4717
4718 return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
4719 retval);
4720 }
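/*
 * Illustrative sketch (not part of this file, and normally issued by
 * libdispatch/libpthread rather than directly): the parameter block the
 * copyin/validation above expects for KQ_WORKLOOP_CREATE. `my_id` is a
 * hypothetical 64-bit workloop identifier.
 *
 *	struct kqueue_workloop_params p = {
 *		.kqwlp_version   = sizeof(p),	// must equal the passed-in sz
 *		.kqwlp_id        = my_id,
 *		.kqwlp_flags     = KQ_WORKLOOP_CREATE_SCHED_PRI,
 *		.kqwlp_sched_pri = 31,		// checked above: 1..63
 *	};
 *	// cmd = KQ_WORKLOOP_CREATE, options = 0, addr = &p, sz = sizeof(p)
 */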
4721
4722 /*ARGSUSED*/
4723 static int
4724 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
4725 __unused vfs_context_t ctx)
4726 {
4727 struct kqfile *kq = (struct kqfile *)fp->f_data;
4728 struct kqtailq *suppressq = &kq->kqf_suppressed;
4729 struct kqtailq *queue = &kq->kqf_queue;
4730 struct knote *kn;
4731 int retnum = 0;
4732
4733 if (which != FREAD) {
4734 return 0;
4735 }
4736
4737 kqlock(kq);
4738
4739 assert((kq->kqf_state & KQ_WORKQ) == 0);
4740
4741 /*
4742 * If this is the first pass, link the wait queue associated with the
4743 * kqueue onto the wait queue set for the select(). Normally we
4744 * use selrecord() for this, but it uses the wait queue within the
4745 * selinfo structure and we need to use the main one for the kqueue to
4746 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
4747 * (The select() call will unlink them when it ends).
4748 */
4749 if (wq_link_id != NULL) {
4750 thread_t cur_act = current_thread();
4751 struct uthread * ut = get_bsdthread_info(cur_act);
4752
4753 kq->kqf_state |= KQ_SEL;
4754 waitq_link((struct waitq *)&kq->kqf_wqs, ut->uu_wqset,
4755 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
4756
4757 /* always consume the reserved link object */
4758 waitq_link_release(*(uint64_t *)wq_link_id);
4759 *(uint64_t *)wq_link_id = 0;
4760
4761 /*
4762 * selprocess() is expecting that we send it back the waitq
4763 * that was just added to the thread's waitq set. In order
4764 * to not change the selrecord() API (which is exported to
4765 * kexts), we pass this value back through the
4766 * void *wq_link_id pointer we were passed. We need to use
4767 * memcpy here because the pointer may not be properly aligned
4768 * on 32-bit systems.
4769 */
4770 void *wqptr = &kq->kqf_wqs;
4771 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
4772 }
4773
4774 if (kqfile_begin_processing(kq) == -1) {
4775 kqunlock(kq);
4776 return 0;
4777 }
4778
4779 if (!TAILQ_EMPTY(queue)) {
4780 /*
4781 * there is something queued - but it might be a
4782 * KN_STAYACTIVE knote, which may or may not have
4783 * any events pending. Otherwise, we have to walk
4784 * the list of knotes to see, and peek at the
4785 * (non-vanished) stay-active ones to be really sure.
4786 */
4787 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
4788 if (kn->kn_status & KN_ACTIVE) {
4789 retnum = 1;
4790 goto out;
4791 }
4792 assert(kn->kn_status & KN_STAYACTIVE);
4793 knote_suppress(kq, kn);
4794 }
4795
4796 /*
4797 * There were no regular events on the queue, so take
4798 * a deeper look at the stay-queued ones we suppressed.
4799 */
4800 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
4801 KNOTE_LOCK_CTX(knlc);
4802 int result = 0;
4803
4804 /* If it didn't vanish while suppressed, peek at it */
4805 if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
4806 KNOTE_KQ_LOCK_ON_FAILURE)) {
4807 continue;
4808 }
4809
4810 result = filter_call(knote_fops(kn), f_peek(kn));
4811
4812 kqlock(kq);
4813 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4814
4815 /* unsuppress it */
4816 knote_unsuppress(kq, kn);
4817
4818 /* has data or it has to report a vanish */
4819 if (result & FILTER_ACTIVE) {
4820 retnum = 1;
4821 goto out;
4822 }
4823 }
4824 }
4825
4826 out:
4827 kqfile_end_processing(kq);
4828 kqunlock(kq);
4829 return retnum;
4830 }
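/*
 * Illustrative userspace sketch (not part of this file): the path above is
 * what makes a plain kqueue file descriptor selectable. `kqfd` is a
 * hypothetical descriptor returned by kqueue().
 *
 *	fd_set rfds;
 *	FD_ZERO(&rfds);
 *	FD_SET(kqfd, &rfds);
 *	if (select(kqfd + 1, &rfds, NULL, NULL, NULL) > 0 &&
 *	    FD_ISSET(kqfd, &rfds)) {
 *		// at least one event appears pending; a following
 *		// kevent() call with a zero timeout should return it
 *	}
 */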
4831
4832 /*
4833 * kqueue_close -
4834 */
4835 /*ARGSUSED*/
4836 static int
4837 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4838 {
4839 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
4840
4841 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4842 kqueue_dealloc(&kqf->kqf_kqueue);
4843 fg->fg_data = NULL;
4844 return 0;
4845 }
4846
4847 /*
4848 * Max depth of the nested kq path that can be created.
4849 * Note that this has to be less than the size of kq_level
4850 * to avoid wrapping around and mislabeling the level.
4851 */
4852 #define MAX_NESTED_KQ 1000
4853
4854 /*ARGSUSED*/
4855 /*
4856 * The caller has taken a use-count reference on this kqueue and will donate it
4857 * to the kqueue we are being added to. This keeps the kqueue from closing until
4858 * that relationship is torn down.
4859 */
4860 static int
4861 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
4862 __unused struct kevent_qos_s *kev)
4863 {
4864 struct kqfile *kqf = (struct kqfile *)fp->f_data;
4865 struct kqueue *kq = &kqf->kqf_kqueue;
4866 struct kqueue *parentkq = knote_get_kq(kn);
4867
4868 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4869
4870 if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
4871 knote_set_error(kn, EINVAL);
4872 return 0;
4873 }
4874
4875 /*
4876 * We have to avoid creating a cycle when nesting kqueues
4877 * inside another. Rather than trying to walk the whole
4878 * potential DAG of nested kqueues, we just use a simple
4879 * ceiling protocol. When a kqueue is inserted into another,
4880 * we check that the (future) parent is not already nested
4881 * into another kqueue at a lower level than the potential
4882 * child (because it could indicate a cycle). If that test
4883 * passes, we just mark the nesting levels accordingly.
4884 *
4885 * Only up to MAX_NESTED_KQ can be nested.
4886 *
4887 * Note: kqworkq and kqworkloop cannot be nested and have reused their
4888 * kq_level field, so ignore these as parents.
4889 */
4890
4891 kqlock(parentkq);
4892
4893 if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
4894 if (parentkq->kq_level > 0 &&
4895 parentkq->kq_level < kq->kq_level) {
4896 kqunlock(parentkq);
4897 knote_set_error(kn, EINVAL);
4898 return 0;
4899 }
4900
4901 /* set parent level appropriately */
4902 uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
4903 if (plevel < kq->kq_level + 1) {
4904 if (kq->kq_level + 1 > MAX_NESTED_KQ) {
4905 kqunlock(parentkq);
4906 knote_set_error(kn, EINVAL);
4907 return 0;
4908 }
4909 plevel = kq->kq_level + 1;
4910 }
4911
4912 parentkq->kq_level = plevel;
4913 }
4914
4915 kqunlock(parentkq);
4916
4917 kn->kn_filtid = EVFILTID_KQREAD;
4918 kqlock(kq);
4919 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
4920 /* indicate nesting in child, if needed */
4921 if (kq->kq_level == 0) {
4922 kq->kq_level = 1;
4923 }
4924
4925 int count = kq->kq_count;
4926 kqunlock(kq);
4927 return count > 0;
4928 }
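/*
 * Illustrative userspace sketch (not part of this file): nesting one kqueue
 * inside another, which is what drives the level-ceiling checks above
 * (at most MAX_NESTED_KQ levels deep).
 *
 *	int inner = kqueue();
 *	int outer = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(outer, &kev, 1, NULL, 0, NULL);
 *	// `outer` now reports EVFILT_READ whenever `inner` has pending
 *	// events, with data reflecting how many are queued.
 */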
4929
4930 /*
4931 * kqueue_drain - called when kq is closed
4932 */
4933 /*ARGSUSED*/
4934 static int
4935 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
4936 {
4937 struct kqfile *kqf = (struct kqfile *)fp->fp_glob->fg_data;
4938
4939 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4940
4941 kqlock(kqf);
4942 kqf->kqf_state |= KQ_DRAIN;
4943
4944 /* wakeup sleeping threads */
4945 if ((kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) != 0) {
4946 kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
4947 (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
4948 KQ_EVENT,
4949 THREAD_RESTART,
4950 WAITQ_ALL_PRIORITIES);
4951 }
4952
4953 /* wakeup threads waiting their turn to process */
4954 if (kqf->kqf_state & KQ_PROCWAIT) {
4955 assert(kqf->kqf_state & KQ_PROCESSING);
4956
4957 kqf->kqf_state &= ~KQ_PROCWAIT;
4958 (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
4959 CAST_EVENT64_T(&kqf->kqf_suppressed),
4960 THREAD_RESTART, WAITQ_ALL_PRIORITIES);
4961 }
4962
4963 kqunlock(kqf);
4964 return 0;
4965 }
4966
4967 /*ARGSUSED*/
4968 int
4969 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
4970 {
4971 assert((kq->kq_state & KQ_WORKQ) == 0);
4972
4973 kqlock(kq);
4974 if (isstat64 != 0) {
4975 struct stat64 *sb64 = (struct stat64 *)ub;
4976
4977 bzero((void *)sb64, sizeof(*sb64));
4978 sb64->st_size = kq->kq_count;
4979 if (kq->kq_state & KQ_KEV_QOS) {
4980 sb64->st_blksize = sizeof(struct kevent_qos_s);
4981 } else if (kq->kq_state & KQ_KEV64) {
4982 sb64->st_blksize = sizeof(struct kevent64_s);
4983 } else if (IS_64BIT_PROCESS(p)) {
4984 sb64->st_blksize = sizeof(struct user64_kevent);
4985 } else {
4986 sb64->st_blksize = sizeof(struct user32_kevent);
4987 }
4988 sb64->st_mode = S_IFIFO;
4989 } else {
4990 struct stat *sb = (struct stat *)ub;
4991
4992 bzero((void *)sb, sizeof(*sb));
4993 sb->st_size = kq->kq_count;
4994 if (kq->kq_state & KQ_KEV_QOS) {
4995 sb->st_blksize = sizeof(struct kevent_qos_s);
4996 } else if (kq->kq_state & KQ_KEV64) {
4997 sb->st_blksize = sizeof(struct kevent64_s);
4998 } else if (IS_64BIT_PROCESS(p)) {
4999 sb->st_blksize = sizeof(struct user64_kevent);
5000 } else {
5001 sb->st_blksize = sizeof(struct user32_kevent);
5002 }
5003 sb->st_mode = S_IFIFO;
5004 }
5005 kqunlock(kq);
5006 return 0;
5007 }
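/*
 * Illustrative userspace sketch (not part of this file): what the stat
 * fields filled in above look like to a caller. `kqfd` is hypothetical.
 *
 *	struct stat sb;
 *	if (fstat(kqfd, &sb) == 0) {
 *		// S_ISFIFO(sb.st_mode) is true for a kqueue;
 *		// sb.st_size is the number of pending events, and
 *		// sb.st_blksize reflects the kevent structure size in use.
 *	}
 */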
5008
5009 static inline bool
5010 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5011 {
5012 if (current_proc() == kq->kq_p) {
5013 /*
5014 * Setting an AST from a non-BSD syscall is unsafe: mach_msg_trap() can
5015 * do combined send/receive and, in the case of self-IPC, the AST may be
5016 * set on a thread that will not return to userspace and needs the
5017 * thread the AST would create to unblock itself.
5018 *
5019 * At this time, we really want to target:
5020 *
5021 * - kevent variants that can cause thread creations, and dispatch
5022 * really only uses kevent_qos and kevent_id,
5023 *
5024 * - workq_kernreturn (directly about thread creations)
5025 *
5026 * - bsdthread_ctl which is used for qos changes and has direct impact
5027 * on the creator thread scheduling decisions.
5028 */
5029 switch (current_uthread()->syscall_code) {
5030 case SYS_kevent_qos:
5031 case SYS_kevent_id:
5032 case SYS_workq_kernreturn:
5033 case SYS_bsdthread_ctl:
5034 return true;
5035 }
5036 }
5037 return false;
5038 }
5039
5040 /*
5041 * Interact with the pthread kext to request a servicing there at a specific QoS
5042 * level.
5043 *
5044 * - Caller holds the workq request lock
5045 *
5046 * - May be called with the kqueue's wait queue set locked,
5047 * so cannot do anything that could recurse on that.
5048 */
5049 static void
5050 kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t kqr,
5051 kq_index_t qos, int flags)
5052 {
5053 assert(kqr->tr_kq_wakeup);
5054 assert(kqr_thread(kqr) == THREAD_NULL);
5055 assert(!kqr_thread_requested(kqr));
5056 struct turnstile *ts = TURNSTILE_NULL;
5057
5058 if (workq_is_exiting(kq->kq_p)) {
5059 return;
5060 }
5061
5062 kqlock_held(kq);
5063
5064 if (kq->kq_state & KQ_WORKLOOP) {
5065 __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;
5066
5067 assert(kqwl->kqwl_owner == THREAD_NULL);
5068 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5069 kqwl->kqwl_dynamicid, 0, qos, kqr->tr_kq_wakeup);
5070 ts = kqwl->kqwl_turnstile;
5071 /* Add a thread request reference on the kqueue. */
5072 kqworkloop_retain(kqwl);
5073 } else {
5074 assert(kq->kq_state & KQ_WORKQ);
5075 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
5076 -1, 0, qos, kqr->tr_kq_wakeup);
5077 }
5078
5079 /*
5080 * New-style thread request supported.
5081 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5082 * its use until a corresponding kqueue_threadreq_bind callback.
5083 */
5084 if (kqueue_threadreq_can_use_ast(kq)) {
5085 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5086 }
5087 if (qos == KQWQ_QOS_MANAGER) {
5088 qos = WORKQ_THREAD_QOS_MANAGER;
5089 }
5090 if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
5091 /*
5092 * Process is shutting down or exec'ing.
5093 * All the kqueues are going to be cleaned up
5094 * soon. Forget we even asked for a thread -
5095 * and make sure we don't ask for more.
5096 */
5097 kq->kq_state &= ~KQ_R2K_ARMED;
5098 kqueue_release_live(kq);
5099 }
5100 }
5101
5102 /*
5103 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5104 *
5105 * This is used when kqueue_threadreq_bind may cause a lock inversion.
5106 */
5107 __attribute__((always_inline))
5108 void
5109 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5110 struct uthread *ut)
5111 {
5112 ut->uu_kqr_bound = kqr;
5113 kqr->tr_thread = ut->uu_thread;
5114 kqr->tr_state = WORKQ_TR_STATE_BINDING;
5115 }
5116
5117 /*
5118 * kqueue_threadreq_bind_commit - commit a bind prepost
5119 *
5120 * The workq code has to commit any binding prepost before the thread has
5121 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5122 */
5123 void
5124 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5125 {
5126 struct uthread *ut = get_bsdthread_info(thread);
5127 workq_threadreq_t kqr = ut->uu_kqr_bound;
5128 kqueue_t kqu = kqr_kqueue(p, kqr);
5129
5130 kqlock(kqu);
5131 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5132 kqueue_threadreq_bind(p, kqr, thread, 0);
5133 }
5134 kqunlock(kqu);
5135 }
5136
5137 static void
5138 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5139 workq_kern_threadreq_flags_t flags)
5140 {
5141 assert(kqr_thread_requested_pending(kqr));
5142
5143 kqlock_held(kqu);
5144
5145 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5146 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5147 }
5148 workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5149 }
5150
5151 /*
5152 * kqueue_threadreq_bind - bind thread to processing kqrequest
5153 *
5154 * The provided thread will be responsible for delivering events
5155 * associated with the given kqrequest. Bind it and get ready for
5156 * the thread to eventually arrive.
5157 */
5158 void
5159 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5160 unsigned int flags)
5161 {
5162 kqueue_t kqu = kqr_kqueue(p, kqr);
5163 struct uthread *ut = get_bsdthread_info(thread);
5164
5165 kqlock_held(kqu);
5166
5167 assert(ut->uu_kqueue_override == 0);
5168
5169 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5170 assert(ut->uu_kqr_bound == kqr);
5171 assert(kqr->tr_thread == thread);
5172 } else {
5173 assert(kqr_thread_requested_pending(kqr));
5174 assert(kqr->tr_thread == THREAD_NULL);
5175 assert(ut->uu_kqr_bound == NULL);
5176 ut->uu_kqr_bound = kqr;
5177 kqr->tr_thread = thread;
5178 }
5179
5180 kqr->tr_state = WORKQ_TR_STATE_BOUND;
5181
5182 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5183 struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5184
5185 if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5186 /*
5187 * <rdar://problem/38626999> shows that asserting here is not ok.
5188 *
5189 * This is not supposed to happen for correct use of the interface,
5190 * but it is sadly possible for userspace (with the help of memory
5191 * corruption, such as over-release of a dispatch queue) to make
5192 * the creator thread the "owner" of a workloop.
5193 *
5194 * Once that happens, and that creator thread picks up the same
5195 * workloop as a servicer, we trip this codepath. We need to fix up
5196 * the state to forget about this thread being the owner, as the
5197 * entire workloop state machine expects servicers to never be
5198 * owners and everything would basically go downhill from here.
5199 */
5200 kqu.kqwl->kqwl_owner = THREAD_NULL;
5201 if (kqworkloop_override(kqu.kqwl)) {
5202 thread_drop_kevent_override(thread);
5203 }
5204 }
5205
5206 if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5207 /*
5208 * Past this point, the interlock is the kq req lock again,
5209 * so we can fix the inheritor for good.
5210 */
5211 filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5212 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5213 }
5214
5215 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5216 thread_tid(thread), kqr->tr_kq_qos_index,
5217 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5218
5219 ut->uu_kqueue_override = kqr->tr_kq_override_index;
5220 if (kqr->tr_kq_override_index) {
5221 thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5222 }
5223 } else {
5224 assert(kqr->tr_kq_override_index == 0);
5225
5226 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5227 thread_tid(thread), kqr->tr_kq_qos_index,
5228 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5229 }
5230 }
5231
5232 /*
5233 * kqueue_threadreq_cancel - abort a pending thread request
5234 *
5235 * Called when exiting/exec'ing. Forget our pending request.
5236 */
5237 void
5238 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5239 {
5240 kqueue_release(kqr_kqueue(p, kqr));
5241 }
5242
5243 workq_threadreq_param_t
5244 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5245 {
5246 struct kqworkloop *kqwl;
5247 workq_threadreq_param_t trp;
5248
5249 assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5250 kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5251 trp.trp_value = kqwl->kqwl_params;
5252 return trp;
5253 }
5254
5255 /*
5256 * kqueue_threadreq_unbind - unbind thread from processing kqueue
5257 *
5258 * End processing the per-QoS bucket of events and allow other threads
5259 * to be requested for future servicing.
5260 *
5261 * caller holds a reference on the kqueue.
5262 */
5263 void
5264 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5265 {
5266 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5267 kqworkloop_unbind(kqr_kqworkloop(kqr));
5268 } else {
5269 kqworkq_unbind(p, kqr);
5270 }
5271 }
5272
5273 /*
5274 * If we aren't already busy processing events [for this QoS],
5275 * request workq thread support as appropriate.
5276 *
5277 * TBD - for now, we don't segregate out processing by QoS.
5278 *
5279 * - May be called with the kqueue's wait queue set locked,
5280 * so cannot do anything that could recurse on that.
5281 */
5282 static void
5283 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5284 {
5285 workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5286
5287 /* convert to thread qos value */
5288 assert(qos_index < KQWQ_NBUCKETS);
5289
5290 if (!kqr->tr_kq_wakeup) {
5291 kqr->tr_kq_wakeup = true;
5292 if (!kqr_thread_requested(kqr)) {
5293 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5294 }
5295 }
5296 }
5297
5298 /*
5299 * This represents the asynchronous QoS a given workloop contributes,
5300 * hence is the max of the current active knotes (override index)
5301 * and the workloop max qos (userspace async qos).
5302 */
5303 static kq_index_t
5304 kqworkloop_override(struct kqworkloop *kqwl)
5305 {
5306 workq_threadreq_t kqr = &kqwl->kqwl_request;
5307 return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5308 }
5309
5310 static inline void
5311 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5312 {
5313 workq_threadreq_t kqr = &kqwl->kqwl_request;
5314
5315 kqlock_held(kqwl);
5316
5317 if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5318 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5319 act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5320 }
5321 }
5322
5323 static void
5324 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
5325 {
5326 workq_threadreq_t kqr = &kqwl->kqwl_request;
5327 struct kqueue *kq = &kqwl->kqwl_kqueue;
5328 kq_index_t old_override = kqworkloop_override(kqwl);
5329 kq_index_t i;
5330
5331 kqlock_held(kqwl);
5332
5333 switch (op) {
5334 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5335 if (qos == KQWL_BUCKET_STAYACTIVE) {
5336 /*
5337 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket; we only remember
5338 * a high watermark (kqwl_stayactive_qos) of any stay active knote
5339 * that was ever registered with this workloop.
5340 *
5341 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
5342 * knote, we use this high-watermark as a wakeup-index, and also set
5343 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
5344 * that at least one stay active knote has fired until the next full
5345 * processing of this bucket.
5346 */
5347 kqwl->kqwl_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
5348 qos = kqwl->kqwl_stayactive_qos;
5349 assert(qos);
5350 }
5351 if (kqwl->kqwl_wakeup_indexes & (1 << qos)) {
5352 assert(kqr->tr_kq_wakeup);
5353 break;
5354 }
5355
5356 kqwl->kqwl_wakeup_indexes |= (1 << qos);
5357 kqr->tr_kq_wakeup = true;
5358 kqworkloop_request_fire_r2k_notification(kqwl);
5359 goto recompute;
5360
5361 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
5362 assert(qos);
5363 if (kqwl->kqwl_stayactive_qos < qos) {
5364 kqwl->kqwl_stayactive_qos = qos;
5365 if (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
5366 assert(kqr->tr_kq_wakeup);
5367 kqwl->kqwl_wakeup_indexes |= (1 << qos);
5368 goto recompute;
5369 }
5370 }
5371 break;
5372
5373 case KQWL_UTQ_PARKING:
5374 case KQWL_UTQ_UNBINDING:
5375 kqr->tr_kq_override_index = qos;
5376 OS_FALLTHROUGH;
5377 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5378 if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5379 assert(qos == THREAD_QOS_UNSPECIFIED);
5380 }
5381 i = KQWL_BUCKET_STAYACTIVE;
5382 if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5383 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5384 }
5385 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
5386 (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
5387 /*
5388 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
5389 * knote may have fired, so we need to merge in kqwl_stayactive_qos.
5390 *
5391 * Unlike other buckets, this one is never empty but could be idle.
5392 */
5393 kqwl->kqwl_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
5394 kqwl->kqwl_wakeup_indexes |= (1 << kqwl->kqwl_stayactive_qos);
5395 } else {
5396 kqwl->kqwl_wakeup_indexes = 0;
5397 }
5398 for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
5399 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
5400 kqwl->kqwl_wakeup_indexes |= (1 << i);
5401 }
5402 }
5403 if (kqwl->kqwl_wakeup_indexes) {
5404 kqr->tr_kq_wakeup = true;
5405 kqworkloop_request_fire_r2k_notification(kqwl);
5406 } else {
5407 kqr->tr_kq_wakeup = false;
5408 }
5409 goto recompute;
5410
5411 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5412 kqr->tr_kq_override_index = qos;
5413 goto recompute;
5414
5415 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5416 recompute:
5417 /*
5418 * When modifying the wakeup QoS or the override QoS, we always need to
5419 * maintain our invariant that kqr_override_index is at least as large
5420 * as the highest QoS for which an event is fired.
5421 *
5422 * However, this override index can be larger when there is an overridden
5423 * suppressed knote pushing on the kqueue.
5424 */
5425 if (kqwl->kqwl_wakeup_indexes > (1 << qos)) {
5426 qos = (uint8_t)(fls(kqwl->kqwl_wakeup_indexes) - 1); /* fls is 1-based */
5427 }
5428 if (kqr->tr_kq_override_index < qos) {
5429 kqr->tr_kq_override_index = qos;
5430 }
5431 break;
5432
5433 case KQWL_UTQ_REDRIVE_EVENTS:
5434 break;
5435
5436 case KQWL_UTQ_SET_QOS_INDEX:
5437 kqr->tr_kq_qos_index = qos;
5438 break;
5439
5440 default:
5441 panic("unknown kqwl thread qos update operation: %d", op);
5442 }
5443
5444 thread_t kqwl_owner = kqwl->kqwl_owner;
5445 thread_t servicer = kqr_thread(kqr);
5446 boolean_t qos_changed = FALSE;
5447 kq_index_t new_override = kqworkloop_override(kqwl);
5448
5449 /*
5450 * Apply the diffs to the owner if applicable
5451 */
5452 if (kqwl_owner) {
5453 #if 0
5454 /* JMM - need new trace hooks for owner overrides */
5455 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5456 kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5457 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5458 #endif
5459 if (new_override == old_override) {
5460 // nothing to do
5461 } else if (old_override == THREAD_QOS_UNSPECIFIED) {
5462 thread_add_kevent_override(kqwl_owner, new_override);
5463 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5464 thread_drop_kevent_override(kqwl_owner);
5465 } else { /* old_override != new_override */
5466 thread_update_kevent_override(kqwl_owner, new_override);
5467 }
5468 }
5469
5470 /*
5471 * apply the diffs to the servicer
5472 */
5473 if (!kqr_thread_requested(kqr)) {
5474 /*
5475 * No servicer, nor thread-request
5476 *
5477 * Make a new thread request, unless there is an owner (or the workloop
5478 * is suspended in userland) or there is no asynchronous work in the
5479 * first place.
5480 */
5481
5482 if (kqwl_owner == NULL && kqr->tr_kq_wakeup) {
5483 int initiate_flags = 0;
5484 if (op == KQWL_UTQ_UNBINDING) {
5485 initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5486 }
5487 kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
5488 }
5489 } else if (servicer) {
5490 /*
5491 * Servicer in flight
5492 *
5493 * Just apply the diff to the servicer
5494 */
5495 struct uthread *ut = get_bsdthread_info(servicer);
5496 if (ut->uu_kqueue_override != new_override) {
5497 if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5498 thread_add_servicer_override(servicer, new_override);
5499 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5500 thread_drop_servicer_override(servicer);
5501 } else { /* ut->uu_kqueue_override != new_override */
5502 thread_update_servicer_override(servicer, new_override);
5503 }
5504 ut->uu_kqueue_override = new_override;
5505 qos_changed = TRUE;
5506 }
5507 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5508 /*
5509 * No events to deliver anymore.
5510 *
5511 * However, canceling with turnstiles is challenging, so the fact that
5512 * the request isn't useful will be discovered by the servicer itself
5513 * later on.
5514 */
5515 } else if (old_override != new_override) {
5516 /*
5517 * Request is in flight
5518 *
5519 * Apply the diff to the thread request
5520 */
5521 kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
5522 qos_changed = TRUE;
5523 }
5524
5525 if (qos_changed) {
5526 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
5527 thread_tid(servicer), kqr->tr_kq_qos_index,
5528 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5529 }
5530 }
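/*
 * Worked example for the wakeup-index bookkeeping above (numbers are
 * illustrative only): with events queued at QoS indexes 3 and 5,
 * kqwl_wakeup_indexes == (1 << 3) | (1 << 5) == 0x28. In the recompute
 * path, fls(0x28) - 1 == 5, so tr_kq_override_index is raised to at
 * least 5, which kqworkloop_override() then combines with
 * tr_kq_qos_index via MAX().
 */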
5531
5532 static void
5533 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
5534 {
5535 if ((kqwl->kqwl_state & KQ_PROCESSING) &&
5536 kqr_thread(&kqwl->kqwl_request) == current_thread()) {
5537 /*
5538 * kqworkloop_end_processing() will perform the required QoS
5539 * computations when it unsets the processing mode.
5540 */
5541 return;
5542 }
5543
5544 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
5545 }
5546
5547 static struct kqtailq *
5548 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5549 {
5550 if (kq.kq->kq_state & KQ_WORKLOOP) {
5551 return &kq.kqwl->kqwl_suppressed;
5552 } else if (kq.kq->kq_state & KQ_WORKQ) {
5553 return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index];
5554 } else {
5555 return &kq.kqf->kqf_suppressed;
5556 }
5557 }
5558
5559 struct turnstile *
5560 kqueue_alloc_turnstile(kqueue_t kqu)
5561 {
5562 struct kqworkloop *kqwl = kqu.kqwl;
5563 kq_state_t kq_state;
5564
5565 kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
5566 if (kq_state & KQ_HAS_TURNSTILE) {
5567 /* force a dependency to pair with the release-ordered atomic OR below */
5568 return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
5569 (uintptr_t)kq_state);
5570 }
5571
5572 if (!(kq_state & KQ_WORKLOOP)) {
5573 return TURNSTILE_NULL;
5574 }
5575
5576 struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
5577 bool workq_locked = false;
5578
5579 kqlock(kqu);
5580
5581 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5582 workq_locked = true;
5583 workq_kern_threadreq_lock(kqwl->kqwl_p);
5584 }
5585
5586 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
5587 free_ts = ts;
5588 ts = kqwl->kqwl_turnstile;
5589 } else {
5590 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
5591 ts, TURNSTILE_WORKLOOPS);
5592
5593 /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
5594 os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
5595
5596 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5597 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
5598 &kqwl->kqwl_request, kqwl->kqwl_owner,
5599 ts, TURNSTILE_IMMEDIATE_UPDATE);
5600 /*
5601 * The workq may no longer be the interlock after this.
5602 * In which case the inheritor wasn't updated.
5603 */
5604 }
5605 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
5606 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5607 }
5608 }
5609
5610 if (workq_locked) {
5611 workq_kern_threadreq_unlock(kqwl->kqwl_p);
5612 }
5613
5614 kqunlock(kqu);
5615
5616 if (free_ts) {
5617 turnstile_deallocate(free_ts);
5618 } else {
5619 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
5620 }
5621 return ts;
5622 }
5623
5624 __attribute__((always_inline))
5625 struct turnstile *
5626 kqueue_turnstile(kqueue_t kqu)
5627 {
5628 kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5629 if (kq_state & KQ_WORKLOOP) {
5630 return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5631 }
5632 return TURNSTILE_NULL;
5633 }
5634
5635 __attribute__((always_inline))
5636 struct turnstile *
5637 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5638 {
5639 struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5640 if (kqwl) {
5641 return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5642 }
5643 return TURNSTILE_NULL;
5644 }
5645
5646 static void
5647 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
5648 {
5649 workq_threadreq_t kqr = &kqwl->kqwl_request;
5650
5651 /*
5652 * This test is racy, but since we never remove this bit,
5653 * it allows us to avoid taking a lock.
5654 */
5655 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
5656 return;
5657 }
5658
5659 kqlock_held(kqwl);
5660
5661 if (kqr_thread_requested_pending(kqr)) {
5662 kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
5663 WORKQ_THREADREQ_MAKE_OVERCOMMIT);
5664 } else {
5665 kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
5666 }
5667 }
5668
5669 static void
5670 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
5671 kq_index_t override_index)
5672 {
5673 workq_threadreq_t kqr;
5674 kq_index_t old_override_index;
5675 kq_index_t queue_index = kn->kn_qos_index;
5676
5677 if (override_index <= queue_index) {
5678 return;
5679 }
5680
5681 kqr = kqworkq_get_request(kqwq, queue_index);
5682
5683 kqlock_held(kqwq);
5684
5685 old_override_index = kqr->tr_kq_override_index;
5686 if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
5687 thread_t servicer = kqr_thread(kqr);
5688 kqr->tr_kq_override_index = override_index;
5689
5690 /* apply the override to the (possibly incoming) servicing thread */
5691 if (servicer) {
5692 if (old_override_index) {
5693 thread_update_kevent_override(servicer, override_index);
5694 } else {
5695 thread_add_kevent_override(servicer, override_index);
5696 }
5697 }
5698 }
5699 }
5700
5701 static void
5702 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5703 {
5704 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5705 kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5706 qos);
5707 } else {
5708 kqworkq_update_override(kqu.kqwq, kn, qos);
5709 }
5710 }
5711
5712 static void
5713 kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
5714 enum kqwl_unbind_locked_mode how)
5715 {
5716 struct uthread *ut = get_bsdthread_info(thread);
5717 workq_threadreq_t kqr = &kqwl->kqwl_request;
5718
5719 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
5720 thread_tid(thread), 0, 0);
5721
5722 kqlock_held(kqwl);
5723
5724 assert(ut->uu_kqr_bound == kqr);
5725 ut->uu_kqr_bound = NULL;
5726 if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
5727 ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5728 thread_drop_servicer_override(thread);
5729 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5730 }
5731
5732 if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
5733 turnstile_update_inheritor(kqwl->kqwl_turnstile,
5734 TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
5735 turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
5736 TURNSTILE_INTERLOCK_HELD);
5737 }
5738
5739 kqr->tr_thread = THREAD_NULL;
5740 kqr->tr_state = WORKQ_TR_STATE_IDLE;
5741 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5742 }
5743
5744 static void
5745 kqworkloop_unbind_delayed_override_drop(thread_t thread)
5746 {
5747 struct uthread *ut = get_bsdthread_info(thread);
5748 assert(ut->uu_kqr_bound == NULL);
5749 if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5750 thread_drop_servicer_override(thread);
5751 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5752 }
5753 }
5754
5755 /*
5756 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
5757 *
5758 * It will acknowledge events, and possibly request a new thread if:
5759 * - there were active events left
5760 * - we pended waitq hook callouts during processing
5761 * - we pended wakeups while processing (or unsuppressing)
5762 *
5763 * Called with kqueue lock held.
5764 */
5765 static void
5766 kqworkloop_unbind(struct kqworkloop *kqwl)
5767 {
5768 struct kqueue *kq = &kqwl->kqwl_kqueue;
5769 workq_threadreq_t kqr = &kqwl->kqwl_request;
5770 thread_t thread = kqr_thread_fast(kqr);
5771 int op = KQWL_UTQ_PARKING;
5772 kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
5773
5774 assert(thread == current_thread());
5775
5776 kqlock(kqwl);
5777
5778 /*
5779 * Forcing the KQ_PROCESSING flag prevents the QoS updates caused by
5780 * unsuppressing knotes from being applied until the eventual call to
5781 * kqworkloop_update_threads_qos() below.
5782 */
5783 assert((kq->kq_state & KQ_PROCESSING) == 0);
5784 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5785 kq->kq_state |= KQ_PROCESSING;
5786 qos_override = kqworkloop_acknowledge_events(kqwl);
5787 kq->kq_state &= ~KQ_PROCESSING;
5788 }
5789
5790 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
5791 kqworkloop_update_threads_qos(kqwl, op, qos_override);
5792
5793 kqunlock(kqwl);
5794
5795 /*
5796 * Drop the override on the current thread last, after the call to
5797 * kqworkloop_update_threads_qos above.
5798 */
5799 kqworkloop_unbind_delayed_override_drop(thread);
5800
5801 /* If last reference, dealloc the workloop kq */
5802 kqworkloop_release(kqwl);
5803 }
5804
5805 static thread_qos_t
5806 kqworkq_unbind_locked(struct kqworkq *kqwq,
5807 workq_threadreq_t kqr, thread_t thread)
5808 {
5809 struct uthread *ut = get_bsdthread_info(thread);
5810 kq_index_t old_override = kqr->tr_kq_override_index;
5811
5812 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
5813 thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
5814
5815 kqlock_held(kqwq);
5816
5817 assert(ut->uu_kqr_bound == kqr);
5818 ut->uu_kqr_bound = NULL;
5819 kqr->tr_thread = THREAD_NULL;
5820 kqr->tr_state = WORKQ_TR_STATE_IDLE;
5821 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5822 kqwq->kqwq_state &= ~KQ_R2K_ARMED;
5823
5824 return old_override;
5825 }
5826
5827 /*
5828 * kqworkq_unbind - unbind of a workq kqueue from a thread
5829 *
5830 * We may have to request new threads.
5831 * This can happen when there are no waiting processing threads and:
5832 * - there were active events we never got to (count > 0)
5833 * - we pended waitq hook callouts during processing
5834 * - we pended wakeups while processing (or unsuppressing)
5835 */
5836 static void
5837 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
5838 {
5839 struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
5840 __assert_only int rc;
5841
5842 kqlock(kqwq);
5843 rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
5844 assert(rc == -1);
5845 kqunlock(kqwq);
5846 }
5847
5848 workq_threadreq_t
5849 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
5850 {
5851 assert(qos_index < KQWQ_NBUCKETS);
5852 return &kqwq->kqwq_request[qos_index];
5853 }
5854
5855 static void
5856 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
5857 {
5858 kq_index_t qos = _pthread_priority_thread_qos(pp);
5859
5860 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5861 assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
5862 pp = _pthread_priority_normalize(pp);
5863 } else if (kqu.kq->kq_state & KQ_WORKQ) {
5864 if (qos == THREAD_QOS_UNSPECIFIED) {
5865 /* On workqueues, outside of QoS means MANAGER */
5866 qos = KQWQ_QOS_MANAGER;
5867 pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
5868 } else {
5869 pp = _pthread_priority_normalize(pp);
5870 }
5871 } else {
5872 pp = _pthread_unspecified_priority();
5873 qos = THREAD_QOS_UNSPECIFIED;
5874 }
5875
5876 kn->kn_qos = (int32_t)pp;
5877
5878 if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
5879 /* Never lower QoS when in "Merge" mode */
5880 kn->kn_qos_override = qos;
5881 }
5882
5883 /* only adjust in-use qos index when not suppressed */
5884 if (kn->kn_status & KN_SUPPRESSED) {
5885 kqueue_update_override(kqu, kn, qos);
5886 } else if (kn->kn_qos_index != qos) {
5887 knote_dequeue(kqu, kn);
5888 kn->kn_qos_index = qos;
5889 }
5890 }
5891
5892 static void
5893 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
5894 {
5895 thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
5896
5897 kqlock_held(kq);
5898
5899 assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
5900 assert(qos_index < THREAD_QOS_LAST);
5901
5902 /*
5903 * Early exit for knotes that should not change QoS
5904 */
5905 if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
5906 panic("filter %d cannot change QoS", kn->kn_filtid);
5907 } else if (__improbable(!knote_has_qos(kn))) {
5908 return;
5909 }
5910
5911 /*
5912 * knotes with the FALLBACK flag will only use their registration QoS if the
5913 * incoming event has no QoS; otherwise, the registration QoS acts as a floor.
5914 */
5915 thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
5916 if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
5917 if (qos_index == THREAD_QOS_UNSPECIFIED) {
5918 qos_index = req_qos;
5919 }
5920 } else {
5921 if (qos_index < req_qos) {
5922 qos_index = req_qos;
5923 }
5924 }
5925 if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
5926 /* Never lower QoS when in "Merge" mode */
5927 return;
5928 }
5929
5930 if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
5931 /*
5932 * When we're trying to update the QoS override and both an
5933 * f_event() and other f_* calls are running concurrently, any of these
5934 * in-flight calls may want to perform overrides that aren't properly
5935 * serialized with each other.
5936 *
5937 * The first update that observes this racy situation enters a "Merge"
5938 * mode which causes subsequent override requests to saturate the
5939 * override instead of replacing its value.
5940 *
5941 * This mode is left when knote_unlock() or knote_post()
5942 * observe that no other f_* routine is in flight.
5943 */
5944 kn->kn_status |= KN_MERGE_QOS;
5945 }
5946
5947 /*
5948 * Now apply the override if it changed.
5949 */
5950
5951 if (kn->kn_qos_override == qos_index) {
5952 return;
5953 }
5954
5955 kn->kn_qos_override = qos_index;
5956
5957 if (kn->kn_status & KN_SUPPRESSED) {
5958 /*
5959 * For suppressed events, the kn_qos_index field cannot be touched as it
5960 * lets us know which suppress queue the knote is on for a kqworkq.
5961 *
5962 * Also, there's no natural push applied on the kqueues when this field
5963 * changes anyway. We hence need to apply manual overrides in this case,
5964 * which will be cleared when the events are later acknowledged.
5965 */
5966 kqueue_update_override(kq, kn, qos_index);
5967 } else if (kn->kn_qos_index != qos_index) {
5968 knote_dequeue(kq, kn);
5969 kn->kn_qos_index = qos_index;
5970 }
5971 }
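/*
 * The FALLBACK-vs-floor resolution above boils down to a small pure rule;
 * a minimal sketch, using the hypothetical helper name resolve_event_qos():
 *
 *	static thread_qos_t
 *	resolve_event_qos(thread_qos_t event_qos, thread_qos_t req_qos,
 *	    bool fallback)
 *	{
 *		if (fallback) {
 *			// registration QoS is used only when the event has none
 *			return (event_qos != THREAD_QOS_UNSPECIFIED) ?
 *			    event_qos : req_qos;
 *		}
 *		// otherwise the registration QoS acts as a floor
 *		return MAX(event_qos, req_qos);
 *	}
 */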
5972
5973 /*
5974 * Called back from waitq code when no threads waiting and the hook was set.
5975 *
5976 * Preemption is disabled - minimal work can be done in this context!!!
5977 */
5978 void
5979 waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook)
5980 {
5981 kqueue_t kqu;
5982
5983 kqu.kq = __container_of(kq_hook, struct kqueue, kq_waitq_hook);
5984 assert(kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
5985
5986 kqlock(kqu);
5987
5988 if (kqu.kq->kq_count > 0) {
5989 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5990 kqworkloop_wakeup(kqu.kqwl, KQWL_BUCKET_STAYACTIVE);
5991 } else {
5992 kqworkq_wakeup(kqu.kqwq, KQWQ_QOS_MANAGER);
5993 }
5994 }
5995
5996 kqunlock(kqu);
5997 }
5998
5999 void
6000 klist_init(struct klist *list)
6001 {
6002 SLIST_INIT(list);
6003 }
6004
6005
6006 /*
6007 * Query/Post each knote in the object's list
6008 *
6009 * The object lock protects the list. It is assumed
6010 * that the filter/event routine for the object can
6011 * determine that the object is already locked (via
6012 * the hint) and not deadlock itself.
6013 *
6014 * The object lock should also hold off pending
6015 * detach/drop operations.
6016 */
6017 void
6018 knote(struct klist *list, long hint)
6019 {
6020 struct knote *kn;
6021
6022 SLIST_FOREACH(kn, list, kn_selnext) {
6023 knote_post(kn, hint);
6024 }
6025 }
6026
6027 /*
6028 * attach a knote to the specified list. Return true if this is the first entry.
6029 * The list is protected by whatever lock the object it is associated with uses.
6030 */
6031 int
6032 knote_attach(struct klist *list, struct knote *kn)
6033 {
6034 int ret = SLIST_EMPTY(list);
6035 SLIST_INSERT_HEAD(list, kn, kn_selnext);
6036 return ret;
6037 }
6038
6039 /*
6040 * detach a knote from the specified list. Return true if that was the last entry.
6041 * The list is protected by whatever lock the object it is associated with uses.
6042 */
6043 int
6044 knote_detach(struct klist *list, struct knote *kn)
6045 {
6046 SLIST_REMOVE(list, kn, knote, kn_selnext);
6047 return SLIST_EMPTY(list);
6048 }
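/*
 * A typical in-kernel event source strings these primitives together as
 * follows (minimal sketch; `struct mydev`, its list field and its lock are
 * hypothetical):
 *
 *	klist_init(&dev->dev_note);		// once, at device init
 *
 *	// filter f_attach, called with the device lock held:
 *	first = knote_attach(&dev->dev_note, kn);
 *
 *	// when the device becomes ready, still under the device lock:
 *	KNOTE(&dev->dev_note, hint);		// posts to every attached knote
 *
 *	// filter f_detach, again under the device lock:
 *	last = knote_detach(&dev->dev_note, kn);
 */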
6049
6050 /*
6051 * knote_vanish - Indicate that the source has vanished
6052 *
6053 * If the knote has requested EV_VANISHED delivery,
6054 * arrange for that. Otherwise, deliver a NOTE_REVOKE
6055 * event for backward compatibility.
6056 *
6057 * The knote is marked as having vanished, but is not
6058 * actually detached from the source in this instance.
6059 * The actual detach is deferred until the knote drop.
6060 *
6061 * Our caller already has the object lock held. Calling
6062 * the detach routine would try to take that lock
6063 * recursively - which likely is not supported.
6064 */
6065 void
6066 knote_vanish(struct klist *list, bool make_active)
6067 {
6068 struct knote *kn;
6069 struct knote *kn_next;
6070
6071 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6072 struct kqueue *kq = knote_get_kq(kn);
6073
6074 kqlock(kq);
6075 if (__probable(kn->kn_status & KN_REQVANISH)) {
6076 /*
6077 * If EV_VANISHED delivery was requested - prepare to deliver one
6078 */
6079 kn->kn_status |= KN_VANISHED;
6080 } else {
6081 /*
6082 * Handle the legacy way to indicate that the port/portset was
6083 * deallocated or left the current Mach portspace (modern technique
6084 * is with an EV_VANISHED protocol).
6085 *
6086 * Deliver an EV_EOF event for these changes (hopefully it will get
6087 * delivered before the port name recycles to the same generation
6088 * count and someone tries to re-register a kevent for it or the
6089 * events are udata-specific - avoiding a conflict).
6090 */
6091 kn->kn_flags |= EV_EOF | EV_ONESHOT;
6092 }
6093 if (make_active) {
6094 knote_activate(kq, kn, FILTER_ACTIVE);
6095 }
6096 kqunlock(kq);
6097 }
6098 }
6099
6100 /*
6101 * Force a lazy allocation of the waitqset link
6102 * of the kq_wqs associated with the kn
6103 * if it wasn't already allocated.
6104 *
6105 * This allows knote_link_waitq to never block
6106 * if reserved_link is not NULL.
6107 */
6108 void
6109 knote_link_waitqset_lazy_alloc(struct knote *kn)
6110 {
6111 struct kqueue *kq = knote_get_kq(kn);
6112 waitq_set_lazy_init_link(&kq->kq_wqs);
6113 }
6114
6115 /*
6116 * Check if a lazy allocation for the waitqset link
6117 * of the kq_wqs is needed.
6118 */
6119 boolean_t
6120 knote_link_waitqset_should_lazy_alloc(struct knote *kn)
6121 {
6122 struct kqueue *kq = knote_get_kq(kn);
6123 return waitq_set_should_lazy_init_link(&kq->kq_wqs);
6124 }
6125
6126 /*
6127 * For a given knote, link a provided wait queue directly with the kqueue.
6128 * Wakeups will happen via recursive wait queue support. But nothing will move
6129 * the knote to the active list at wakeup (nothing calls knote()). Instead,
6130 * we permanently enqueue them here.
6131 *
6132 * kqueue and knote references are held by caller.
6133 * waitq locked by caller.
6134 *
6135 * caller provides the wait queue link structure and ensures that the kq->kq_wqs
6136 * is linked by previously calling knote_link_waitqset_lazy_alloc.
6137 */
6138 int
6139 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
6140 {
6141 struct kqueue *kq = knote_get_kq(kn);
6142 kern_return_t kr;
6143
6144 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
6145 if (kr == KERN_SUCCESS) {
6146 knote_markstayactive(kn);
6147 return 0;
6148 } else {
6149 return EINVAL;
6150 }
6151 }
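/*
 * Callers split the work into two phases so that the link itself never
 * blocks while the waitq is locked. A hedged sketch of the expected
 * sequence; the reservation and locking helpers shown are assumptions about
 * the waitq API, not taken from this file:
 *
 *	knote_link_waitqset_lazy_alloc(kn);		// may block, no locks held
 *	reserved = waitq_link_reserve(wq);		// reserve the link memory
 *	waitq_lock(wq);
 *	error = knote_link_waitq(kn, wq, &reserved);	// never blocks here
 *	waitq_unlock(wq);
 */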
6152
6153 /*
6154 * Unlink the provided wait queue from the kqueue associated with a knote.
6155 * Also remove it from the magic list of directly attached knotes.
6156 *
6157 * Note that the unlink may have already happened from the other side, so
6158 * ignore any failures to unlink and just remove it from the kqueue list.
6159 *
6160 * On success, caller is responsible for the link structure
6161 */
6162 int
6163 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
6164 {
6165 struct kqueue *kq = knote_get_kq(kn);
6166 kern_return_t kr;
6167
6168 kr = waitq_unlink(wq, &kq->kq_wqs);
6169 knote_clearstayactive(kn);
6170 return (kr != KERN_SUCCESS) ? EINVAL : 0;
6171 }
6172
6173 /*
6174 * remove all knotes referencing a specified fd
6175 *
6176 * Entered with the proc_fd lock already held.
6177 * It returns the same way, but may drop it temporarily.
6178 */
6179 void
6180 knote_fdclose(struct proc *p, int fd)
6181 {
6182 struct klist *list;
6183 struct knote *kn;
6184 KNOTE_LOCK_CTX(knlc);
6185
6186 restart:
6187 list = &p->p_fd->fd_knlist[fd];
6188 SLIST_FOREACH(kn, list, kn_link) {
6189 struct kqueue *kq = knote_get_kq(kn);
6190
6191 kqlock(kq);
6192
6193 if (kq->kq_p != p) {
6194 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6195 __func__, kq->kq_p, p);
6196 }
6197
6198 /*
6199 * If the knote supports EV_VANISHED delivery,
6200 * transition it to vanished mode (or skip over
6201 * it if already vanished).
6202 */
6203 if (kn->kn_status & KN_VANISHED) {
6204 kqunlock(kq);
6205 continue;
6206 }
6207
6208 proc_fdunlock(p);
6209 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6210 /* the knote was dropped by someone, nothing to do */
6211 } else if (kn->kn_status & KN_REQVANISH) {
6212 kn->kn_status |= KN_VANISHED;
6213
6214 kqunlock(kq);
6215 knote_fops(kn)->f_detach(kn);
6216 if (kn->kn_is_fd) {
6217 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6218 }
6219 kn->kn_filtid = EVFILTID_DETACHED;
6220 kqlock(kq);
6221
6222 knote_activate(kq, kn, FILTER_ACTIVE);
6223 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6224 } else {
6225 knote_drop(kq, kn, &knlc);
6226 }
6227
6228 proc_fdlock(p);
6229 goto restart;
6230 }
6231 }
6232
6233 /*
6234 * knote_fdfind - lookup a knote in the fd table for process
6235 *
6236 * If the filter is file-based, lookup based on fd index.
6237 * Otherwise use a hash based on the ident.
6238 *
6239 * Matching is based on kq, filter, and ident. Optionally,
6240 * it may also be based on the udata field in the kevent -
6241 * allowing multiple event registrations for the file object
6242 * per kqueue.
6243 *
6244 * fd_knhashlock or fdlock held on entry (and exit)
6245 */
6246 static struct knote *
6247 knote_fdfind(struct kqueue *kq,
6248 const struct kevent_internal_s *kev,
6249 bool is_fd,
6250 struct proc *p)
6251 {
6252 struct filedesc *fdp = p->p_fd;
6253 struct klist *list = NULL;
6254 struct knote *kn = NULL;
6255
6256 /*
6257 * determine where to look for the knote
6258 */
6259 if (is_fd) {
6260 /* fd-based knotes are linked off the fd table */
6261 if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6262 list = &fdp->fd_knlist[kev->kei_ident];
6263 }
6264 } else if (fdp->fd_knhashmask != 0) {
6265 /* hash non-fd knotes here too */
6266 list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6267 }
6268
6269 /*
6270 * scan the selected list looking for a match
6271 */
6272 if (list != NULL) {
6273 SLIST_FOREACH(kn, list, kn_link) {
6274 if (kq == knote_get_kq(kn) &&
6275 kev->kei_ident == kn->kn_id &&
6276 kev->kei_filter == kn->kn_filter) {
6277 if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6278 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6279 kev->kei_udata == kn->kn_udata) {
6280 break; /* matching udata-specific knote */
6281 }
6282 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6283 break; /* matching non-udata-specific knote */
6284 }
6285 }
6286 }
6287 }
6288 return kn;
6289 }
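/*
 * From userspace, this udata-specific matching is what lets two otherwise
 * identical registrations coexist on one kqueue. Minimal sketch (the fd and
 * cookie values are hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent kev[2];
 *	EV_SET(&kev[0], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x1);
 *	EV_SET(&kev[1], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x2);
 *	kevent(kq, kev, 2, NULL, 0, NULL);
 *
 *	// deleting must pass the matching udata or the lookup won't find it
 *	EV_SET(&kev[0], fd, EVFILT_READ, EV_DELETE | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x1);
 *	kevent(kq, kev, 1, NULL, 0, NULL);
 */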
6290
6291 /*
6292 * kq_add_knote - Add knote to the fd table for process
6293 * while checking for duplicates.
6294 *
6295 * All file-based filters associate a list of knotes by file
6296 * descriptor index. All other filters hash the knote by ident.
6297 *
6298 * May have to grow the table of knote lists to cover the
6299 * file descriptor index presented.
6300 *
6301 * fd_knhashlock and fdlock unheld on entry (and exit).
6302 *
6303 * Takes a rwlock boost if inserting the knote is successful.
6304 */
6305 static int
6306 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6307 struct proc *p)
6308 {
6309 struct filedesc *fdp = p->p_fd;
6310 struct klist *list = NULL;
6311 int ret = 0;
6312 bool is_fd = kn->kn_is_fd;
6313 uint64_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE);
6314
6315 if (is_fd) {
6316 proc_fdlock(p);
6317 } else {
6318 knhash_lock(fdp);
6319 }
6320
6321 if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6322 /* found an existing knote: we can't add this one */
6323 ret = ERESTART;
6324 goto out_locked;
6325 }
6326
6327 /* knote was not found: add it now */
6328 if (!is_fd) {
6329 if (fdp->fd_knhashmask == 0) {
6330 u_long size = 0;
6331
6332 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6333 if (list == NULL) {
6334 ret = ENOMEM;
6335 goto out_locked;
6336 }
6337
6338 fdp->fd_knhash = list;
6339 fdp->fd_knhashmask = size;
6340 }
6341
6342 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6343 SLIST_INSERT_HEAD(list, kn, kn_link);
6344 ret = 0;
6345 goto out_locked;
6346 } else {
6347 /* knote is fd based */
6348
6349 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6350 u_int size = 0;
6351
6352 /* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6353 if (kn->kn_id >= (uint64_t) nofile
6354 || kn->kn_id >= (uint64_t)maxfilesperproc) {
6355 ret = EINVAL;
6356 goto out_locked;
6357 }
6358 /* have to grow the fd_knlist */
6359 size = fdp->fd_knlistsize;
6360 while (size <= kn->kn_id) {
6361 size += KQEXTENT;
6362 }
6363
6364 if (size >= (UINT_MAX / sizeof(struct klist *))) {
6365 ret = EINVAL;
6366 goto out_locked;
6367 }
6368
6369 MALLOC(list, struct klist *,
6370 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
6371 if (list == NULL) {
6372 ret = ENOMEM;
6373 goto out_locked;
6374 }
6375
6376 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
6377 fdp->fd_knlistsize * sizeof(struct klist *));
6378 bzero((caddr_t)list +
6379 fdp->fd_knlistsize * sizeof(struct klist *),
6380 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
6381 FREE(fdp->fd_knlist, M_KQUEUE);
6382 fdp->fd_knlist = list;
6383 fdp->fd_knlistsize = size;
6384 }
6385
6386 list = &fdp->fd_knlist[kn->kn_id];
6387 SLIST_INSERT_HEAD(list, kn, kn_link);
6388 ret = 0;
6389 goto out_locked;
6390 }
6391
6392 out_locked:
6393 if (ret == 0) {
6394 kqlock(kq);
6395 assert((kn->kn_status & KN_LOCKED) == 0);
6396 (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6397 kqueue_retain(kq); /* retain a kq ref */
6398 }
6399 if (is_fd) {
6400 proc_fdunlock(p);
6401 } else {
6402 knhash_unlock(fdp);
6403 }
6404
6405 return ret;
6406 }
6407
6408 /*
6409 * kq_remove_knote - remove a knote from the fd table for process
6410 *
6411 * If the filter is file-based, remove based on fd index.
6412 * Otherwise remove from the hash based on the ident.
6413 *
6414 * fd_knhashlock and fdlock unheld on entry (and exit).
6415 */
6416 static void
6417 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6418 struct knote_lock_ctx *knlc)
6419 {
6420 struct filedesc *fdp = p->p_fd;
6421 struct klist *list = NULL;
6422 uint16_t kq_state;
6423 bool is_fd = kn->kn_is_fd;
6424
6425 if (is_fd) {
6426 proc_fdlock(p);
6427 } else {
6428 knhash_lock(fdp);
6429 }
6430
6431 if (is_fd) {
6432 assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6433 list = &fdp->fd_knlist[kn->kn_id];
6434 } else {
6435 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6436 }
6437 SLIST_REMOVE(list, kn, knote, kn_link);
6438
6439 kqlock(kq);
6440 kq_state = kq->kq_state;
6441 if (knlc) {
6442 knote_unlock_cancel(kq, kn, knlc);
6443 } else {
6444 kqunlock(kq);
6445 }
6446 if (is_fd) {
6447 proc_fdunlock(p);
6448 } else {
6449 knhash_unlock(fdp);
6450 }
6451
6452 if (kq_state & KQ_DYNAMIC) {
6453 kqworkloop_release((struct kqworkloop *)kq);
6454 }
6455 }
6456
6457 /*
6458 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6459 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6460 *
6461 * fd_knhashlock or fdlock unheld on entry (and exit)
6462 */
6463
6464 static struct knote *
6465 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6466 bool is_fd, struct proc *p)
6467 {
6468 struct filedesc *fdp = p->p_fd;
6469 struct knote *kn;
6470
6471 if (is_fd) {
6472 proc_fdlock(p);
6473 } else {
6474 knhash_lock(fdp);
6475 }
6476
6477 /*
6478 * Temporary horrible hack:
6479 * this cast is gross and will go away in a future change.
6480 * It is OK to do because we don't look at xflags/s_fflags,
6481 * and when we cast down the kev this way,
6482 * the truncated filter field still works.
6483 */
6484 kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
6485
6486 if (kn) {
6487 kqlock(kq);
6488 assert(knote_get_kq(kn) == kq);
6489 }
6490
6491 if (is_fd) {
6492 proc_fdunlock(p);
6493 } else {
6494 knhash_unlock(fdp);
6495 }
6496
6497 return kn;
6498 }
6499
6500 __attribute__((noinline))
6501 static void
6502 kqfile_wakeup(struct kqfile *kqf, __unused kq_index_t qos)
6503 {
6504 /* flag wakeups during processing */
6505 if (kqf->kqf_state & KQ_PROCESSING) {
6506 kqf->kqf_state |= KQ_WAKEUP;
6507 }
6508
6509 /* wakeup a thread waiting on this queue */
6510 if (kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) {
6511 kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
6512 waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, KQ_EVENT,
6513 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
6514 }
6515
6516 /* wakeup other kqueues/select sets we're inside */
6517 KNOTE(&kqf->kqf_sel.si_note, 0);
6518 }
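/*
 * The KNOTE() on kqf_sel.si_note above is what allows a kqueue file
 * descriptor to be monitored by select()/poll() or nested inside another
 * kqueue. Minimal userspace sketch (parent_kq and inner_kq are hypothetical):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, inner_kq, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(parent_kq, &kev, 1, NULL, 0, NULL);
 *	// parent_kq now reports inner_kq readable when it has pending events
 */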
6519
6520 static struct kqtailq *
6521 knote_get_tailq(kqueue_t kqu, struct knote *kn)
6522 {
6523 kq_index_t qos_index = kn->kn_qos_index;
6524
6525 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6526 assert(qos_index < KQWL_NBUCKETS);
6527 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6528 assert(qos_index < KQWQ_NBUCKETS);
6529 } else {
6530 assert(qos_index == QOS_INDEX_KQFILE);
6531 }
6532 static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
6533 "struct kqueue::kq_queue must be exactly at the end");
6534 return &kqu.kq->kq_queue[qos_index];
6535 }
6536
6537 static void
6538 knote_enqueue(kqueue_t kqu, struct knote *kn, kn_status_t wakeup_mask)
6539 {
6540 kqlock_held(kqu);
6541
6542 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
6543 return;
6544 }
6545
6546 if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)) {
6547 return;
6548 }
6549
6550 if ((kn->kn_status & KN_QUEUED) == 0) {
6551 struct kqtailq *queue = knote_get_tailq(kqu, kn);
6552
6553 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
6554 kn->kn_status |= KN_QUEUED;
6555 kqu.kq->kq_count++;
6556 } else if ((kn->kn_status & KN_STAYACTIVE) == 0) {
6557 return;
6558 }
6559
6560 if (kn->kn_status & wakeup_mask) {
6561 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6562 kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
6563 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6564 kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
6565 } else {
6566 kqfile_wakeup(kqu.kqf, kn->kn_qos_index);
6567 }
6568 }
6569 }
6570
6571 __attribute__((always_inline))
6572 static inline void
6573 knote_dequeue(kqueue_t kqu, struct knote *kn)
6574 {
6575 if (kn->kn_status & KN_QUEUED) {
6576 struct kqtailq *queue = knote_get_tailq(kqu, kn);
6577
6578 // attaching the knote calls knote_reset_priority() without the
6579 // kqlock held, which is fine, so we can only assert kqlock_held()
6580 // when the knote is queued.
6581 kqlock_held(kqu);
6582
6583 TAILQ_REMOVE(queue, kn, kn_tqe);
6584 kn->kn_status &= ~KN_QUEUED;
6585 kqu.kq->kq_count--;
6586 }
6587 }
6588
6589 /* called with kqueue lock held */
6590 static void
6591 knote_suppress(kqueue_t kqu, struct knote *kn)
6592 {
6593 struct kqtailq *suppressq;
6594
6595 kqlock_held(kqu);
6596
6597 assert((kn->kn_status & KN_SUPPRESSED) == 0);
6598 assert(kn->kn_status & KN_QUEUED);
6599
6600 knote_dequeue(kqu, kn);
6601 /* deactivate - so new activations indicate a wakeup */
6602 kn->kn_status &= ~KN_ACTIVE;
6603 kn->kn_status |= KN_SUPPRESSED;
6604 suppressq = kqueue_get_suppressed_queue(kqu, kn);
6605 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
6606 }
6607
6608 __attribute__((always_inline))
6609 static inline void
6610 knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
6611 {
6612 struct kqtailq *suppressq;
6613
6614 kqlock_held(kqu);
6615
6616 assert(kn->kn_status & KN_SUPPRESSED);
6617
6618 kn->kn_status &= ~KN_SUPPRESSED;
6619 suppressq = kqueue_get_suppressed_queue(kqu, kn);
6620 TAILQ_REMOVE(suppressq, kn, kn_tqe);
6621
6622 /*
6623 * If the knote is no longer active, reset its push,
6624 * and resynchronize kn_qos_index with kn_qos_override
6625 * for knotes with a real qos.
6626 */
6627 if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
6628 kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
6629 }
6630 kn->kn_qos_index = kn->kn_qos_override;
6631 }
6632
6633 /* called with kqueue lock held */
6634 static void
6635 knote_unsuppress(kqueue_t kqu, struct knote *kn)
6636 {
6637 if (kn->kn_status & KN_SUPPRESSED) {
6638 knote_unsuppress_noqueue(kqu, kn);
6639
6640 /* don't wakeup if unsuppressing just a stay-active knote */
6641 knote_enqueue(kqu, kn, KN_ACTIVE);
6642 }
6643 }
6644
6645 __attribute__((always_inline))
6646 static inline void
6647 knote_mark_active(struct knote *kn)
6648 {
6649 if ((kn->kn_status & KN_ACTIVE) == 0) {
6650 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
6651 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
6652 kn->kn_filtid);
6653 }
6654
6655 kn->kn_status |= KN_ACTIVE;
6656 }
6657
6658 /* called with kqueue lock held */
6659 static void
6660 knote_activate(kqueue_t kqu, struct knote *kn, int result)
6661 {
6662 assert(result & FILTER_ACTIVE);
6663 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
6664 // may dequeue the knote
6665 knote_adjust_qos(kqu.kq, kn, result);
6666 }
6667 knote_mark_active(kn);
6668 knote_enqueue(kqu, kn, KN_ACTIVE | KN_STAYACTIVE);
6669 }
6670
6671 /*
6672 * This function applies changes requested by f_attach or f_touch for
6673 * a given filter. It proceeds in a carefully chosen order to help
6674 * every single transition do the minimal amount of work possible.
6675 */
6676 static void
6677 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
6678 int result)
6679 {
6680 kn_status_t wakeup_mask = KN_ACTIVE;
6681
6682 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
6683 /*
6684 * When a stayactive knote is reenabled, we may have missed wakeups
6685 * while it was disabled, so we need to poll it. To do so, ask
6686 * knote_enqueue() below to reenqueue it.
6687 */
6688 wakeup_mask |= KN_STAYACTIVE;
6689 kn->kn_status &= ~KN_DISABLED;
6690
6691 /*
6692 * it is possible for userland to have knotes registered for a given
6693 * workloop `wl_orig` but really handled on another workloop `wl_new`.
6694 *
6695 * In that case, rearming will happen from the servicer thread of
6696 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
6697 * this knote to stay suppressed forever if we only relied on
6698 * kqworkloop_acknowledge_events to be called by `wl_orig`.
6699 *
6700 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
6701 * unsuppress because that would mess with the processing phase of
6702 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
6703 * will be called.
6704 */
6705 if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
6706 if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
6707 knote_unsuppress_noqueue(kqu, kn);
6708 }
6709 }
6710 }
6711
6712 if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
6713 // may dequeue the knote
6714 knote_reset_priority(kqu, kn, kev->qos);
6715 }
6716
6717 /*
6718 * When we unsuppress above, or because of knote_reset_priority(),
6719 * the knote may have been dequeued; we need to restore the invariant
6720 * that an active knote is queued, now that we're done applying
6721 * changes.
6722 */
6723 if (result & FILTER_ACTIVE) {
6724 knote_activate(kqu, kn, result);
6725 } else {
6726 knote_enqueue(kqu, kn, wakeup_mask);
6727 }
6728
6729 if ((result & FILTER_THREADREQ_NODEFEER) &&
6730 act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
6731 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
6732 }
6733 }
6734
6735 /*
6736 * knote_drop - disconnect and drop the knote
6737 *
6738 * Called with the kqueue locked, returns with the kqueue unlocked.
6739 *
6740 * If a knote locking context is passed, it is canceled.
6741 *
6742 * The knote may have already been detached from
6743 * (or not yet attached to) its source object.
6744 */
6745 static void
6746 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
6747 {
6748 struct proc *p = kq->kq_p;
6749
6750 kqlock_held(kq);
6751
6752 assert((kn->kn_status & KN_DROPPING) == 0);
6753 if (knlc == NULL) {
6754 assert((kn->kn_status & KN_LOCKED) == 0);
6755 }
6756 kn->kn_status |= KN_DROPPING;
6757
6758 if (kn->kn_status & KN_SUPPRESSED) {
6759 knote_unsuppress_noqueue(kq, kn);
6760 } else {
6761 knote_dequeue(kq, kn);
6762 }
6763 knote_wait_for_post(kq, kn);
6764
6765 knote_fops(kn)->f_detach(kn);
6766
6767 /* kq may be freed when kq_remove_knote() returns */
6768 kq_remove_knote(kq, kn, p, knlc);
6769 if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
6770 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6771 }
6772
6773 knote_free(kn);
6774 }
6775
6776 void
6777 knote_init(void)
6778 {
6779 #if CONFIG_MEMORYSTATUS
6780 /* Initialize the memorystatus list lock */
6781 memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
6782 #endif
6783 }
6784 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6785
6786 const struct filterops *
6787 knote_fops(struct knote *kn)
6788 {
6789 return sysfilt_ops[kn->kn_filtid];
6790 }
6791
6792 static struct knote *
6793 knote_alloc(void)
6794 {
6795 return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO);
6796 }
6797
6798 static void
6799 knote_free(struct knote *kn)
6800 {
6801 assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
6802 zfree(knote_zone, kn);
6803 }
6804
6805 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6806
6807 kevent_ctx_t
6808 kevent_get_context(thread_t thread)
6809 {
6810 uthread_t ut = get_bsdthread_info(thread);
6811 return &ut->uu_save.uus_kevent;
6812 }
6813
6814 static inline bool
6815 kevent_args_requesting_events(unsigned int flags, int nevents)
6816 {
6817 return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
6818 }
6819
6820 static inline int
6821 kevent_adjust_flags_for_proc(proc_t p, int flags)
6822 {
6823 __builtin_assume(p);
6824 return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
6825 }
6826
6827 /*!
6828 * @function kevent_get_kqfile
6829 *
6830 * @brief
6831 * Lookup a kqfile by fd.
6832 *
6833 * @discussion
6834 * Callers: kevent, kevent64, kevent_qos
6835 *
6836 * This is not assumed to be a fastpath (kqfile interfaces are legacy)
6837 */
6838 OS_NOINLINE
6839 static int
6840 kevent_get_kqfile(struct proc *p, int fd, int flags,
6841 struct fileproc **fpp, struct kqueue **kqp)
6842 {
6843 int error = 0;
6844 struct kqueue *kq;
6845
6846 error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
6847 if (__improbable(error)) {
6848 return error;
6849 }
6850 kq = (struct kqueue *)(*fpp)->f_data;
6851
6852 uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
6853 if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
6854 kqlock(kq);
6855 kq_state = kq->kq_state;
6856 if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
6857 if (flags & KEVENT_FLAG_LEGACY32) {
6858 kq_state |= KQ_KEV32;
6859 } else if (flags & KEVENT_FLAG_LEGACY64) {
6860 kq_state |= KQ_KEV64;
6861 } else {
6862 kq_state |= KQ_KEV_QOS;
6863 }
6864 kq->kq_state = kq_state;
6865 }
6866 kqunlock(kq);
6867 }
6868
6869 /*
6870 * kqfiles can't be used through the legacy kevent()
6871 * and other interfaces at the same time.
6872 */
6873 if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
6874 (bool)(kq_state & KQ_KEV32))) {
6875 fp_drop(p, fd, *fpp, 0);
6876 return EINVAL;
6877 }
6878
6879 *kqp = kq;
6880 return 0;
6881 }
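/*
 * The legacy flow served by this lookup is the plain BSD one; minimal
 * userspace sketch (fd is hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &kev, 1, &out, 1, &ts);	// register, then wait up to 1s
 */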
6882
6883 /*!
6884 * @function kevent_get_kqwq
6885 *
6886 * @brief
6887 * Lookup or create the process kqwq (fastpath).
6888 *
6889 * @discussion
6890 * Callers: kevent64, kevent_qos
6891 */
6892 OS_ALWAYS_INLINE
6893 static int
6894 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
6895 {
6896 struct kqworkq *kqwq = p->p_fd->fd_wqkqueue;
6897
6898 if (__improbable(kevent_args_requesting_events(flags, nevents))) {
6899 return EINVAL;
6900 }
6901 if (__improbable(kqwq == NULL)) {
6902 kqwq = kqworkq_alloc(p, flags);
6903 if (__improbable(kqwq == NULL)) {
6904 return ENOMEM;
6905 }
6906 }
6907
6908 *kqp = &kqwq->kqwq_kqueue;
6909 return 0;
6910 }
6911
6912 #pragma mark kevent copyio
6913
6914 /*!
6915 * @function kevent_get_data_size
6916 *
6917 * @brief
6918 * Copies in the extra data size from user-space.
6919 */
6920 static int
6921 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
6922 kevent_ctx_t kectx)
6923 {
6924 if (!data_avail || !data_out) {
6925 kectx->kec_data_size = 0;
6926 kectx->kec_data_resid = 0;
6927 } else if (flags & KEVENT_FLAG_PROC64) {
6928 user64_size_t usize = 0;
6929 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6930 if (__improbable(error)) {
6931 return error;
6932 }
6933 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6934 } else {
6935 user32_size_t usize = 0;
6936 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6937 if (__improbable(error)) {
6938 return error;
6939 }
6940 kectx->kec_data_avail = data_avail;
6941 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6942 }
6943 kectx->kec_data_out = data_out;
6944 kectx->kec_data_avail = data_avail;
6945 return 0;
6946 }
6947
6948 /*!
6949 * @function kevent_put_data_size
6950 *
6951 * @brief
6952 * Copies out the residual data size to user-space if any has been used.
6953 */
6954 static int
6955 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
6956 {
6957 if (kectx->kec_data_resid == kectx->kec_data_size) {
6958 return 0;
6959 }
6960 if (flags & KEVENT_FLAG_KERNEL) {
6961 *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
6962 return 0;
6963 }
6964 if (flags & KEVENT_FLAG_PROC64) {
6965 user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
6966 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6967 } else {
6968 user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
6969 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6970 }
6971 }
6972
6973 /*!
6974 * @function kevent_legacy_copyin
6975 *
6976 * @brief
6977 * Handles the copyin of a kevent/kevent64 event.
6978 */
6979 static int
6980 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
6981 {
6982 int error;
6983
6984 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
6985
6986 if (flags & KEVENT_FLAG_LEGACY64) {
6987 struct kevent64_s kev64;
6988
6989 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
6990 if (__improbable(error)) {
6991 return error;
6992 }
6993 *addrp += sizeof(kev64);
6994 *kevp = (struct kevent_qos_s){
6995 .ident = kev64.ident,
6996 .filter = kev64.filter,
6997 /* Make sure user doesn't pass in any system flags */
6998 .flags = kev64.flags & ~EV_SYSFLAGS,
6999 .udata = kev64.udata,
7000 .fflags = kev64.fflags,
7001 .data = kev64.data,
7002 .ext[0] = kev64.ext[0],
7003 .ext[1] = kev64.ext[1],
7004 };
7005 } else if (flags & KEVENT_FLAG_PROC64) {
7006 struct user64_kevent kev64;
7007
7008 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7009 if (__improbable(error)) {
7010 return error;
7011 }
7012 *addrp += sizeof(kev64);
7013 *kevp = (struct kevent_qos_s){
7014 .ident = kev64.ident,
7015 .filter = kev64.filter,
7016 /* Make sure user doesn't pass in any system flags */
7017 .flags = kev64.flags & ~EV_SYSFLAGS,
7018 .udata = kev64.udata,
7019 .fflags = kev64.fflags,
7020 .data = kev64.data,
7021 };
7022 } else {
7023 struct user32_kevent kev32;
7024
7025 error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
7026 if (__improbable(error)) {
7027 return error;
7028 }
7029 *addrp += sizeof(kev32);
7030 *kevp = (struct kevent_qos_s){
7031 .ident = (uintptr_t)kev32.ident,
7032 .filter = kev32.filter,
7033 /* Make sure user doesn't pass in any system flags */
7034 .flags = kev32.flags & ~EV_SYSFLAGS,
7035 .udata = CAST_USER_ADDR_T(kev32.udata),
7036 .fflags = kev32.fflags,
7037 .data = (intptr_t)kev32.data,
7038 };
7039 }
7040
7041 return 0;
7042 }
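/*
 * The KEVENT_FLAG_LEGACY64 branch above corresponds to the kevent64() entry
 * point, whose events carry the two ext[] words. Minimal userspace sketch
 * (fd is hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent64_s kev, out;
 *	EV_SET64(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
 *	int n = kevent64(kq, &kev, 1, &out, 1, 0, NULL);
 */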
7043
7044 /*!
7045 * @function kevent_modern_copyin
7046 *
7047 * @brief
7048 * Handles the copyin of a kevent_qos/kevent_id event.
7049 */
7050 static int
7051 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7052 {
7053 int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7054 if (__probable(!error)) {
7055 /* Make sure user doesn't pass in any system flags */
7056 *addrp += sizeof(struct kevent_qos_s);
7057 kevp->flags &= ~EV_SYSFLAGS;
7058 }
7059 return error;
7060 }
7061
7062 /*!
7063 * @function kevent_legacy_copyout
7064 *
7065 * @brief
7066 * Handles the copyout of a kevent/kevent64 event.
7067 */
7068 static int
7069 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7070 {
7071 int advance;
7072 int error;
7073
7074 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7075
7076 /*
7077 * fully initialize the different output event structure
7078 * types from the internal kevent (and some universal
7079 * defaults for fields not represented in the internal
7080 * form).
7081 *
7082 * Note: these structures have no padding hence the C99
7083 * initializers below do not leak kernel info.
7084 */
7085 if (flags & KEVENT_FLAG_LEGACY64) {
7086 struct kevent64_s kev64 = {
7087 .ident = kevp->ident,
7088 .filter = kevp->filter,
7089 .flags = kevp->flags,
7090 .fflags = kevp->fflags,
7091 .data = (int64_t)kevp->data,
7092 .udata = kevp->udata,
7093 .ext[0] = kevp->ext[0],
7094 .ext[1] = kevp->ext[1],
7095 };
7096 advance = sizeof(struct kevent64_s);
7097 error = copyout((caddr_t)&kev64, *addrp, advance);
7098 } else if (flags & KEVENT_FLAG_PROC64) {
7099 /*
7100 * deal with the special case of a user-supplied
7101 * value of (uintptr_t)-1.
7102 */
7103 uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7104 (uint64_t)-1LL : (uint64_t)kevp->ident;
7105 struct user64_kevent kev64 = {
7106 .ident = ident,
7107 .filter = kevp->filter,
7108 .flags = kevp->flags,
7109 .fflags = kevp->fflags,
7110 .data = (int64_t) kevp->data,
7111 .udata = (user_addr_t) kevp->udata,
7112 };
7113 advance = sizeof(kev64);
7114 error = copyout((caddr_t)&kev64, *addrp, advance);
7115 } else {
7116 struct user32_kevent kev32 = {
7117 .ident = (uint32_t)kevp->ident,
7118 .filter = kevp->filter,
7119 .flags = kevp->flags,
7120 .fflags = kevp->fflags,
7121 .data = (int32_t)kevp->data,
7122 .udata = (uint32_t)kevp->udata,
7123 };
7124 advance = sizeof(kev32);
7125 error = copyout((caddr_t)&kev32, *addrp, advance);
7126 }
7127 if (__probable(!error)) {
7128 *addrp += advance;
7129 }
7130 return error;
7131 }
7132
7133 /*!
7134 * @function kevent_modern_copyout
7135 *
7136 * @brief
7137 * Handles the copyout of a kevent_qos/kevent_id event.
7138 */
7139 OS_ALWAYS_INLINE
7140 static inline int
7141 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7142 {
7143 int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7144 if (__probable(!error)) {
7145 *addrp += sizeof(struct kevent_qos_s);
7146 }
7147 return error;
7148 }
7149
7150 #pragma mark kevent core implementation
7151
7152 /*!
7153 * @function kevent_callback_inline
7154 *
7155 * @brief
7156 * Callback for each individual event
7157 *
7158 * @discussion
7159 * This is meant to be inlined in kevent_modern_callback and
7160 * kevent_legacy_callback.
7161 */
7162 OS_ALWAYS_INLINE
7163 static inline int
7164 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7165 {
7166 int error;
7167
7168 assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7169
7170 /*
7171 * Copy out the appropriate amount of event data for this user.
7172 */
7173 if (legacy) {
7174 error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7175 kectx->kec_process_flags);
7176 } else {
7177 error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7178 }
7179
7180 /*
7181 * If there isn't space for additional events, return
7182 * a harmless error to stop the processing here
7183 */
7184 if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7185 error = EWOULDBLOCK;
7186 }
7187 return error;
7188 }
7189
7190 /*!
7191 * @function kevent_modern_callback
7192 *
7193 * @brief
7194 * Callback for each individual modern event.
7195 *
7196 * @discussion
7197 * This callback handles kevent_qos/kevent_id events.
7198 */
7199 static int
7200 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7201 {
7202 return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7203 }
7204
7205 /*!
7206 * @function kevent_legacy_callback
7207 *
7208 * @brief
7209 * Callback for each individual legacy event.
7210 *
7211 * @discussion
7212 * This callback handles kevent/kevent64 events.
7213 */
7214 static int
7215 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7216 {
7217 return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7218 }
7219
7220 /*!
7221 * @function kevent_cleanup
7222 *
7223 * @brief
7224 * Handles the cleanup returning from a kevent call.
7225 *
7226 * @discussion
7227 * kevent entry points will take a reference on workloops,
7228 * and a usecount on the fileglob of kqfiles.
7229 *
7230 * This function undoes this on the exit paths of kevents.
7231 *
7232 * @returns
7233 * The error to return to userspace.
7234 */
7235 static int
7236 kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7237 {
7238 // poll should not call any codepath leading to this
7239 assert((flags & KEVENT_FLAG_POLL) == 0);
7240
7241 if (flags & KEVENT_FLAG_WORKLOOP) {
7242 kqworkloop_release(kqu.kqwl);
7243 } else if (flags & KEVENT_FLAG_WORKQ) {
7244 /* nothing held */
7245 } else {
7246 fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
7247 }
7248
7249 /* don't restart after signals... */
7250 if (error == ERESTART) {
7251 error = EINTR;
7252 } else if (error == 0) {
7253 /* don't abandon other output just because of residual copyout failures */
7254 (void)kevent_put_data_size(flags, kectx);
7255 }
7256
7257 if (flags & KEVENT_FLAG_PARKING) {
7258 thread_t th = current_thread();
7259 struct uthread *uth = get_bsdthread_info(th);
7260 if (uth->uu_kqr_bound) {
7261 thread_unfreeze_base_pri(th);
7262 }
7263 }
7264 return error;
7265 }
7266
7267 /*!
7268 * @function kqueue_process
7269 *
7270 * @brief
7271 * Process the triggered events in a kqueue.
7272 *
7273 * @discussion
7274 * Walk the queued knotes and validate that they are really still triggered
7275 * events by calling the filter routines (if necessary).
7276 *
7277 * For each event that is still considered triggered, invoke the callback
7278 * routine provided.
7279 *
7280 * caller holds a reference on the kqueue.
7281 * kqueue locked on entry and exit - but may be dropped
7282 * kqueue list locked (held for duration of call)
7283 *
7284 * This is only called by kqueue_scan() so that the compiler can inline it.
7285 *
7286 * @returns
7287 * - 0: no event was returned, no other error occurred
7288 * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
7289 * - EWOULDBLOCK: (not an error) events have been found and we should return
7290 * - EFAULT: copyout failed
7291 * - filter specific errors
7292 */
7293 static int
7294 kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7295 kevent_callback_t callback)
7296 {
7297 workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7298 struct knote *kn;
7299 int error = 0, rc = 0;
7300 struct kqtailq *base_queue, *queue;
7301 #if DEBUG || DEVELOPMENT
7302 int retries = 64;
7303 #endif
7304 uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
7305
7306 if (kq_type & KQ_WORKQ) {
7307 rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
7308 } else if (kq_type & KQ_WORKLOOP) {
7309 rc = kqworkloop_begin_processing(kqu.kqwl, flags);
7310 } else {
7311 kqfile_retry:
7312 rc = kqfile_begin_processing(kqu.kqf);
7313 if (rc == EBADF) {
7314 return EBADF;
7315 }
7316 }
7317
7318 if (rc == -1) {
7319 /* Nothing to process */
7320 return 0;
7321 }
7322
7323 /*
7324 * loop through the enqueued knotes associated with this request,
7325 * processing each one. Each request may have several queues
7326 * of knotes to process (depending on the type of kqueue) so we
7327 * have to loop through all the queues as long as we have additional
7328 * space.
7329 */
7330
7331 process_again:
7332 if (kq_type & KQ_WORKQ) {
7333 base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index];
7334 } else if (kq_type & KQ_WORKLOOP) {
7335 base_queue = &kqu.kqwl->kqwl_queue[0];
7336 queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
7337 } else {
7338 base_queue = queue = &kqu.kqf->kqf_queue;
7339 }
7340
7341 do {
7342 while ((kn = TAILQ_FIRST(queue)) != NULL) {
7343 error = knote_process(kn, kectx, callback);
7344 if (error == EJUSTRETURN) {
7345 error = 0;
7346 } else if (__improbable(error)) {
7347 /* error is EWOULDBLOCK when the out event array is full */
7348 goto stop_processing;
7349 }
7350 }
7351 } while (queue-- > base_queue);
7352
7353 if (kectx->kec_process_noutputs) {
7354 /* callers will transform this into no error */
7355 error = EWOULDBLOCK;
7356 }
7357
7358 stop_processing:
7359 /*
7360 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
7361 * we want to unbind the kqrequest from the thread.
7362 *
7363 * However, because the kq locks are dropped several times during processing,
7364 * new knotes may have fired again, in which case we want to fail the end
7365 * processing and process again, until it converges.
7366 *
7367 * If we have an error or returned events, end processing never fails.
7368 */
7369 if (error) {
7370 flags &= ~KEVENT_FLAG_PARKING;
7371 }
7372 if (kq_type & KQ_WORKQ) {
7373 rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
7374 } else if (kq_type & KQ_WORKLOOP) {
7375 rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
7376 } else {
7377 rc = kqfile_end_processing(kqu.kqf);
7378 }
7379
7380 if (__probable(error)) {
7381 return error;
7382 }
7383
7384 if (__probable(rc >= 0)) {
7385 assert(rc == 0 || rc == EBADF);
7386 return rc;
7387 }
7388
7389 #if DEBUG || DEVELOPMENT
7390 if (retries-- == 0) {
7391 panic("kevent: way too many knote_process retries, kq: %p (0x%04x)",
7392 kqu.kq, kqu.kq->kq_state);
7393 }
7394 #endif
7395 if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
7396 assert(flags & KEVENT_FLAG_PARKING);
7397 goto process_again;
7398 } else {
7399 goto kqfile_retry;
7400 }
7401 }
7402
7403 /*!
7404 * @function kqueue_scan_continue
7405 *
7406 * @brief
7407 * The continuation used by kqueue_scan for kevent entry points.
7408 *
7409 * @discussion
7410 * Assumes we inherit a use/ref count on the kq or its fileglob.
7411 *
7412 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7413 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7414 */
7415 OS_NORETURN OS_NOINLINE
7416 static void
7417 kqueue_scan_continue(void *data, wait_result_t wait_result)
7418 {
7419 uthread_t ut = current_uthread();
7420 kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7421 int error = 0, flags = kectx->kec_process_flags;
7422 struct kqueue *kq = data;
7423
7424 /*
7425 * only kevent variants call in here, so we know the callback is
7426 * kevent_legacy_callback or kevent_modern_callback.
7427 */
7428 assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
7429
7430 switch (wait_result) {
7431 case THREAD_AWAKENED:
7432 if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
7433 error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7434 } else {
7435 error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7436 }
7437 break;
7438 case THREAD_TIMED_OUT:
7439 error = 0;
7440 break;
7441 case THREAD_INTERRUPTED:
7442 error = EINTR;
7443 break;
7444 case THREAD_RESTART:
7445 error = EBADF;
7446 break;
7447 default:
7448 panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7449 }
7450
7451
7452 error = kevent_cleanup(kq, flags, error, kectx);
7453 *(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
7454 unix_syscall_return(error);
7455 }
7456
7457 /*!
7458 * @function kqueue_scan
7459 *
7460 * @brief
7461 * Scan and wait for events in a kqueue (used by poll & kevent).
7462 *
7463 * @discussion
7464 * Process the triggered events in a kqueue.
7465 *
7466 * If there are no events triggered arrange to wait for them:
7467 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7468 * - possibly until kectx->kec_deadline expires
7469 *
7470 * When it has to wait and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7471 * is set, it will wait in the kqueue_scan_continue continuation.
7472 *
7473 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7474 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
7475 *
7476 * @param kq
7477 * The kqueue being scanned.
7478 *
7479 * @param flags
7480 * The KEVENT_FLAG_* flags for this call.
7481 *
7482 * @param kectx
7483 * The context used for this scan.
7484 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
7485 *
7486 * @param callback
7487 * The callback to be called on events successfully processed.
7488 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
7489 */
7490 int
7491 kqueue_scan(struct kqueue *kq, int flags, kevent_ctx_t kectx,
7492 kevent_callback_t callback)
7493 {
7494 int error;
7495
7496 for (;;) {
7497 kqlock(kq);
7498 error = kqueue_process(kq, flags, kectx, callback);
7499
7500 /*
7501 * If we got an error, events returned (EWOULDBLOCK)
7502 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
7503 * just return.
7504 */
7505 if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
7506 kqunlock(kq);
7507 return error == EWOULDBLOCK ? 0 : error;
7508 }
7509
7510 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
7511 KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
7512 kectx->kec_deadline, TIMEOUT_NO_LEEWAY);
7513 kq->kq_state |= KQ_SLEEP;
7514
7515 kqunlock(kq);
7516
7517 if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
7518 thread_block_parameter(kqueue_scan_continue, kq);
7519 __builtin_unreachable();
7520 }
7521
7522 wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
7523 switch (wr) {
7524 case THREAD_AWAKENED:
7525 break;
7526 case THREAD_TIMED_OUT:
7527 return 0;
7528 case THREAD_INTERRUPTED:
7529 return EINTR;
7530 case THREAD_RESTART:
7531 return EBADF;
7532 default:
7533 panic("%s: - bad wait_result (%d)", __func__, wr);
7534 }
7535 }
7536 }
7537
7538 /*!
7539 * @function kevent_internal
7540 *
7541 * @brief
7542 * Common kevent code.
7543 *
7544 * @discussion
7545 * Needs to be inlined to specialize for legacy or modern and
7546 * eliminate dead code.
7547 *
7548 * This is the core logic of kevent entry points, that will:
7549 * - register kevents
7550 * - optionally scan the kqueue for events
7551 *
7552 * The caller is giving kevent_internal a reference on the kqueue
7553 * or its fileproc that needs to be cleaned up by kevent_cleanup().
7554 */
7555 OS_ALWAYS_INLINE
7556 static inline int
7557 kevent_internal(kqueue_t kqu,
7558 user_addr_t changelist, int nchanges,
7559 user_addr_t ueventlist, int nevents,
7560 int flags, kevent_ctx_t kectx, int32_t *retval,
7561 bool legacy)
7562 {
7563 int error = 0, noutputs = 0, register_rc;
7564
7565 /* only bound threads can receive events on workloops */
7566 if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
7567 #if CONFIG_WORKLOOP_DEBUG
7568 UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
7569 .uu_kqid = kqu.kqwl->kqwl_dynamicid,
7570 .uu_kq = error ? NULL : kqu.kq,
7571 .uu_error = error,
7572 .uu_nchanges = nchanges,
7573 .uu_nevents = nevents,
7574 .uu_flags = flags,
7575 });
7576 #endif // CONFIG_WORKLOOP_DEBUG
7577
7578 if (flags & KEVENT_FLAG_KERNEL) {
7579 /* see kevent_workq_internal */
7580 error = copyout(&kqu.kqwl->kqwl_dynamicid,
7581 ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
7582 kectx->kec_data_resid -= sizeof(kqueue_id_t);
7583 if (__improbable(error)) {
7584 goto out;
7585 }
7586 }
7587
7588 if (kevent_args_requesting_events(flags, nevents)) {
7589 /*
7590 * Disable the R2K notification while doing a register: if the
7591 * caller wants events too, we don't want the AST to be set if we
7592 * will process these events soon.
7593 */
7594 kqlock(kqu);
7595 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
7596 kqunlock(kqu);
7597 flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
7598 }
7599 }
7600
7601 /* register all the change requests the user provided... */
7602 while (nchanges > 0 && error == 0) {
7603 struct kevent_qos_s kev;
7604 struct knote *kn = NULL;
7605
7606 if (legacy) {
7607 error = kevent_legacy_copyin(&changelist, &kev, flags);
7608 } else {
7609 error = kevent_modern_copyin(&changelist, &kev);
7610 }
7611 if (error) {
7612 break;
7613 }
7614
7615 register_rc = kevent_register(kqu.kq, &kev, &kn);
7616 if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
7617 thread_t thread = current_thread();
7618
7619 kqlock_held(kqu);
7620
7621 if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
7622 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
7623 }
7624
7625 // f_post_register_wait is meant to call a continuation and not to
7626 // return, which is why we don't support FILTER_REGISTER_WAIT if
7627 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
7628 // waits isn't the last.
7629 //
7630 // It is implementable, but not used by any userspace code at the
7631 // moment, so for now return ENOTSUP if someone tries to do it.
7632 if (nchanges == 1 && noutputs < nevents &&
7633 (flags & KEVENT_FLAG_KERNEL) == 0 &&
7634 (flags & KEVENT_FLAG_PARKING) == 0 &&
7635 (flags & KEVENT_FLAG_ERROR_EVENTS) &&
7636 (flags & KEVENT_FLAG_WORKLOOP)) {
7637 uthread_t ut = get_bsdthread_info(thread);
7638
7639 /*
7640 * store the continuation/completion data in the uthread
7641 *
7642 * Note: the kectx aliases with this,
7643 * and is destroyed in the process.
7644 */
7645 ut->uu_save.uus_kevent_register = (struct _kevent_register){
7646 .kev = kev,
7647 .kqwl = kqu.kqwl,
7648 .eventout = noutputs,
7649 .ueventlist = ueventlist,
7650 };
7651 knote_fops(kn)->f_post_register_wait(ut, kn,
7652 &ut->uu_save.uus_kevent_register);
7653 __builtin_unreachable();
7654 }
7655 kqunlock(kqu);
7656
7657 kev.flags |= EV_ERROR;
7658 kev.data = ENOTSUP;
7659 } else {
7660 assert((register_rc & FILTER_REGISTER_WAIT) == 0);
7661 }
7662
7663 // keep in sync with kevent_register_wait_return()
7664 if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
7665 if ((kev.flags & EV_ERROR) == 0) {
7666 kev.flags |= EV_ERROR;
7667 kev.data = 0;
7668 }
7669 if (legacy) {
7670 error = kevent_legacy_copyout(&kev, &ueventlist, flags);
7671 } else {
7672 error = kevent_modern_copyout(&kev, &ueventlist);
7673 }
7674 if (error == 0) {
7675 noutputs++;
7676 }
7677 } else if (kev.flags & EV_ERROR) {
7678 error = (int)kev.data;
7679 }
7680 nchanges--;
7681 }
7682
7683 if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
7684 nevents > 0 && noutputs == 0 && error == 0) {
7685 kectx->kec_process_flags = flags;
7686 kectx->kec_process_nevents = nevents;
7687 kectx->kec_process_noutputs = 0;
7688 kectx->kec_process_eventlist = ueventlist;
7689
7690 if (legacy) {
7691 error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
7692 } else {
7693 error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
7694 }
7695
7696 noutputs = kectx->kec_process_noutputs;
7697 } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
7698 /*
7699 * If we didn't go through kqworkloop_end_processing(),
7700 * we need to do it here.
7701 *
7702 * kqueue_scan will call kqworkloop_end_processing(),
7703 * so we only need to do it if we didn't scan.
7704 */
7705 kqlock(kqu);
7706 kqworkloop_end_processing(kqu.kqwl, 0, 0);
7707 kqunlock(kqu);
7708 }
7709
7710 *retval = noutputs;
7711 out:
7712 return kevent_cleanup(kqu.kq, flags, error, kectx);
7713 }
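/*
 * Editor's note: a small userspace sketch (not part of this file) of the
 * "register then scan" flow implemented by kevent_internal() above. Passing
 * EV_RECEIPT with a change forces a per-change receipt to be copied out with
 * EV_ERROR set and data holding 0 or the registration errno, instead of the
 * error being folded into the syscall return value. Identifiers are
 * illustrative only.
 */
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int kq = kqueue();
	int fds[2];
	struct kevent change, receipt;

	pipe(fds);
	EV_SET(&change, fds[0], EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);

	/* nevents > 0, so the receipt is reported as an EV_ERROR event and
	 * the call returns immediately instead of scanning for events */
	int n = kevent(kq, &change, 1, &receipt, 1, NULL);
	if (n == 1 && (receipt.flags & EV_ERROR)) {
		/* data == 0 means the registration succeeded */
		printf("registration status: %ld\n", (long)receipt.data);
	}
	close(fds[0]);
	close(fds[1]);
	close(kq);
	return 0;
}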
7714
7715 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7716
7717 /*!
7718 * @function kevent_modern_internal
7719 *
7720 * @brief
7721 * The backend of the kevent_id and kevent_workq_internal entry points.
7722 *
7723 * @discussion
7724 * Needs to be noinline due to the number of arguments.
7725 */
7726 OS_NOINLINE
7727 static int
7728 kevent_modern_internal(kqueue_t kqu,
7729 user_addr_t changelist, int nchanges,
7730 user_addr_t ueventlist, int nevents,
7731 int flags, kevent_ctx_t kectx, int32_t *retval)
7732 {
7733 return kevent_internal(kqu.kq, changelist, nchanges,
7734 ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
7735 }
7736
7737 /*!
7738 * @function kevent_id
7739 *
7740 * @brief
7741 * The kevent_id() syscall.
7742 */
7743 int
7744 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
7745 {
7746 int error, flags = uap->flags & KEVENT_FLAG_USER;
7747 uthread_t uth = current_uthread();
7748 workq_threadreq_t kqr = uth->uu_kqr_bound;
7749 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7750 kqueue_t kqu;
7751
7752 flags = kevent_adjust_flags_for_proc(p, flags);
7753 flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
7754
7755 if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
7756 KEVENT_FLAG_WORKLOOP)) {
7757 return EINVAL;
7758 }
7759
7760 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
7761 if (__improbable(error)) {
7762 return error;
7763 }
7764
7765 kectx->kec_deadline = 0;
7766 kectx->kec_fp = NULL;
7767 kectx->kec_fd = -1;
7768 /* the kec_process_* fields are only filled if kqueue_scan is called */
7769
7770 /*
7771 * Get the kq we are going to be working on
7772 * As a fastpath, look at the currently bound workloop.
7773 */
7774 kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
7775 if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
7776 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
7777 return EEXIST;
7778 }
7779 kqworkloop_retain(kqu.kqwl);
7780 } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
7781 return EXDEV;
7782 } else {
7783 error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
7784 if (__improbable(error)) {
7785 return error;
7786 }
7787 }
7788
7789 return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
7790 uap->eventlist, uap->nevents, flags, kectx, retval);
7791 }
7792
7793 /*!
7794 * @function kevent_workq_internal
7795 *
7796 * @discussion
7797 * This function is exported for the sake of the workqueue subsystem.
7798 *
7799 * It is called in two ways:
7800 * - when a thread is about to go to userspace to ask for pending events
7801 * - when a thread is returning from userspace, handing events back
7802 *
7803 * the workqueue subsystem will only use the following flags:
7804 * - KEVENT_FLAG_STACK_DATA (always)
7805 * - KEVENT_FLAG_IMMEDIATE (always)
7806 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
7807 * userspace).
7808 *
7809 * It implicitly acts on the bound kqueue, and for the case of workloops
7810 * will copyout the kqueue ID before anything else.
7811 *
7812 *
7813 * Pthread will have set up the various arguments to fit this stack layout:
7814 *
7815 * +-------....----+--------------+-----------+--------------------+
7816 * | user stack | data avail | nevents | pthread_self() |
7817 * +-------....----+--------------+-----------+--------------------+
7818 * ^ ^
7819 * data_out eventlist
7820 *
7821 * When a workloop is used, the workloop ID is copied out right before
7822 * the eventlist and is taken from the data buffer.
7823 *
7824 * @warning
7825 * This function is carefully tailored to not make any call except the final tail
7826 * call into kevent_modern_internal. (LTO inlines current_uthread()).
7827 *
7828 * This function is performance sensitive due to the workq subsystem.
7829 */
7830 int
7831 kevent_workq_internal(struct proc *p,
7832 user_addr_t changelist, int nchanges,
7833 user_addr_t eventlist, int nevents,
7834 user_addr_t data_out, user_size_t *data_available,
7835 unsigned int flags, int32_t *retval)
7836 {
7837 uthread_t uth = current_uthread();
7838 workq_threadreq_t kqr = uth->uu_kqr_bound;
7839 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7840 kqueue_t kqu;
7841
7842 assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
7843 flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
7844
7845 kectx->kec_data_out = data_out;
7846 kectx->kec_data_avail = (uint64_t)data_available;
7847 kectx->kec_data_size = *data_available;
7848 kectx->kec_data_resid = *data_available;
7849 kectx->kec_deadline = 0;
7850 kectx->kec_fp = NULL;
7851 kectx->kec_fd = -1;
7852 /* the kec_process_* fields are only filled if kqueue_scan is called */
7853
7854 flags = kevent_adjust_flags_for_proc(p, flags);
7855
7856 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
7857 kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
7858 kqworkloop_retain(kqu.kqwl);
7859
7860 flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
7861 KEVENT_FLAG_KERNEL;
7862 } else {
7863 kqu.kqwq = p->p_fd->fd_wqkqueue;
7864
7865 flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
7866 }
7867
7868 return kevent_modern_internal(kqu, changelist, nchanges,
7869 eventlist, nevents, flags, kectx, retval);
7870 }
7871
7872 /*!
7873 * @function kevent_qos
7874 *
7875 * @brief
7876 * The kevent_qos() syscall.
7877 */
7878 int
7879 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
7880 {
7881 uthread_t uth = current_uthread();
7882 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7883 int error, flags = uap->flags & KEVENT_FLAG_USER;
7884 struct kqueue *kq;
7885
7886 if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
7887 return EINVAL;
7888 }
7889
7890 flags = kevent_adjust_flags_for_proc(p, flags);
7891
7892 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
7893 if (__improbable(error)) {
7894 return error;
7895 }
7896
7897 kectx->kec_deadline = 0;
7898 kectx->kec_fp = NULL;
7899 kectx->kec_fd = uap->fd;
7900 /* the kec_process_* fields are only filled if kqueue_scan is called */
7901
7902 /* get the kq we are going to be working on */
7903 if (__probable(flags & KEVENT_FLAG_WORKQ)) {
7904 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
7905 } else {
7906 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
7907 }
7908 if (__improbable(error)) {
7909 return error;
7910 }
7911
7912 return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
7913 uap->eventlist, uap->nevents, flags, kectx, retval);
7914 }
7915
7916 #pragma mark legacy syscalls: kevent, kevent64
7917
7918 /*!
7919 * @function kevent_legacy_get_deadline
7920 *
7921 * @brief
7922 * Compute the deadline for the legacy kevent syscalls.
7923 *
7924 * @discussion
7925 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
7926 * as this takes precedence over the deadline.
7927 *
7928 * This function will fail if utimeout is USER_ADDR_NULL
7929 * (the caller should check).
7930 */
7931 static int
7932 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
7933 {
7934 struct timespec ts;
7935
7936 if (flags & KEVENT_FLAG_PROC64) {
7937 struct user64_timespec ts64;
7938 int error = copyin(utimeout, &ts64, sizeof(ts64));
7939 if (__improbable(error)) {
7940 return error;
7941 }
7942 ts.tv_sec = (unsigned long)ts64.tv_sec;
7943 ts.tv_nsec = (long)ts64.tv_nsec;
7944 } else {
7945 struct user32_timespec ts32;
7946 int error = copyin(utimeout, &ts32, sizeof(ts32));
7947 if (__improbable(error)) {
7948 return error;
7949 }
7950 ts.tv_sec = ts32.tv_sec;
7951 ts.tv_nsec = ts32.tv_nsec;
7952 }
7953 if (!timespec_is_valid(&ts)) {
7954 return EINVAL;
7955 }
7956
7957 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
7958 return 0;
7959 }
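/*
 * Editor's note: an illustrative userspace snippet (not part of this file).
 * The timespec passed to the legacy kevent()/kevent64() syscalls is copied in
 * and validated here; a malformed value (negative tv_sec, or tv_nsec outside
 * [0, 1e9)) makes the syscall fail with EINVAL before any event processing.
 */
#include <sys/event.h>
#include <assert.h>
#include <errno.h>

static void
bad_timeout_example(int kq)
{
	struct kevent ev;
	struct timespec bad = { .tv_sec = 0, .tv_nsec = 1000000000L }; /* must be < 1e9 */

	int n = kevent(kq, NULL, 0, &ev, 1, &bad);
	assert(n == -1 && errno == EINVAL);
}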
7960
7961 /*!
7962 * @function kevent_legacy_internal
7963 *
7964 * @brief
7965 * The core implementation for kevent and kevent64
7966 */
7967 OS_NOINLINE
7968 static int
7969 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
7970 int32_t *retval, int flags)
7971 {
7972 uthread_t uth = current_uthread();
7973 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7974 struct kqueue *kq;
7975 int error;
7976
7977 if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
7978 return EINVAL;
7979 }
7980
7981 flags = kevent_adjust_flags_for_proc(p, flags);
7982
7983 kectx->kec_data_out = 0;
7984 kectx->kec_data_avail = 0;
7985 kectx->kec_data_size = 0;
7986 kectx->kec_data_resid = 0;
7987 kectx->kec_deadline = 0;
7988 kectx->kec_fp = NULL;
7989 kectx->kec_fd = uap->fd;
7990 /* the kec_process_* fields are only filled if kqueue_scan is called */
7991
7992 /* convert timeout to absolute - if we have one (and not immediate) */
7993 if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
7994 error = kevent_legacy_get_deadline(flags, uap->timeout,
7995 &kectx->kec_deadline);
7996 if (__improbable(error)) {
7997 return error;
7998 }
7999 }
8000
8001 /* get the kq we are going to be working on */
8002 if (flags & KEVENT_FLAG_WORKQ) {
8003 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8004 } else {
8005 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8006 }
8007 if (__improbable(error)) {
8008 return error;
8009 }
8010
8011 return kevent_internal(kq, uap->changelist, uap->nchanges,
8012 uap->eventlist, uap->nevents, flags, kectx, retval,
8013 /*legacy*/ true);
8014 }
8015
8016 /*!
8017 * @function kevent
8018 *
8019 * @brief
8020 * The legacy kevent() syscall.
8021 */
8022 int
8023 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8024 {
8025 struct kevent64_args args = {
8026 .fd = uap->fd,
8027 .changelist = uap->changelist,
8028 .nchanges = uap->nchanges,
8029 .eventlist = uap->eventlist,
8030 .nevents = uap->nevents,
8031 .timeout = uap->timeout,
8032 };
8033
8034 return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8035 }
8036
8037 /*!
8038 * @function kevent64
8039 *
8040 * @brief
8041 * The legacy kevent64() syscall.
8042 */
8043 int
8044 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8045 {
8046 int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8047 return kevent_legacy_internal(p, uap, retval, flags);
8048 }
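/*
 * Editor's note: a brief userspace sketch (not part of this file) of the
 * kevent64(2) variant handled above. It behaves like kevent(2) but uses
 * struct kevent64_s, whose ident/udata fields are always 64-bit and which
 * carries two extra ext[] words; public flags such as KEVENT_FLAG_IMMEDIATE
 * can be passed directly. Identifiers are illustrative only.
 */
#include <sys/event.h>

static int
poll_fd_once_64(int kq, int fd)
{
	struct kevent64_s change, event;

	EV_SET64(&change, fd, EVFILT_READ, EV_ADD, 0, 0, /*udata*/ 0, 0, 0);

	/* register and immediately poll, without blocking */
	return kevent64(kq, &change, 1, &event, 1, KEVENT_FLAG_IMMEDIATE, NULL);
}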
8049
8050 #pragma mark - socket interface
8051
8052 #if SOCKETS
8053 #include <sys/param.h>
8054 #include <sys/socket.h>
8055 #include <sys/protosw.h>
8056 #include <sys/domain.h>
8057 #include <sys/mbuf.h>
8058 #include <sys/kern_event.h>
8059 #include <sys/malloc.h>
8060 #include <sys/sys_domain.h>
8061 #include <sys/syslog.h>
8062
8063 #ifndef ROUNDUP64
8064 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8065 #endif
8066
8067 #ifndef ADVANCE64
8068 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8069 #endif
8070
8071 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8072 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8073
8074 static int kev_attach(struct socket *so, int proto, struct proc *p);
8075 static int kev_detach(struct socket *so);
8076 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8077 struct ifnet *ifp, struct proc *p);
8078 static lck_mtx_t * event_getlock(struct socket *, int);
8079 static int event_lock(struct socket *, int, void *);
8080 static int event_unlock(struct socket *, int, void *);
8081
8082 static int event_sofreelastref(struct socket *);
8083 static void kev_delete(struct kern_event_pcb *);
8084
8085 static struct pr_usrreqs event_usrreqs = {
8086 .pru_attach = kev_attach,
8087 .pru_control = kev_control,
8088 .pru_detach = kev_detach,
8089 .pru_soreceive = soreceive,
8090 };
8091
8092 static struct protosw eventsw[] = {
8093 {
8094 .pr_type = SOCK_RAW,
8095 .pr_protocol = SYSPROTO_EVENT,
8096 .pr_flags = PR_ATOMIC,
8097 .pr_usrreqs = &event_usrreqs,
8098 .pr_lock = event_lock,
8099 .pr_unlock = event_unlock,
8100 .pr_getlock = event_getlock,
8101 }
8102 };
8103
8104 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8105 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8106
8107 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8108 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8109
8110 struct kevtstat kevtstat;
8111 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8112 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8113 kevt_getstat, "S,kevtstat", "");
8114
8115 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8116 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8117 kevt_pcblist, "S,xkevtpcb", "");
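/*
 * Editor's note: an illustrative userspace sketch (not part of this file) for
 * reading the statistics exported by the SYSCTL_PROC declarations above. The
 * MIB name "net.systm.kevt.stats" is inferred from the _net_systm_kevt node;
 * since struct kevtstat lives in a private header, the payload is read here
 * into a raw byte buffer.
 */
#include <sys/sysctl.h>
#include <stdio.h>

static void
dump_kevt_stats(void)
{
	unsigned char buf[256];                 /* large enough for struct kevtstat */
	size_t len = sizeof(buf);

	if (sysctlbyname("net.systm.kevt.stats", buf, &len, NULL, 0) == 0) {
		printf("kevtstat: %zu bytes\n", len);
	} else {
		perror("sysctlbyname(net.systm.kevt.stats)");
	}
}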
8118
8119 static lck_mtx_t *
8120 event_getlock(struct socket *so, int flags)
8121 {
8122 #pragma unused(flags)
8123 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8124
8125 if (so->so_pcb != NULL) {
8126 if (so->so_usecount < 0) {
8127 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8128 so, so->so_usecount, solockhistory_nr(so));
8129 /* NOTREACHED */
8130 }
8131 } else {
8132 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
8133 so, solockhistory_nr(so));
8134 /* NOTREACHED */
8135 }
8136 return &ev_pcb->evp_mtx;
8137 }
8138
8139 static int
8140 event_lock(struct socket *so, int refcount, void *lr)
8141 {
8142 void *lr_saved;
8143
8144 if (lr == NULL) {
8145 lr_saved = __builtin_return_address(0);
8146 } else {
8147 lr_saved = lr;
8148 }
8149
8150 if (so->so_pcb != NULL) {
8151 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8152 } else {
8153 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
8154 so, lr_saved, solockhistory_nr(so));
8155 /* NOTREACHED */
8156 }
8157
8158 if (so->so_usecount < 0) {
8159 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
8160 so, so->so_pcb, lr_saved, so->so_usecount,
8161 solockhistory_nr(so));
8162 /* NOTREACHED */
8163 }
8164
8165 if (refcount) {
8166 so->so_usecount++;
8167 }
8168
8169 so->lock_lr[so->next_lock_lr] = lr_saved;
8170 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
8171 return 0;
8172 }
8173
8174 static int
8175 event_unlock(struct socket *so, int refcount, void *lr)
8176 {
8177 void *lr_saved;
8178 lck_mtx_t *mutex_held;
8179
8180 if (lr == NULL) {
8181 lr_saved = __builtin_return_address(0);
8182 } else {
8183 lr_saved = lr;
8184 }
8185
8186 if (refcount) {
8187 so->so_usecount--;
8188 }
8189 if (so->so_usecount < 0) {
8190 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8191 so, so->so_usecount, solockhistory_nr(so));
8192 /* NOTREACHED */
8193 }
8194 if (so->so_pcb == NULL) {
8195 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
8196 so, so->so_usecount, (void *)lr_saved,
8197 solockhistory_nr(so));
8198 /* NOTREACHED */
8199 }
8200 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8201
8202 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8203 so->unlock_lr[so->next_unlock_lr] = lr_saved;
8204 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
8205
8206 if (so->so_usecount == 0) {
8207 VERIFY(so->so_flags & SOF_PCBCLEARING);
8208 event_sofreelastref(so);
8209 } else {
8210 lck_mtx_unlock(mutex_held);
8211 }
8212
8213 return 0;
8214 }
8215
8216 static int
8217 event_sofreelastref(struct socket *so)
8218 {
8219 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8220
8221 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8222
8223 so->so_pcb = NULL;
8224
8225 /*
8226 * Disable upcall in the event another thread is in kev_post_msg()
8227 * appending a record to the receive socket buffer, since sbwakeup()
8228 * may release the socket lock otherwise.
8229 */
8230 so->so_rcv.sb_flags &= ~SB_UPCALL;
8231 so->so_snd.sb_flags &= ~SB_UPCALL;
8232 so->so_event = sonullevent;
8233 lck_mtx_unlock(&(ev_pcb->evp_mtx));
8234
8235 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8236 lck_rw_lock_exclusive(&kev_rwlock);
8237 LIST_REMOVE(ev_pcb, evp_link);
8238 kevtstat.kes_pcbcount--;
8239 kevtstat.kes_gencnt++;
8240 lck_rw_done(&kev_rwlock);
8241 kev_delete(ev_pcb);
8242
8243 sofreelastref(so, 1);
8244 return 0;
8245 }
8246
8247 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8248
8249 static
8250 struct kern_event_head kern_event_head;
8251
8252 static u_int32_t static_event_id = 0;
8253
8254 static ZONE_DECLARE(ev_pcb_zone, "kerneventpcb",
8255 sizeof(struct kern_event_pcb), ZC_ZFREE_CLEARMEM);
8256
8257 /*
8258 * Install the protosw entries for the NKE manager. Invoked at extension load time.
8259 */
8260 void
8261 kern_event_init(struct domain *dp)
8262 {
8263 struct protosw *pr;
8264 int i;
8265
8266 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8267 VERIFY(dp == systemdomain);
8268
8269 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8270 net_add_proto(pr, dp, 1);
8271 }
8272 }
8273
8274 static int
8275 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8276 {
8277 int error = 0;
8278 struct kern_event_pcb *ev_pcb;
8279
8280 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8281 if (error != 0) {
8282 return error;
8283 }
8284
8285 ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8286 lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8287
8288 ev_pcb->evp_socket = so;
8289 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8290
8291 so->so_pcb = (caddr_t) ev_pcb;
8292 lck_rw_lock_exclusive(&kev_rwlock);
8293 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8294 kevtstat.kes_pcbcount++;
8295 kevtstat.kes_gencnt++;
8296 lck_rw_done(&kev_rwlock);
8297
8298 return error;
8299 }
8300
8301 static void
8302 kev_delete(struct kern_event_pcb *ev_pcb)
8303 {
8304 VERIFY(ev_pcb != NULL);
8305 lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8306 zfree(ev_pcb_zone, ev_pcb);
8307 }
8308
8309 static int
8310 kev_detach(struct socket *so)
8311 {
8312 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8313
8314 if (ev_pcb != NULL) {
8315 soisdisconnected(so);
8316 so->so_flags |= SOF_PCBCLEARING;
8317 }
8318
8319 return 0;
8320 }
8321
8322 /*
8323 * For now, kev_vendor_code and mbuf_tags use the same
8324 * mechanism.
8325 */
8326 errno_t
8327 kev_vendor_code_find(
8328 const char *string,
8329 u_int32_t *out_vendor_code)
8330 {
8331 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8332 return EINVAL;
8333 }
8334 return net_str_id_find_internal(string, out_vendor_code,
8335 NSI_VENDOR_CODE, 1);
8336 }
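/*
 * Editor's note: a kernel-extension-side sketch (not part of this file)
 * showing the intended use of the kev_vendor_code_find() and kev_msg_post()
 * KPIs defined here. The vendor string, class/subclass numbers and payload
 * are illustrative; the dv[] vector is terminated by a zero data_length,
 * matching the loop in kev_post_msg() below.
 */
#include <sys/types.h>
#include <sys/kern_event.h>

static errno_t
post_example_event(void)
{
	u_int32_t vendor_code;
	u_int32_t payload = 42;                 /* arbitrary event payload */
	struct kev_msg msg = { 0 };
	errno_t err;

	err = kev_vendor_code_find("com.example.driver", &vendor_code);
	if (err != 0) {
		return err;
	}

	msg.vendor_code = vendor_code;
	msg.kev_class = 1;                      /* illustrative class/subclass numbers */
	msg.kev_subclass = 1;
	msg.event_code = 1;
	msg.dv[0].data_length = sizeof(payload);
	msg.dv[0].data_ptr = &payload;
	msg.dv[1].data_length = 0;              /* terminates the vector */

	return kev_msg_post(&msg);
}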
8337
8338 errno_t
8339 kev_msg_post(struct kev_msg *event_msg)
8340 {
8341 mbuf_tag_id_t min_vendor, max_vendor;
8342
8343 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8344
8345 if (event_msg == NULL) {
8346 return EINVAL;
8347 }
8348
8349 /*
8350 * Limit third parties to posting events for registered vendor codes
8351 * only
8352 */
8353 if (event_msg->vendor_code < min_vendor ||
8354 event_msg->vendor_code > max_vendor) {
8355 os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8356 return EINVAL;
8357 }
8358 return kev_post_msg(event_msg);
8359 }
8360
8361 int
8362 kev_post_msg(struct kev_msg *event_msg)
8363 {
8364 struct mbuf *m, *m2;
8365 struct kern_event_pcb *ev_pcb;
8366 struct kern_event_msg *ev;
8367 char *tmp;
8368 u_int32_t total_size;
8369 int i;
8370
8371 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8372 total_size = KEV_MSG_HEADER_SIZE;
8373
8374 for (i = 0; i < 5; i++) {
8375 if (event_msg->dv[i].data_length == 0) {
8376 break;
8377 }
8378 total_size += event_msg->dv[i].data_length;
8379 }
8380
8381 if (total_size > MLEN) {
8382 os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8383 return EMSGSIZE;
8384 }
8385
8386 m = m_get(M_WAIT, MT_DATA);
8387 if (m == 0) {
8388 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8389 return ENOMEM;
8390 }
8391 ev = mtod(m, struct kern_event_msg *);
8392 total_size = KEV_MSG_HEADER_SIZE;
8393
8394 tmp = (char *) &ev->event_data[0];
8395 for (i = 0; i < 5; i++) {
8396 if (event_msg->dv[i].data_length == 0) {
8397 break;
8398 }
8399
8400 total_size += event_msg->dv[i].data_length;
8401 bcopy(event_msg->dv[i].data_ptr, tmp,
8402 event_msg->dv[i].data_length);
8403 tmp += event_msg->dv[i].data_length;
8404 }
8405
8406 ev->id = ++static_event_id;
8407 ev->total_size = total_size;
8408 ev->vendor_code = event_msg->vendor_code;
8409 ev->kev_class = event_msg->kev_class;
8410 ev->kev_subclass = event_msg->kev_subclass;
8411 ev->event_code = event_msg->event_code;
8412
8413 m->m_len = total_size;
8414 lck_rw_lock_shared(&kev_rwlock);
8415 for (ev_pcb = LIST_FIRST(&kern_event_head);
8416 ev_pcb;
8417 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8418 lck_mtx_lock(&ev_pcb->evp_mtx);
8419 if (ev_pcb->evp_socket->so_pcb == NULL) {
8420 lck_mtx_unlock(&ev_pcb->evp_mtx);
8421 continue;
8422 }
8423 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8424 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8425 lck_mtx_unlock(&ev_pcb->evp_mtx);
8426 continue;
8427 }
8428
8429 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8430 if (ev_pcb->evp_class_filter != ev->kev_class) {
8431 lck_mtx_unlock(&ev_pcb->evp_mtx);
8432 continue;
8433 }
8434
8435 if ((ev_pcb->evp_subclass_filter !=
8436 KEV_ANY_SUBCLASS) &&
8437 (ev_pcb->evp_subclass_filter !=
8438 ev->kev_subclass)) {
8439 lck_mtx_unlock(&ev_pcb->evp_mtx);
8440 continue;
8441 }
8442 }
8443 }
8444
8445 m2 = m_copym(m, 0, m->m_len, M_WAIT);
8446 if (m2 == 0) {
8447 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8448 m_free(m);
8449 lck_mtx_unlock(&ev_pcb->evp_mtx);
8450 lck_rw_done(&kev_rwlock);
8451 return ENOMEM;
8452 }
8453 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8454 /*
8455 * We use "m" for the socket stats as it would be
8456 * unsafe to use "m2"
8457 */
8458 so_inc_recv_data_stat(ev_pcb->evp_socket,
8459 1, m->m_len, MBUF_TC_BE);
8460
8461 sorwakeup(ev_pcb->evp_socket);
8462 os_atomic_inc(&kevtstat.kes_posted, relaxed);
8463 } else {
8464 os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
8465 }
8466 lck_mtx_unlock(&ev_pcb->evp_mtx);
8467 }
8468 m_free(m);
8469 lck_rw_done(&kev_rwlock);
8470
8471 return 0;
8472 }
8473
8474 static int
8475 kev_control(struct socket *so,
8476 u_long cmd,
8477 caddr_t data,
8478 __unused struct ifnet *ifp,
8479 __unused struct proc *p)
8480 {
8481 struct kev_request *kev_req = (struct kev_request *) data;
8482 struct kern_event_pcb *ev_pcb;
8483 struct kev_vendor_code *kev_vendor;
8484 u_int32_t *id_value = (u_int32_t *) data;
8485
8486 switch (cmd) {
8487 case SIOCGKEVID:
8488 *id_value = static_event_id;
8489 break;
8490 case SIOCSKEVFILT:
8491 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8492 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
8493 ev_pcb->evp_class_filter = kev_req->kev_class;
8494 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
8495 break;
8496 case SIOCGKEVFILT:
8497 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8498 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
8499 kev_req->kev_class = ev_pcb->evp_class_filter;
8500 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
8501 break;
8502 case SIOCGKEVVENDOR:
8503 kev_vendor = (struct kev_vendor_code *)data;
8504 /* Make sure string is NULL terminated */
8505 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
8506 return net_str_id_find_internal(kev_vendor->vendor_string,
8507 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
8508 default:
8509 return ENOTSUP;
8510 }
8511
8512 return 0;
8513 }
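/*
 * Editor's note: a userspace sketch (not part of this file) of the consumer
 * side of this protocol: open a PF_SYSTEM/SYSPROTO_EVENT socket, install a
 * filter with SIOCSKEVFILT (handled by kev_control() above), then read
 * struct kern_event_msg records posted by kev_post_msg(). The filter values
 * shown (Apple vendor, network class) are just one common choice.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_event.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

static void
watch_kernel_events(void)
{
	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
	struct kev_request req = {
		.vendor_code = KEV_VENDOR_APPLE,
		.kev_class = KEV_NETWORK_CLASS,
		.kev_subclass = KEV_ANY_SUBCLASS,
	};
	char buf[1024];

	if (s < 0 || ioctl(s, SIOCSKEVFILT, &req) < 0) {
		perror("kernel event socket");
		return;
	}

	/* each recv() returns one complete kern_event_msg record (PR_ATOMIC) */
	ssize_t n = recv(s, buf, sizeof(buf), 0);
	if (n >= (ssize_t)sizeof(struct kern_event_msg)) {
		struct kern_event_msg *msg = (struct kern_event_msg *)buf;
		printf("event class %u subclass %u code %u (%u bytes)\n",
		    msg->kev_class, msg->kev_subclass, msg->event_code,
		    msg->total_size);
	}
	close(s);
}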
8514
8515 int
8516 kevt_getstat SYSCTL_HANDLER_ARGS
8517 {
8518 #pragma unused(oidp, arg1, arg2)
8519 int error = 0;
8520
8521 lck_rw_lock_shared(&kev_rwlock);
8522
8523 if (req->newptr != USER_ADDR_NULL) {
8524 error = EPERM;
8525 goto done;
8526 }
8527 if (req->oldptr == USER_ADDR_NULL) {
8528 req->oldidx = sizeof(struct kevtstat);
8529 goto done;
8530 }
8531
8532 error = SYSCTL_OUT(req, &kevtstat,
8533 MIN(sizeof(struct kevtstat), req->oldlen));
8534 done:
8535 lck_rw_done(&kev_rwlock);
8536
8537 return error;
8538 }
8539
8540 __private_extern__ int
8541 kevt_pcblist SYSCTL_HANDLER_ARGS
8542 {
8543 #pragma unused(oidp, arg1, arg2)
8544 int error = 0;
8545 uint64_t n, i;
8546 struct xsystmgen xsg;
8547 void *buf = NULL;
8548 size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8549 ROUNDUP64(sizeof(struct xsocket_n)) +
8550 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8551 ROUNDUP64(sizeof(struct xsockstat_n));
8552 struct kern_event_pcb *ev_pcb;
8553
8554 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
8555 if (buf == NULL) {
8556 return ENOMEM;
8557 }
8558
8559 lck_rw_lock_shared(&kev_rwlock);
8560
8561 n = kevtstat.kes_pcbcount;
8562
8563 if (req->oldptr == USER_ADDR_NULL) {
8564 req->oldidx = (size_t) ((n + n / 8) * item_size);
8565 goto done;
8566 }
8567 if (req->newptr != USER_ADDR_NULL) {
8568 error = EPERM;
8569 goto done;
8570 }
8571 bzero(&xsg, sizeof(xsg));
8572 xsg.xg_len = sizeof(xsg);
8573 xsg.xg_count = n;
8574 xsg.xg_gen = kevtstat.kes_gencnt;
8575 xsg.xg_sogen = so_gencnt;
8576 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8577 if (error) {
8578 goto done;
8579 }
8580 /*
8581 * We are done if there is no pcb
8582 */
8583 if (n == 0) {
8584 goto done;
8585 }
8586
8587 i = 0;
8588 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
8589 i < n && ev_pcb != NULL;
8590 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8591 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
8592 struct xsocket_n *xso = (struct xsocket_n *)
8593 ADVANCE64(xk, sizeof(*xk));
8594 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
8595 ADVANCE64(xso, sizeof(*xso));
8596 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
8597 ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8598 struct xsockstat_n *xsostats = (struct xsockstat_n *)
8599 ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8600
8601 bzero(buf, item_size);
8602
8603 lck_mtx_lock(&ev_pcb->evp_mtx);
8604
8605 xk->kep_len = sizeof(struct xkevtpcb);
8606 xk->kep_kind = XSO_EVT;
8607 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8608 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8609 xk->kep_class_filter = ev_pcb->evp_class_filter;
8610 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8611
8612 sotoxsocket_n(ev_pcb->evp_socket, xso);
8613 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8614 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8615 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8616 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8617 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8618
8619 lck_mtx_unlock(&ev_pcb->evp_mtx);
8620
8621 error = SYSCTL_OUT(req, buf, item_size);
8622 }
8623
8624 if (error == 0) {
8625 /*
8626 * Give the user an updated idea of our state.
8627 * If the generation differs from what we told
8628 * her before, she knows that something happened
8629 * while we were processing this request, and it
8630 * might be necessary to retry.
8631 */
8632 bzero(&xsg, sizeof(xsg));
8633 xsg.xg_len = sizeof(xsg);
8634 xsg.xg_count = n;
8635 xsg.xg_gen = kevtstat.kes_gencnt;
8636 xsg.xg_sogen = so_gencnt;
8637 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8638 if (error) {
8639 goto done;
8640 }
8641 }
8642
8643 done:
8644 lck_rw_done(&kev_rwlock);
8645
8646 if (buf != NULL) {
8647 FREE(buf, M_TEMP);
8648 }
8649
8650 return error;
8651 }
8652
8653 #endif /* SOCKETS */
8654
8655
8656 int
8657 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
8658 {
8659 struct vinfo_stat * st;
8660
8661 st = &kinfo->kq_stat;
8662
8663 st->vst_size = kq->kq_count;
8664 if (kq->kq_state & KQ_KEV_QOS) {
8665 st->vst_blksize = sizeof(struct kevent_qos_s);
8666 } else if (kq->kq_state & KQ_KEV64) {
8667 st->vst_blksize = sizeof(struct kevent64_s);
8668 } else {
8669 st->vst_blksize = sizeof(struct kevent);
8670 }
8671 st->vst_mode = S_IFIFO;
8672 st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
8673 ((struct kqworkloop *)kq)->kqwl_dynamicid : 0;
8674
8675 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
8676 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
8677 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
8678
8679 return 0;
8680 }
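/*
 * Editor's note: a userspace sketch (not part of this file) showing how the
 * information filled in by fill_kqueueinfo() can be retrieved through
 * libproc, assuming the PROC_PIDFDKQUEUEINFO flavor of proc_pidfdinfo() and
 * struct kqueue_fdinfo from <sys/proc_info.h>.
 */
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_kqueue_fdinfo(pid_t pid, int kq_fd)
{
	struct kqueue_fdinfo kqfi;

	int n = proc_pidfdinfo(pid, kq_fd, PROC_PIDFDKQUEUEINFO,
	    &kqfi, sizeof(kqfi));
	if (n == (int)sizeof(kqfi)) {
		printf("kqueue fd %d: %lld queued events, state 0x%x\n",
		    kq_fd, (long long)kqfi.kqueueinfo.kq_stat.vst_size,
		    kqfi.kqueueinfo.kq_state);
	}
}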
8681
8682 static int
8683 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
8684 {
8685 workq_threadreq_t kqr = &kqwl->kqwl_request;
8686 workq_threadreq_param_t trp = {};
8687 int err;
8688
8689 if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
8690 return EINVAL;
8691 }
8692
8693 if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
8694 return err;
8695 }
8696
8697 kqlock(kqwl);
8698
8699 kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
8700 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
8701 kqdi->kqdi_request_state = kqr->tr_state;
8702 kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
8703 kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
8704 kqdi->kqdi_sync_waiters = 0;
8705 kqdi->kqdi_sync_waiter_qos = 0;
8706
8707 trp.trp_value = kqwl->kqwl_params;
8708 if (trp.trp_flags & TRP_PRIORITY) {
8709 kqdi->kqdi_pri = trp.trp_pri;
8710 } else {
8711 kqdi->kqdi_pri = 0;
8712 }
8713
8714 if (trp.trp_flags & TRP_POLICY) {
8715 kqdi->kqdi_pol = trp.trp_pol;
8716 } else {
8717 kqdi->kqdi_pol = 0;
8718 }
8719
8720 if (trp.trp_flags & TRP_CPUPERCENT) {
8721 kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8722 } else {
8723 kqdi->kqdi_cpupercent = 0;
8724 }
8725
8726 kqunlock(kqwl);
8727
8728 return 0;
8729 }
8730
8731
8732 void
8733 knote_markstayactive(struct knote *kn)
8734 {
8735 struct kqueue *kq = knote_get_kq(kn);
8736 kq_index_t qos;
8737
8738 kqlock(kq);
8739 kn->kn_status |= KN_STAYACTIVE;
8740
8741 /*
8742 * Making a knote stay active is a property of the knote that must be
8743 * established before it is fully attached.
8744 */
8745 assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);
8746
8747 /* handle all stayactive knotes on the (appropriate) manager */
8748 if (kq->kq_state & KQ_WORKLOOP) {
8749 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8750
8751 qos = _pthread_priority_thread_qos(kn->kn_qos);
8752 assert(qos && qos < THREAD_QOS_LAST);
8753 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
8754 qos = KQWL_BUCKET_STAYACTIVE;
8755 } else if (kq->kq_state & KQ_WORKQ) {
8756 qos = KQWQ_QOS_MANAGER;
8757 } else {
8758 qos = THREAD_QOS_UNSPECIFIED;
8759 }
8760
8761 kn->kn_qos_override = qos;
8762 kn->kn_qos_index = qos;
8763
8764 knote_activate(kq, kn, FILTER_ACTIVE);
8765 kqunlock(kq);
8766 }
8767
8768 void
8769 knote_clearstayactive(struct knote *kn)
8770 {
8771 struct kqueue *kq = knote_get_kq(kn);
8772 kqlock(kq);
8773 kn->kn_status &= ~(KN_STAYACTIVE | KN_ACTIVE);
8774 knote_dequeue(kq, kn);
8775 kqunlock(kq);
8776 }
8777
8778 static unsigned long
8779 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
8780 unsigned long buflen, unsigned long nknotes)
8781 {
8782 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8783 if (kq == knote_get_kq(kn)) {
8784 if (nknotes < buflen) {
8785 struct kevent_extinfo *info = &buf[nknotes];
8786
8787 kqlock(kq);
8788
8789 info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
8790 if (knote_has_qos(kn)) {
8791 info->kqext_kev.qos =
8792 _pthread_priority_thread_qos_fast(kn->kn_qos);
8793 } else {
8794 info->kqext_kev.qos = kn->kn_qos_override;
8795 }
8796 info->kqext_kev.filter |= 0xff00; /* sign extend filter */
8797 info->kqext_kev.xflags = 0; /* this is where sfflags lives */
8798 info->kqext_kev.data = 0; /* this is where sdata lives */
8799 info->kqext_sdata = kn->kn_sdata;
8800 info->kqext_status = kn->kn_status;
8801 info->kqext_sfflags = kn->kn_sfflags;
8802
8803 kqunlock(kq);
8804 }
8805
8806 /* we return total number of knotes, which may be more than requested */
8807 nknotes++;
8808 }
8809 }
8810
8811 return nknotes;
8812 }
8813
8814 int
8815 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8816 int32_t *nkqueues_out)
8817 {
8818 proc_t p = (proc_t)proc;
8819 struct filedesc *fdp = p->p_fd;
8820 unsigned int nkqueues = 0;
8821 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8822 size_t buflen, bufsize;
8823 kqueue_id_t *kq_ids = NULL;
8824 int err = 0;
8825
8826 assert(p != NULL);
8827
8828 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
8829 err = EINVAL;
8830 goto out;
8831 }
8832
8833 buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8834
8835 if (ubuflen != 0) {
8836 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8837 err = ERANGE;
8838 goto out;
8839 }
8840 kq_ids = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
8841 if (!kq_ids) {
8842 err = ENOMEM;
8843 goto out;
8844 }
8845 }
8846
8847 kqhash_lock(fdp);
8848
8849 if (fdp->fd_kqhashmask > 0) {
8850 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
8851 struct kqworkloop *kqwl;
8852
8853 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8854 /* report the number of kqueues, even if they don't all fit */
8855 if (nkqueues < buflen) {
8856 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8857 }
8858 nkqueues++;
8859 }
8860 }
8861 }
8862
8863 kqhash_unlock(fdp);
8864
8865 if (kq_ids) {
8866 size_t copysize;
8867 if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
8868 err = ERANGE;
8869 goto out;
8870 }
8871
8872 assert(ubufsize >= copysize);
8873 err = copyout(kq_ids, ubuf, copysize);
8874 }
8875
8876 out:
8877 if (kq_ids) {
8878 kheap_free(KHEAP_TEMP, kq_ids, bufsize);
8879 }
8880
8881 if (!err) {
8882 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
8883 }
8884 return err;
8885 }
8886
8887 int
8888 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8889 uint32_t ubufsize, int32_t *size_out)
8890 {
8891 proc_t p = (proc_t)proc;
8892 struct kqworkloop *kqwl;
8893 int err = 0;
8894 struct kqueue_dyninfo kqdi = { };
8895
8896 assert(p != NULL);
8897
8898 if (ubufsize < sizeof(struct kqueue_info)) {
8899 return ENOBUFS;
8900 }
8901
8902 kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
8903 if (!kqwl) {
8904 return ESRCH;
8905 }
8906
8907 /*
8908 * backward compatibility: allow the argument to this call to only be
8909 * a struct kqueue_info
8910 */
8911 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
8912 ubufsize = sizeof(struct kqueue_dyninfo);
8913 err = fill_kqueue_dyninfo(kqwl, &kqdi);
8914 } else {
8915 ubufsize = sizeof(struct kqueue_info);
8916 err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
8917 }
8918 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
8919 *size_out = ubufsize;
8920 }
8921 kqworkloop_release(kqwl);
8922 return err;
8923 }
8924
8925 int
8926 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8927 uint32_t ubufsize, int32_t *nknotes_out)
8928 {
8929 proc_t p = (proc_t)proc;
8930 struct kqworkloop *kqwl;
8931 int err;
8932
8933 kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
8934 if (!kqwl) {
8935 return ESRCH;
8936 }
8937
8938 err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
8939 kqworkloop_release(kqwl);
8940 return err;
8941 }
8942
8943 int
8944 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
8945 uint32_t bufsize, int32_t *retval)
8946 {
8947 struct knote *kn;
8948 int i;
8949 int err = 0;
8950 struct filedesc *fdp = p->p_fd;
8951 unsigned long nknotes = 0;
8952 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
8953 struct kevent_extinfo *kqext = NULL;
8954
8955 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
8956 buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
8957
8958 kqext = kheap_alloc(KHEAP_TEMP,
8959 buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
8960 if (kqext == NULL) {
8961 err = ENOMEM;
8962 goto out;
8963 }
8964
8965 proc_fdlock(p);
8966 for (i = 0; i < fdp->fd_knlistsize; i++) {
8967 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
8968 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8969 }
8970 proc_fdunlock(p);
8971
8972 if (fdp->fd_knhashmask != 0) {
8973 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
8974 knhash_lock(fdp);
8975 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
8976 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8977 knhash_unlock(fdp);
8978 }
8979 }
8980
8981 assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
8982 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
8983
8984 out:
8985 if (kqext) {
8986 kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo));
8987 kqext = NULL;
8988 }
8989
8990 if (!err) {
8991 *retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
8992 }
8993 return err;
8994 }
8995
8996 static unsigned int
8997 klist_copy_udata(struct klist *list, uint64_t *buf,
8998 unsigned int buflen, unsigned int nknotes)
8999 {
9000 struct knote *kn;
9001 SLIST_FOREACH(kn, list, kn_link) {
9002 if (nknotes < buflen) {
9003 /*
9004 * kevent_register will always set kn_udata atomically
9005 * so that we don't have to take any kqlock here.
9006 */
9007 buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9008 }
9009 /* we return total number of knotes, which may be more than requested */
9010 nknotes++;
9011 }
9012
9013 return nknotes;
9014 }
9015
9016 int
9017 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
9018 {
9019 proc_t p = (proc_t)proc;
9020 struct filedesc *fdp = p->p_fd;
9021 unsigned int nuptrs = 0;
9022 unsigned int buflen = bufsize / sizeof(uint64_t);
9023 struct kqworkloop *kqwl;
9024
9025 if (buflen > 0) {
9026 assert(buf != NULL);
9027 }
9028
9029 proc_fdlock(p);
9030 for (int i = 0; i < fdp->fd_knlistsize; i++) {
9031 nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
9032 }
9033 proc_fdunlock(p);
9034
9035 knhash_lock(fdp);
9036 if (fdp->fd_knhashmask != 0) {
9037 for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
9038 nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9039 }
9040 }
9041 knhash_unlock(fdp);
9042
9043 kqhash_lock(fdp);
9044 if (fdp->fd_kqhashmask != 0) {
9045 for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
9046 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9047 if (nuptrs < buflen) {
9048 buf[nuptrs] = kqwl->kqwl_dynamicid;
9049 }
9050 nuptrs++;
9051 }
9052 }
9053 }
9054 kqhash_unlock(fdp);
9055
9056 return (int)nuptrs;
9057 }
9058
9059 static void
9060 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9061 {
9062 uint64_t ast_addr;
9063 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9064 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9065 uint32_t ast_flags32 = 0;
9066 uint64_t ast_flags64 = 0;
9067 struct uthread *ut = get_bsdthread_info(thread);
9068
9069 if (ut->uu_kqr_bound != NULL) {
9070 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9071 }
9072
9073 if (ast_flags64 == 0) {
9074 return;
9075 }
9076
9077 if (!(p->p_flag & P_LP64)) {
9078 ast_flags32 = (uint32_t)ast_flags64;
9079 assert(ast_flags64 < 0x100000000ull);
9080 }
9081
9082 ast_addr = thread_rettokern_addr(thread);
9083 if (ast_addr == 0) {
9084 return;
9085 }
9086
9087 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9088 (user_addr_t)ast_addr,
9089 user_addr_size) != 0) {
9090 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9091 "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
9092 }
9093 }
9094
9095 void
9096 kevent_ast(thread_t thread, uint16_t bits)
9097 {
9098 proc_t p = current_proc();
9099
9100 if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9101 workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9102 }
9103 if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9104 kevent_set_return_to_kernel_user_tsd(p, thread);
9105 }
9106 }
9107
9108 #if DEVELOPMENT || DEBUG
9109
9110 #define KEVENT_SYSCTL_BOUND_ID 1
9111
9112 static int
9113 kevent_sysctl SYSCTL_HANDLER_ARGS
9114 {
9115 #pragma unused(oidp, arg2)
9116 uintptr_t type = (uintptr_t)arg1;
9117 uint64_t bound_id = 0;
9118
9119 if (type != KEVENT_SYSCTL_BOUND_ID) {
9120 return EINVAL;
9121 }
9122
9123 if (req->newptr) {
9124 return EINVAL;
9125 }
9126
9127 struct uthread *ut = get_bsdthread_info(current_thread());
9128 if (!ut) {
9129 return EFAULT;
9130 }
9131
9132 workq_threadreq_t kqr = ut->uu_kqr_bound;
9133 if (kqr) {
9134 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9135 bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9136 } else {
9137 bound_id = -1;
9138 }
9139 }
9140
9141 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9142 }
9143
9144 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9145 "kevent information");
9146
9147 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9148 CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9149 (void *)KEVENT_SYSCTL_BOUND_ID,
9150 sizeof(kqueue_id_t), kevent_sysctl, "Q",
9151 "get the ID of the bound kqueue");
9152
9153 #endif /* DEVELOPMENT || DEBUG */