1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <stdatomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/uio.h>
80 #include <sys/sysproto.h>
81 #include <sys/user.h>
82 #include <sys/vnode_internal.h>
83 #include <string.h>
84 #include <sys/proc_info.h>
85 #include <sys/codesign.h>
86 #include <sys/pthread_shims.h>
87 #include <sys/kdebug.h>
88 #include <sys/reason.h>
89 #include <os/reason_private.h>
90
91 #include <kern/locks.h>
92 #include <kern/clock.h>
93 #include <kern/cpu_data.h>
94 #include <kern/policy_internal.h>
95 #include <kern/thread_call.h>
96 #include <kern/sched_prim.h>
97 #include <kern/waitq.h>
98 #include <kern/zalloc.h>
99 #include <kern/kalloc.h>
100 #include <kern/assert.h>
101 #include <kern/ast.h>
102 #include <kern/thread.h>
103 #include <kern/kcdata.h>
104
105 #include <libkern/libkern.h>
106 #include <libkern/OSAtomic.h>
107
108 #include "net/net_str_id.h"
109
110 #include <mach/task.h>
111 #include <libkern/section_keywords.h>
112
113 #if CONFIG_MEMORYSTATUS
114 #include <sys/kern_memorystatus.h>
115 #endif
116
117 extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h */
118 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
119
120 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
121
122 /*
123 * JMM - this typedef needs to be unified with pthread_priority_t
124 * and mach_msg_priority_t. It also needs to be the same type
125 * everywhere.
126 */
127 typedef int32_t qos_t;
128
129 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
130
131 #define KQ_EVENT NO_EVENT64
132
133 #define KNUSE_NONE 0x0
134 #define KNUSE_STEAL_DROP 0x1
135 #define KNUSE_BOOST 0x2
136 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags);
137 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
138 static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags);
139 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags);
140
141 static int kqueue_read(struct fileproc *fp, struct uio *uio,
142 int flags, vfs_context_t ctx);
143 static int kqueue_write(struct fileproc *fp, struct uio *uio,
144 int flags, vfs_context_t ctx);
145 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
146 vfs_context_t ctx);
147 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
148 vfs_context_t ctx);
149 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
150 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
151 struct kevent_internal_s *kev, vfs_context_t ctx);
152 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
153
154 static const struct fileops kqueueops = {
155 .fo_type = DTYPE_KQUEUE,
156 .fo_read = kqueue_read,
157 .fo_write = kqueue_write,
158 .fo_ioctl = kqueue_ioctl,
159 .fo_select = kqueue_select,
160 .fo_close = kqueue_close,
161 .fo_kqfilter = kqueue_kqfilter,
162 .fo_drain = kqueue_drain,
163 };
164
165 static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq);
166 static int kevent_internal(struct proc *p,
167 kqueue_id_t id, kqueue_id_t *id_out,
168 user_addr_t changelist, int nchanges,
169 user_addr_t eventlist, int nevents,
170 user_addr_t data_out, uint64_t data_available,
171 unsigned int flags, user_addr_t utimeout,
172 kqueue_continue_t continuation,
173 int32_t *retval);
174 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
175 struct proc *p, unsigned int flags);
176 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
177 struct proc *p, unsigned int flags);
178 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
179
180 static void kqueue_interrupt(struct kqueue *kq);
181 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
182 void *data);
183 static void kevent_continue(struct kqueue *kq, void *data, int error);
184 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
185 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
186 struct filt_process_s *process_data, int *countp, struct proc *p);
187 static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index);
188 static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index);
189 static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
190
191 static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index);
192
193 static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index);
194 static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index);
195 static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index);
196 static void kqworkq_bind_thread_impl(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
197 static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
198 static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
199
200 enum {
201 KQWL_UO_NONE = 0,
202 KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI = 0x1,
203 KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI = 0x2,
204 KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS = 0x4,
205 KQWL_UO_UPDATE_OVERRIDE_LAZY = 0x8
206 };
207
208 static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t qos_index, kq_index_t override_index, uint32_t flags);
209 static void kqworkloop_bind_thread_impl(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
210 static void kqworkloop_unbind_thread(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
211 static inline kq_index_t kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *);
212 static void kqworkloop_update_suppress_sync_count(struct kqrequest *kqr, uint32_t flags);
213 enum {
214 KQWL_UTQ_NONE,
215 /*
216 * The wakeup qos is the qos of QUEUED knotes.
217 *
218 * This QoS is accounted for with the events override in the
219 * kqr_override_index field. It is raised each time a new knote is queued at
220 * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
221 * knote buckets and is recomputed after each event delivery.
222 */
223 KQWL_UTQ_UPDATE_WAKEUP_QOS,
224 KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
225 KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
226 /*
227 * The wakeup override is for suppressed knotes that have fired again at
228 * a higher QoS than the one for which they are suppressed already.
229 * This override is cleared when the knote suppressed list becomes empty.
230 */
231 KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
232 KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
233 /*
234 * The async QoS is the maximum QoS of an event enqueued on this workloop in
235 * userland. It is copied from the only EVFILT_WORKLOOP knote with
236 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
237 * such knote, this QoS is 0.
238 */
239 KQWL_UTQ_SET_ASYNC_QOS,
240 /*
241 * The sync waiters QoS is the maximum QoS of any thread blocked on an
242 * EVFILT_WORKLOOP knote marked with the NOTE_WL_SYNC_WAIT bit.
243 * If there is no such knote, this QoS is 0.
244 */
245 KQWL_UTQ_SET_SYNC_WAITERS_QOS,
246 KQWL_UTQ_REDRIVE_EVENTS,
247 };
248 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
249 static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
250
251 static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
252 struct filt_process_s *process_data, struct proc *p);
253 #if 0
254 static void knote_put(struct knote *kn);
255 #endif
256
257 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
258 struct kevent_internal_s *kev, struct proc *p, int *knoteuse_flags);
259 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);
260 static void kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, kn_status_t *kn_status, uint16_t *kq_state);
261
262 static void knote_drop(struct knote *kn, struct proc *p);
263 static struct knote *knote_alloc(void);
264 static void knote_free(struct knote *kn);
265
266 static void knote_activate(struct knote *kn);
267 static void knote_deactivate(struct knote *kn);
268
269 static void knote_enable(struct knote *kn);
270 static void knote_disable(struct knote *kn);
271
272 static int knote_enqueue(struct knote *kn);
273 static void knote_dequeue(struct knote *kn);
274
275 static void knote_suppress(struct knote *kn);
276 static void knote_unsuppress(struct knote *kn);
277 static void knote_wakeup(struct knote *kn);
278
279 static kq_index_t knote_get_queue_index(struct knote *kn);
280 static struct kqtailq *knote_get_queue(struct knote *kn);
281 static kq_index_t knote_get_req_index(struct knote *kn);
282 static kq_index_t knote_get_qos_index(struct knote *kn);
283 static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index);
284 static kq_index_t knote_get_qos_override_index(struct knote *kn);
285 static kq_index_t knote_get_sync_qos_override_index(struct knote *kn);
286 static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index, boolean_t override_is_sync);
287 static void knote_set_qos_overcommit(struct knote *kn);
288
289 static int filt_fileattach(struct knote *kn, struct kevent_internal_s *kev);
290 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
291 .f_isfd = 1,
292 .f_attach = filt_fileattach,
293 };
294
295 static void filt_kqdetach(struct knote *kn);
296 static int filt_kqueue(struct knote *kn, long hint);
297 static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev);
298 static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
299 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
300 .f_isfd = 1,
301 .f_detach = filt_kqdetach,
302 .f_event = filt_kqueue,
303 .f_touch = filt_kqtouch,
304 .f_process = filt_kqprocess,
305 };
306
307 /* placeholder for not-yet-implemented filters */
308 static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
309 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
310 .f_attach = filt_badattach,
311 };
312
313 static int filt_procattach(struct knote *kn, struct kevent_internal_s *kev);
314 static void filt_procdetach(struct knote *kn);
315 static int filt_proc(struct knote *kn, long hint);
316 static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev);
317 static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
318 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
319 .f_attach = filt_procattach,
320 .f_detach = filt_procdetach,
321 .f_event = filt_proc,
322 .f_touch = filt_proctouch,
323 .f_process = filt_procprocess,
324 };
325
326 #if CONFIG_MEMORYSTATUS
327 extern const struct filterops memorystatus_filtops;
328 #endif /* CONFIG_MEMORYSTATUS */
329
330 extern const struct filterops fs_filtops;
331
332 extern const struct filterops sig_filtops;
333
334 static zone_t knote_zone;
335 static zone_t kqfile_zone;
336 static zone_t kqworkq_zone;
337 static zone_t kqworkloop_zone;
338
339 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
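/*
 * Worked example of the hash above (values chosen arbitrarily):
 *   KN_HASH(0x1234, 0xff) == (0x1234 ^ (0x1234 >> 8)) & 0xff
 *                         == (0x1234 ^ 0x12) & 0xff
 *                         == 0x26
 */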
340
341 /* Mach portset filter */
342 extern const struct filterops machport_filtops;
343
344 /* User filter */
345 static int filt_userattach(struct knote *kn, struct kevent_internal_s *kev);
346 static void filt_userdetach(struct knote *kn);
347 static int filt_user(struct knote *kn, long hint);
348 static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev);
349 static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
350 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
351 .f_attach = filt_userattach,
352 .f_detach = filt_userdetach,
353 .f_event = filt_user,
354 .f_touch = filt_usertouch,
355 .f_process = filt_userprocess,
356 };
357
358 static lck_spin_t _filt_userlock;
359 static void filt_userlock(void);
360 static void filt_userunlock(void);
361
362 /* Workloop filter */
363 static bool filt_wlneeds_boost(struct kevent_internal_s *kev);
364 static int filt_wlattach(struct knote *kn, struct kevent_internal_s *kev);
365 static int filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev);
366 static void filt_wldetach(struct knote *kn);
367 static int filt_wlevent(struct knote *kn, long hint);
368 static int filt_wltouch(struct knote *kn, struct kevent_internal_s *kev);
369 static int filt_wldrop_and_unlock(struct knote *kn, struct kevent_internal_s *kev);
370 static int filt_wlprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
371 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
372 .f_needs_boost = filt_wlneeds_boost,
373 .f_attach = filt_wlattach,
374 .f_post_attach = filt_wlpost_attach,
375 .f_detach = filt_wldetach,
376 .f_event = filt_wlevent,
377 .f_touch = filt_wltouch,
378 .f_drop_and_unlock = filt_wldrop_and_unlock,
379 .f_process = filt_wlprocess,
380 };
381
382 extern const struct filterops pipe_rfiltops;
383 extern const struct filterops pipe_wfiltops;
384 extern const struct filterops ptsd_kqops;
385 extern const struct filterops soread_filtops;
386 extern const struct filterops sowrite_filtops;
387 extern const struct filterops sock_filtops;
388 extern const struct filterops soexcept_filtops;
389 extern const struct filterops spec_filtops;
390 extern const struct filterops bpfread_filtops;
391 extern const struct filterops necp_fd_rfiltops;
392 extern const struct filterops fsevent_filtops;
393 extern const struct filterops vnode_filtops;
394 extern const struct filterops tty_filtops;
395
396 const static struct filterops timer_filtops;
397
398 /*
399 *
400 * Rules for adding new filters to the system:
401 * Public filters:
402 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
403 * in the exported section of the header
404 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
405 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
406 * of the Public Filters section in the array.
407 * Private filters:
408 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
409 * in the XNU_KERNEL_PRIVATE section of the header
410 * - Update the EVFILTID_MAX value to reflect the new addition
411 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
412 * the Private filters section of the array.
413 */
414 SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
415 /* Public Filters */
416 [~EVFILT_READ] = &file_filtops,
417 [~EVFILT_WRITE] = &file_filtops,
418 [~EVFILT_AIO] = &bad_filtops,
419 [~EVFILT_VNODE] = &file_filtops,
420 [~EVFILT_PROC] = &proc_filtops,
421 [~EVFILT_SIGNAL] = &sig_filtops,
422 [~EVFILT_TIMER] = &timer_filtops,
423 [~EVFILT_MACHPORT] = &machport_filtops,
424 [~EVFILT_FS] = &fs_filtops,
425 [~EVFILT_USER] = &user_filtops,
426 &bad_filtops,
427 &bad_filtops,
428 [~EVFILT_SOCK] = &file_filtops,
429 #if CONFIG_MEMORYSTATUS
430 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
431 #else
432 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
433 #endif
434 [~EVFILT_EXCEPT] = &file_filtops,
435
436 [~EVFILT_WORKLOOP] = &workloop_filtops,
437
438 /* Private filters */
439 [EVFILTID_KQREAD] = &kqread_filtops,
440 [EVFILTID_PIPE_R] = &pipe_rfiltops,
441 [EVFILTID_PIPE_W] = &pipe_wfiltops,
442 [EVFILTID_PTSD] = &ptsd_kqops,
443 [EVFILTID_SOREAD] = &soread_filtops,
444 [EVFILTID_SOWRITE] = &sowrite_filtops,
445 [EVFILTID_SCK] = &sock_filtops,
446 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
447 [EVFILTID_SPEC] = &spec_filtops,
448 [EVFILTID_BPFREAD] = &bpfread_filtops,
449 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
450 [EVFILTID_FSEVENT] = &fsevent_filtops,
451 [EVFILTID_VN] = &vnode_filtops,
452 [EVFILTID_TTY] = &tty_filtops
453 };
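/*
 * Illustrative sketch of the "adding a new public filter" rules above.
 * EVFILT_EXAMPLE and example_filtops are hypothetical names invented for
 * this sketch; they do not exist in the tree:
 *
 *	// bsd/sys/event.h, exported section: take the next unused negative value
 *	//   #define EVFILT_EXAMPLE   (-<next>)
 *	//   ...and bump EVFILT_SYSCOUNT to match.
 *
 *	// this file, end of the Public Filters section of sysfilt_ops:
 *	//   [~EVFILT_EXAMPLE] = &example_filtops,
 */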
454
455 /* waitq prepost callback */
456 void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
457
458 #ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
459 #define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
460 #endif
461 #ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
462 #define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */
463 #endif
464 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
465 #define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */
466 #endif
467 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32
468 #define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8
469 #endif
470
471 static inline __kdebug_only
472 uintptr_t
473 kqr_thread_id(struct kqrequest *kqr)
474 {
475 return (uintptr_t)thread_tid(kqr->kqr_thread);
476 }
477
478 static inline
479 boolean_t is_workqueue_thread(thread_t thread)
480 {
481 return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
482 }
483
484 static inline
485 void knote_canonicalize_kevent_qos(struct knote *kn)
486 {
487 struct kqueue *kq = knote_get_kq(kn);
488 unsigned long canonical;
489
490 if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
491 return;
492
493 /* preserve manager and overcommit flags in this case */
494 canonical = pthread_priority_canonicalize(kn->kn_qos, FALSE);
495 kn->kn_qos = (qos_t)canonical;
496 }
497
498 static inline
499 kq_index_t qos_index_from_qos(struct knote *kn, qos_t qos, boolean_t propagation)
500 {
501 struct kqueue *kq = knote_get_kq(kn);
502 kq_index_t qos_index;
503 unsigned long flags = 0;
504
505 if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
506 return QOS_INDEX_KQFILE;
507
508 qos_index = (kq_index_t)thread_qos_from_pthread_priority(
509 (unsigned long)qos, &flags);
510
511 if (kq->kq_state & KQ_WORKQ) {
512 /* workq kqueues support requesting a manager thread (non-propagation) */
513 if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG))
514 return KQWQ_QOS_MANAGER;
515 }
516
517 return qos_index;
518 }
519
520 static inline
521 qos_t qos_from_qos_index(kq_index_t qos_index)
522 {
523 /* should only happen for KQ_WORKQ */
524 if (qos_index == KQWQ_QOS_MANAGER)
525 return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
526
527 if (qos_index == 0)
528 return THREAD_QOS_UNSPECIFIED;
529
530 	/* Should be supported by the pthread kext */
531 return (1 << (qos_index - 1 +
532 _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32));
533 }
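
/*
 * Worked example of the encoding above: qos_index 3 maps to
 * (1 << (3 - 1 + _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32)) == (1 << 10) == 0x400,
 * a bit inside _PTHREAD_PRIORITY_QOS_CLASS_MASK (0x003fff00).  The manager
 * index is special-cased to the event manager flag instead.
 */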
534
535 /* kqr lock must be held */
536 static inline
537 unsigned long pthread_priority_for_kqrequest(
538 struct kqrequest *kqr,
539 kq_index_t qos_index)
540 {
541 unsigned long priority = qos_from_qos_index(qos_index);
542 if (kqr->kqr_state & KQR_THOVERCOMMIT) {
543 priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
544 }
545 return priority;
546 }
547
548 static inline
549 kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
550 {
551 #pragma unused(thread)
552 kq_index_t qos_index;
553
554 if (flags & KEVENT_FLAG_WORKQ_MANAGER)
555 return KQWQ_QOS_MANAGER;
556
557 qos_index = (kq_index_t)qos_class;
558 assert(qos_index > 0 && qos_index < KQWQ_QOS_MANAGER);
559
560 return qos_index;
561 }
562
563 /*
564 * kqueue/note lock implementations
565 *
566 * The kqueue lock guards the kq state, the state of its queues,
567 * and the kqueue-aware status and use counts of individual knotes.
568 *
569 * The kqueue workq lock is used to protect state guarding the
570 * interaction of the kqueue with the workq. This state cannot
571 * be guarded by the kq lock - as it needs to be taken when we
572 * already have the waitq set lock held (during the waitq hook
573 * callback). It might be better to use the waitq lock itself
574  * for this, but the IRQ requirements make that difficult.
575 *
576 * Knote flags, filter flags, and associated data are protected
577 * by the underlying object lock - and are only ever looked at
578 * by calling the filter to get a [consistent] snapshot of that
579 * data.
580 */
581 lck_grp_attr_t * kq_lck_grp_attr;
582 lck_grp_t * kq_lck_grp;
583 lck_attr_t * kq_lck_attr;
584
585 static inline void
586 kqlock(struct kqueue *kq)
587 {
588 lck_spin_lock(&kq->kq_lock);
589 }
590
591 static inline void
592 kqlock_held(__assert_only struct kqueue *kq)
593 {
594 LCK_SPIN_ASSERT(&kq->kq_lock, LCK_ASSERT_OWNED);
595 }
596
597 static inline void
598 kqunlock(struct kqueue *kq)
599 {
600 lck_spin_unlock(&kq->kq_lock);
601 }
602
603 static inline void
604 knhash_lock(proc_t p)
605 {
606 lck_mtx_lock(&p->p_fd->fd_knhashlock);
607 }
608
609 static inline void
610 knhash_unlock(proc_t p)
611 {
612 lck_mtx_unlock(&p->p_fd->fd_knhashlock);
613 }
614
615
616 /*
617  * Convert a kq lock to a knote use reference.
618 *
619 * If the knote is being dropped, or has
620 * vanished, we can't get a use reference.
621 * Just return with it still locked.
622 *
623 * - kq locked at entry
624 * - unlock on exit if we get the use reference
625 */
626 static int
627 kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags)
628 {
629 if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
630 return (0);
631
632 assert(kn->kn_status & KN_ATTACHED);
633 kn->kn_inuse++;
634 if (flags & KNUSE_BOOST) {
635 set_thread_rwlock_boost();
636 }
637 kqunlock(kq);
638 return (1);
639 }
640
641 /*
642 * - kq locked at entry
643 * - kq unlocked at exit
644 */
645 __disable_tail_calls
646 static wait_result_t
647 knoteusewait(struct kqueue *kq, struct knote *kn)
648 {
649 kn->kn_status |= KN_USEWAIT;
650 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
651 CAST_EVENT64_T(&kn->kn_status),
652 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
653 kqunlock(kq);
654 return thread_block(THREAD_CONTINUE_NULL);
655 }
656
657 static bool
658 knoteuse_needs_boost(struct knote *kn, struct kevent_internal_s *kev)
659 {
660 if (knote_fops(kn)->f_needs_boost) {
661 return knote_fops(kn)->f_needs_boost(kev);
662 }
663 return false;
664 }
665
666 /*
667 * Convert from a knote use reference back to kq lock.
668 *
669 * Drop a use reference and wake any waiters if
670 * this is the last one.
671 *
672 * If someone is trying to drop the knote, but the
673 * caller has events they must deliver, take
674 * responsibility for the drop later - and wake the
675 * other attempted dropper in a manner that informs
676 * him of the transfer of responsibility.
677 *
678 * The exit return indicates if the knote is still alive
679 * (or if not, the other dropper has been given the green
680 * light to drop it).
681 *
682 * The kqueue lock is re-taken unconditionally.
683 */
684 static int
685 knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags)
686 {
687 int dropped = 0;
688 int steal_drop = (flags & KNUSE_STEAL_DROP);
689
690 kqlock(kq);
691 if (flags & KNUSE_BOOST) {
692 clear_thread_rwlock_boost();
693 }
694
695 if (--kn->kn_inuse == 0) {
696
697 if ((kn->kn_status & KN_ATTACHING) != 0) {
698 kn->kn_status &= ~KN_ATTACHING;
699 }
700
701 if ((kn->kn_status & KN_USEWAIT) != 0) {
702 wait_result_t result;
703
704 /* If we need to, try and steal the drop */
705 if (kn->kn_status & KN_DROPPING) {
706 if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) {
707 kn->kn_status |= KN_STOLENDROP;
708 } else {
709 dropped = 1;
710 }
711 }
712
713 /* wakeup indicating if ANY USE stole the drop */
714 result = (kn->kn_status & KN_STOLENDROP) ?
715 THREAD_RESTART : THREAD_AWAKENED;
716
717 kn->kn_status &= ~KN_USEWAIT;
718 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
719 CAST_EVENT64_T(&kn->kn_status),
720 result,
721 WAITQ_ALL_PRIORITIES);
722 } else {
723 /* should have seen use-wait if dropping with use refs */
724 assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0);
725 }
726
727 } else if (kn->kn_status & KN_DROPPING) {
728 /* not the last ref but want to steal a drop if present */
729 if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) {
730 kn->kn_status |= KN_STOLENDROP;
731
732 /* but we now have to wait to be the last ref */
733 knoteusewait(kq, kn);
734 kqlock(kq);
735 } else {
736 dropped = 1;
737 }
738 }
739
740 return (!dropped);
741 }
742
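/*
 * Typical shape of a caller of the two conversions above (a sketch, not a
 * verbatim call site from this file): trade the kq spinlock for a use
 * reference so the filter can be called without holding the lock, then
 * trade back and re-check that the knote survived.
 *
 *	kqlock(kq);
 *	if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
 *		// kq is now unlocked; safe to call into the filter
 *		int result = knote_fops(kn)->f_event(kn, hint);
 *		// re-takes the kq lock; returns 0 if the knote was dropped
 *		if (!knoteuse2kqlock(kq, kn, KNUSE_NONE)) {
 *			kqunlock(kq);
 *			return;            // knote is gone, don't touch it
 *		}
 *		if (result)
 *			knote_activate(kn);
 *	}
 *	kqunlock(kq);
 */
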
743 /*
744 * Convert a kq lock to a knote use reference
745 * (for the purpose of detaching AND vanishing it).
746 *
747 * If the knote is being dropped, we can't get
748 * a detach reference, so wait for the knote to
749 * finish dropping before returning.
750 *
751 * If the knote is being used for other purposes,
752 * we cannot detach it until those uses are done
753 * as well. Again, just wait for them to finish
754 * (caller will start over at lookup).
755 *
756 * - kq locked at entry
757 * - unlocked on exit
758 */
759 static int
760 kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags)
761 {
762 if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) {
763 /* have to wait for dropper or current uses to go away */
764 knoteusewait(kq, kn);
765 return (0);
766 }
767 assert((kn->kn_status & KN_VANISHED) == 0);
768 assert(kn->kn_status & KN_ATTACHED);
769 kn->kn_status &= ~KN_ATTACHED;
770 kn->kn_status |= KN_VANISHED;
771 if (flags & KNUSE_BOOST) {
772 clear_thread_rwlock_boost();
773 }
774 kn->kn_inuse++;
775 kqunlock(kq);
776 return (1);
777 }
778
779 /*
780 * Convert a kq lock to a knote drop reference.
781 *
782 * If the knote is in use, wait for the use count
783 * to subside. We first mark our intention to drop
784 * it - keeping other users from "piling on."
785 * If we are too late, we have to wait for the
786 * other drop to complete.
787 *
788 * - kq locked at entry
789 * - always unlocked on exit.
790 * - caller can't hold any locks that would prevent
791 * the other dropper from completing.
792 */
793 static int
794 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
795 {
796 int oktodrop;
797 wait_result_t result;
798
799 oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
800 /* if another thread is attaching, they will become the dropping thread */
801 kn->kn_status |= KN_DROPPING;
802 knote_unsuppress(kn);
803 knote_dequeue(kn);
804 if (oktodrop) {
805 if (kn->kn_inuse == 0) {
806 kqunlock(kq);
807 return (oktodrop);
808 }
809 }
810 result = knoteusewait(kq, kn);
811 /* THREAD_RESTART == another thread stole the knote drop */
812 return (result == THREAD_AWAKENED);
813 }
814
815 #if 0
816 /*
817 * Release a knote use count reference.
818 */
819 static void
820 knote_put(struct knote *kn)
821 {
822 struct kqueue *kq = knote_get_kq(kn);
823
824 kqlock(kq);
825 if (--kn->kn_inuse == 0) {
826 if ((kn->kn_status & KN_USEWAIT) != 0) {
827 kn->kn_status &= ~KN_USEWAIT;
828 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
829 CAST_EVENT64_T(&kn->kn_status),
830 THREAD_AWAKENED,
831 WAITQ_ALL_PRIORITIES);
832 }
833 }
834 kqunlock(kq);
835 }
836 #endif
837
838 static int
839 filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
840 {
841 return (fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current()));
842 }
843
844 #define f_flag f_fglob->fg_flag
845 #define f_msgcount f_fglob->fg_msgcount
846 #define f_cred f_fglob->fg_cred
847 #define f_ops f_fglob->fg_ops
848 #define f_offset f_fglob->fg_offset
849 #define f_data f_fglob->fg_data
850
851 static void
852 filt_kqdetach(struct knote *kn)
853 {
854 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
855 struct kqueue *kq = &kqf->kqf_kqueue;
856
857 kqlock(kq);
858 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
859 kqunlock(kq);
860 }
861
862 /*ARGSUSED*/
863 static int
864 filt_kqueue(struct knote *kn, __unused long hint)
865 {
866 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
867 int count;
868
869 count = kq->kq_count;
870 return (count > 0);
871 }
872
873 static int
874 filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
875 {
876 #pragma unused(kev)
877 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
878 int res;
879
880 kqlock(kq);
881 kn->kn_data = kq->kq_count;
882 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
883 kn->kn_udata = kev->udata;
884 res = (kn->kn_data > 0);
885
886 kqunlock(kq);
887
888 return res;
889 }
890
891 static int
892 filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
893 {
894 #pragma unused(data)
895 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
896 int res;
897
898 kqlock(kq);
899 kn->kn_data = kq->kq_count;
900 res = (kn->kn_data > 0);
901 if (res) {
902 *kev = kn->kn_kevent;
903 if (kn->kn_flags & EV_CLEAR)
904 kn->kn_data = 0;
905 }
906 kqunlock(kq);
907
908 return res;
909 }
910
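/*
 * Illustrative user-space angle on the three routines above (a sketch;
 * error handling omitted): a kqueue file descriptor can itself be watched
 * with EVFILT_READ, and the data field reported back is kq_count, the
 * number of events pending on the inner queue.
 *
 *	#include <sys/event.h>
 *
 *	int inner = kqueue(), outer = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(outer, &kev, 1, NULL, 0, NULL);
 *	// once events are pending on 'inner', 'outer' becomes readable and
 *	// out.data tells how many 'inner' has ready
 *	kevent(outer, NULL, 0, &out, 1, NULL);
 */
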
911 #pragma mark EVFILT_PROC
912
913 static int
914 filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
915 {
916 struct proc *p;
917
918 assert(PID_MAX < NOTE_PDATAMASK);
919
920 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
921 kn->kn_flags = EV_ERROR;
922 kn->kn_data = ENOTSUP;
923 return 0;
924 }
925
926 p = proc_find(kn->kn_id);
927 if (p == NULL) {
928 kn->kn_flags = EV_ERROR;
929 kn->kn_data = ESRCH;
930 return 0;
931 }
932
933 const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
934
935 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
936 do {
937 pid_t selfpid = proc_selfpid();
938
939 if (p->p_ppid == selfpid)
940 break; /* parent => ok */
941
942 if ((p->p_lflag & P_LTRACED) != 0 &&
943 (p->p_oppid == selfpid))
944 break; /* parent-in-waiting => ok */
945
946 proc_rele(p);
947 kn->kn_flags = EV_ERROR;
948 kn->kn_data = EACCES;
949 return 0;
950 } while (0);
951
952 proc_klist_lock();
953
954 kn->kn_ptr.p_proc = p; /* store the proc handle */
955
956 KNOTE_ATTACH(&p->p_klist, kn);
957
958 proc_klist_unlock();
959
960 proc_rele(p);
961
962 /*
963 * only captures edge-triggered events after this point
964 * so it can't already be fired.
965 */
966 return (0);
967 }
968
969
970 /*
971 * The knote may be attached to a different process, which may exit,
972 * leaving nothing for the knote to be attached to. In that case,
973 * the pointer to the process will have already been nulled out.
974 */
975 static void
976 filt_procdetach(struct knote *kn)
977 {
978 struct proc *p;
979
980 proc_klist_lock();
981
982 p = kn->kn_ptr.p_proc;
983 if (p != PROC_NULL) {
984 kn->kn_ptr.p_proc = PROC_NULL;
985 KNOTE_DETACH(&p->p_klist, kn);
986 }
987
988 proc_klist_unlock();
989 }
990
991 static int
992 filt_proc(struct knote *kn, long hint)
993 {
994 u_int event;
995
996 /* ALWAYS CALLED WITH proc_klist_lock */
997
998 /*
999 * Note: a lot of bits in hint may be obtained from the knote
1000 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1001 * bits in hint for filt_proc
1002 *
1003 * mask off extra data
1004 */
1005 event = (u_int)hint & NOTE_PCTRLMASK;
1006
1007 /*
1008 * termination lifecycle events can happen while a debugger
1009 * has reparented a process, in which case notifications
1010 * should be quashed except to the tracing parent. When
1011 * the debugger reaps the child (either via wait4(2) or
1012 * process exit), the child will be reparented to the original
1013 * parent and these knotes re-fired.
1014 */
1015 if (event & NOTE_EXIT) {
1016 if ((kn->kn_ptr.p_proc->p_oppid != 0)
1017 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
1018 /*
1019 * This knote is not for the current ptrace(2) parent, ignore.
1020 */
1021 return 0;
1022 }
1023 }
1024
1025 /*
1026 * if the user is interested in this event, record it.
1027 */
1028 if (kn->kn_sfflags & event)
1029 kn->kn_fflags |= event;
1030
1031 #pragma clang diagnostic push
1032 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1033 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1034 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1035 }
1036 #pragma clang diagnostic pop
1037
1038
1039 /*
1040 * The kernel has a wrapper in place that returns the same data
1041 * as is collected here, in kn_data. Any changes to how
1042 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1043 * should also be reflected in the proc_pidnoteexit() wrapper.
1044 */
1045 if (event == NOTE_EXIT) {
1046 kn->kn_data = 0;
1047 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1048 kn->kn_fflags |= NOTE_EXITSTATUS;
1049 kn->kn_data |= (hint & NOTE_PDATAMASK);
1050 }
1051 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1052 kn->kn_fflags |= NOTE_EXIT_DETAIL;
1053 if ((kn->kn_ptr.p_proc->p_lflag &
1054 P_LTERM_DECRYPTFAIL) != 0) {
1055 kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
1056 }
1057 if ((kn->kn_ptr.p_proc->p_lflag &
1058 P_LTERM_JETSAM) != 0) {
1059 kn->kn_data |= NOTE_EXIT_MEMORY;
1060 switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
1061 case P_JETSAM_VMPAGESHORTAGE:
1062 kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1063 break;
1064 case P_JETSAM_VMTHRASHING:
1065 kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
1066 break;
1067 case P_JETSAM_FCTHRASHING:
1068 kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
1069 break;
1070 case P_JETSAM_VNODE:
1071 kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
1072 break;
1073 case P_JETSAM_HIWAT:
1074 kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
1075 break;
1076 case P_JETSAM_PID:
1077 kn->kn_data |= NOTE_EXIT_MEMORY_PID;
1078 break;
1079 case P_JETSAM_IDLEEXIT:
1080 kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
1081 break;
1082 }
1083 }
1084 if ((kn->kn_ptr.p_proc->p_csflags &
1085 CS_KILLED) != 0) {
1086 kn->kn_data |= NOTE_EXIT_CSERROR;
1087 }
1088 }
1089 }
1090
1091 /* if we have any matching state, activate the knote */
1092 return (kn->kn_fflags != 0);
1093 }
1094
1095 static int
1096 filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
1097 {
1098 int res;
1099
1100 proc_klist_lock();
1101
1102 	/* accept new filter flags and mask off output events no longer interesting */
1103 kn->kn_sfflags = kev->fflags;
1104 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1105 kn->kn_udata = kev->udata;
1106
1107 /* restrict the current results to the (smaller?) set of new interest */
1108 /*
1109 * For compatibility with previous implementations, we leave kn_fflags
1110 * as they were before.
1111 */
1112 //kn->kn_fflags &= kn->kn_sfflags;
1113
1114 res = (kn->kn_fflags != 0);
1115
1116 proc_klist_unlock();
1117
1118 return res;
1119 }
1120
1121 static int
1122 filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
1123 {
1124 #pragma unused(data)
1125 int res;
1126
1127 proc_klist_lock();
1128 res = (kn->kn_fflags != 0);
1129 if (res) {
1130 *kev = kn->kn_kevent;
1131 kn->kn_flags |= EV_CLEAR; /* automatically set */
1132 kn->kn_fflags = 0;
1133 kn->kn_data = 0;
1134 }
1135 proc_klist_unlock();
1136 return res;
1137 }
1138
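/*
 * Illustrative user-space registration for this filter (a sketch; error
 * handling omitted, with pid standing for the child's process id).  Watches
 * a child process for exit and retrieves its exit status; per
 * filt_procattach above, NOTE_EXITSTATUS is only allowed on the caller's
 * own (or ptrace-reparented) children.
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, (uintptr_t)pid, EVFILT_PROC, EV_ADD | EV_ONESHOT,
 *	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);    // register
 *	kevent(kq, NULL, 0, &out, 1, NULL);    // blocks until the process exits
 *	// out.fflags contains NOTE_EXIT|NOTE_EXITSTATUS and out.data holds
 *	// the wait(2)-style status, as assembled in filt_proc above
 */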
1139
1140 #pragma mark EVFILT_TIMER
1141
1142
1143 /*
1144 * Values stored in the knote at rest (using Mach absolute time units)
1145 *
1146 * kn->kn_hook where the thread_call object is stored
1147 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1148 * kn->kn_ext[1] leeway value
1149 * kn->kn_sdata interval timer: the interval
1150 * absolute/deadline timer: 0
1151 * kn->kn_data fire count
1152 */
1153
1154 static lck_mtx_t _filt_timerlock;
1155
1156 static void filt_timerlock(void) { lck_mtx_lock(&_filt_timerlock); }
1157 static void filt_timerunlock(void) { lck_mtx_unlock(&_filt_timerlock); }
1158
1159 static inline void filt_timer_assert_locked(void)
1160 {
1161 LCK_MTX_ASSERT(&_filt_timerlock, LCK_MTX_ASSERT_OWNED);
1162 }
1163
1164 /* state flags stored in kn_hookid */
1165 #define TIMER_RUNNING 0x1
1166 #define TIMER_CANCELWAIT 0x2
1167
1168 /*
1169 * filt_timervalidate - process data from user
1170 *
1171 * Sets up the deadline, interval, and leeway from the provided user data
1172 *
1173 * Input:
1174 * kn_sdata timer deadline or interval time
1175 * kn_sfflags style of timer, unit of measurement
1176 *
1177 * Output:
1178 * kn_sdata either interval in abstime or 0 if non-repeating timer
1179 * ext[0] fire deadline in abs/cont time
1180 * (or 0 if NOTE_ABSOLUTE and deadline is in past)
1181 *
1182 * Returns:
1183 * EINVAL Invalid user data parameters
1184 *
1185 * Called with timer filter lock held.
1186 */
1187 static int
1188 filt_timervalidate(struct knote *kn)
1189 {
1190 /*
1191 	 * There are 5 knobs that need to be chosen for a timer registration:
1192 *
1193 * A) Units of time (what is the time duration of the specified number)
1194 * Absolute and interval take:
1195 * NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1196 * Defaults to milliseconds if not specified
1197 *
1198 * B) Clock epoch (what is the zero point of the specified number)
1199 * For interval, there is none
1200 * For absolute, defaults to the gettimeofday/calendar epoch
1201 * With NOTE_MACHTIME, uses mach_absolute_time()
1202 * With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1203 *
1204 * C) The knote's behavior on delivery
1205 * Interval timer causes the knote to arm for the next interval unless one-shot is set
1206 * Absolute is a forced one-shot timer which deletes on delivery
1207 * TODO: Add a way for absolute to be not forced one-shot
1208 *
1209 * D) Whether the time duration is relative to now or absolute
1210 * Interval fires at now + duration when it is set up
1211 	 * Absolute fires at now + (passed-in wall-clock time - current wall-clock time)
1212 * With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1213 *
1214 * E) Whether the timer continues to tick across sleep
1215 * By default all three do not.
1216 * For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1217 * With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1218 * expires when mach_continuous_time() is > the passed in value.
1219 */
1220
1221 filt_timer_assert_locked();
1222
1223 uint64_t multiplier;
1224
1225 boolean_t use_abstime = FALSE;
1226
1227 switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) {
1228 case NOTE_SECONDS:
1229 multiplier = NSEC_PER_SEC;
1230 break;
1231 case NOTE_USECONDS:
1232 multiplier = NSEC_PER_USEC;
1233 break;
1234 case NOTE_NSECONDS:
1235 multiplier = 1;
1236 break;
1237 case NOTE_MACHTIME:
1238 multiplier = 0;
1239 use_abstime = TRUE;
1240 break;
1241 case 0: /* milliseconds (default) */
1242 multiplier = NSEC_PER_SEC / 1000;
1243 break;
1244 default:
1245 return (EINVAL);
1246 }
1247
1248 /* transform the leeway in kn_ext[1] to same time scale */
1249 if (kn->kn_sfflags & NOTE_LEEWAY) {
1250 uint64_t leeway_abs;
1251
1252 if (use_abstime) {
1253 leeway_abs = (uint64_t)kn->kn_ext[1];
1254 } else {
1255 uint64_t leeway_ns;
1256 if (os_mul_overflow((uint64_t)kn->kn_ext[1], multiplier, &leeway_ns))
1257 return (ERANGE);
1258
1259 nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1260 }
1261
1262 kn->kn_ext[1] = leeway_abs;
1263 }
1264
1265 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1266 uint64_t deadline_abs;
1267
1268 if (use_abstime) {
1269 deadline_abs = (uint64_t)kn->kn_sdata;
1270 } else {
1271 uint64_t calendar_deadline_ns;
1272
1273 if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &calendar_deadline_ns))
1274 return (ERANGE);
1275
1276 /* calendar_deadline_ns is in nanoseconds since the epoch */
1277
1278 clock_sec_t seconds;
1279 clock_nsec_t nanoseconds;
1280
1281 /*
1282 * Note that the conversion through wall-time is only done once.
1283 *
1284 * If the relationship between MAT and gettimeofday changes,
1285 * the underlying timer does not update.
1286 *
1287 * TODO: build a wall-time denominated timer_call queue
1288 * and a flag to request DTRTing with wall-time timers
1289 */
1290 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1291
1292 uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1293
1294 /* if deadline is in the future */
1295 if (calendar_now_ns < calendar_deadline_ns) {
1296 uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1297 uint64_t interval_abs;
1298
1299 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1300
1301 /*
1302 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1303 * causes the timer to keep ticking across sleep, but
1304 * it does not change the calendar timebase.
1305 */
1306
1307 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1308 clock_continuoustime_interval_to_deadline(interval_abs,
1309 &deadline_abs);
1310 else
1311 clock_absolutetime_interval_to_deadline(interval_abs,
1312 &deadline_abs);
1313 } else {
1314 deadline_abs = 0; /* cause immediate expiration */
1315 }
1316 }
1317
1318 kn->kn_ext[0] = deadline_abs;
1319 kn->kn_sdata = 0; /* NOTE_ABSOLUTE is non-repeating */
1320 } else if (kn->kn_sdata < 0) {
1321 /*
1322 * Negative interval timers fire immediately, once.
1323 *
1324 * Ideally a negative interval would be an error, but certain clients
1325 		 * pass negative values by accident, and expect an event back.
1326 *
1327 * In the old implementation the timer would repeat with no delay
1328 * N times until mach_absolute_time() + (N * interval) underflowed,
1329 * then it would wait ~forever by accidentally arming a timer for the far future.
1330 *
1331 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1332 */
1333
1334 kn->kn_sdata = 0; /* non-repeating */
1335 kn->kn_ext[0] = 0; /* expire immediately */
1336 } else {
1337 uint64_t interval_abs = 0;
1338
1339 if (use_abstime) {
1340 interval_abs = (uint64_t)kn->kn_sdata;
1341 } else {
1342 uint64_t interval_ns;
1343 if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &interval_ns))
1344 return (ERANGE);
1345
1346 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1347 }
1348
1349 uint64_t deadline = 0;
1350
1351 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1352 clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1353 else
1354 clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1355
1356 kn->kn_sdata = interval_abs; /* default to a repeating timer */
1357 kn->kn_ext[0] = deadline;
1358 }
1359
1360 return (0);
1361 }
1362
1363
1364
1365
1366 /*
1367 * filt_timerexpire - the timer callout routine
1368 *
1369 * Just propagate the timer event into the knote
1370 * filter routine (by going through the knote
1371 * synchronization point). Pass a hint to
1372 * indicate this is a real event, not just a
1373 * query from above.
1374 */
1375 static void
1376 filt_timerexpire(void *knx, __unused void *spare)
1377 {
1378 struct klist timer_list;
1379 struct knote *kn = knx;
1380
1381 filt_timerlock();
1382
1383 kn->kn_hookid &= ~TIMER_RUNNING;
1384
1385 /* no "object" for timers, so fake a list */
1386 SLIST_INIT(&timer_list);
1387 SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
1388
1389 KNOTE(&timer_list, 1);
1390
1391 /* if someone is waiting for timer to pop */
1392 if (kn->kn_hookid & TIMER_CANCELWAIT) {
1393 struct kqueue *kq = knote_get_kq(kn);
1394 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
1395 CAST_EVENT64_T(&kn->kn_hook),
1396 THREAD_AWAKENED,
1397 WAITQ_ALL_PRIORITIES);
1398
1399 kn->kn_hookid &= ~TIMER_CANCELWAIT;
1400 }
1401
1402 filt_timerunlock();
1403 }
1404
1405 /*
1406 * Cancel a running timer (or wait for the pop).
1407 * Timer filter lock is held.
1408 * May drop and retake the timer filter lock.
1409 */
1410 static void
1411 filt_timercancel(struct knote *kn)
1412 {
1413 filt_timer_assert_locked();
1414
1415 assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
1416
1417 /* if no timer, then we're good */
1418 if ((kn->kn_hookid & TIMER_RUNNING) == 0)
1419 return;
1420
1421 thread_call_t callout = (thread_call_t)kn->kn_hook;
1422
1423 /* cancel the callout if we can */
1424 if (thread_call_cancel(callout)) {
1425 kn->kn_hookid &= ~TIMER_RUNNING;
1426 return;
1427 }
1428
1429 /* cancel failed, we have to wait for the in-flight expire routine */
1430
1431 kn->kn_hookid |= TIMER_CANCELWAIT;
1432
1433 struct kqueue *kq = knote_get_kq(kn);
1434
1435 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
1436 CAST_EVENT64_T(&kn->kn_hook),
1437 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1438
1439 filt_timerunlock();
1440 thread_block(THREAD_CONTINUE_NULL);
1441 filt_timerlock();
1442
1443 assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
1444 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1445 }
1446
1447 static void
1448 filt_timerarm(struct knote *kn)
1449 {
1450 filt_timer_assert_locked();
1451
1452 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1453
1454 thread_call_t callout = (thread_call_t)kn->kn_hook;
1455
1456 uint64_t deadline = kn->kn_ext[0];
1457 uint64_t leeway = kn->kn_ext[1];
1458
1459 int filter_flags = kn->kn_sfflags;
1460 unsigned int timer_flags = 0;
1461
1462 if (filter_flags & NOTE_CRITICAL)
1463 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1464 else if (filter_flags & NOTE_BACKGROUND)
1465 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1466 else
1467 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1468
1469 if (filter_flags & NOTE_LEEWAY)
1470 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1471
1472 if (filter_flags & NOTE_MACH_CONTINUOUS_TIME)
1473 timer_flags |= THREAD_CALL_CONTINUOUS;
1474
1475 thread_call_enter_delayed_with_leeway(callout, NULL,
1476 deadline, leeway,
1477 timer_flags);
1478
1479 kn->kn_hookid |= TIMER_RUNNING;
1480 }
1481
1482 /*
1483 * Does this knote need a timer armed for it, or should it be ready immediately?
1484 */
1485 static boolean_t
1486 filt_timer_is_ready(struct knote *kn)
1487 {
1488 uint64_t now;
1489
1490 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1491 now = mach_continuous_time();
1492 else
1493 now = mach_absolute_time();
1494
1495 uint64_t deadline = kn->kn_ext[0];
1496
1497 if (deadline < now)
1498 return TRUE;
1499 else
1500 return FALSE;
1501 }
1502
1503 /*
1504 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1505 */
1506 static int
1507 filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev)
1508 {
1509 thread_call_t callout;
1510 int error;
1511
1512 callout = thread_call_allocate_with_options(filt_timerexpire,
1513 (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1514 THREAD_CALL_OPTIONS_ONCE);
1515
1516 if (NULL == callout) {
1517 kn->kn_flags = EV_ERROR;
1518 kn->kn_data = ENOMEM;
1519 return 0;
1520 }
1521
1522 filt_timerlock();
1523
1524 if ((error = filt_timervalidate(kn)) != 0) {
1525 kn->kn_flags = EV_ERROR;
1526 kn->kn_data = error;
1527 filt_timerunlock();
1528
1529 __assert_only boolean_t freed = thread_call_free(callout);
1530 assert(freed);
1531 return 0;
1532 }
1533
1534 kn->kn_hook = (void*)callout;
1535 kn->kn_hookid = 0;
1536 kn->kn_flags |= EV_CLEAR;
1537
1538 /* NOTE_ABSOLUTE implies EV_ONESHOT */
1539 if (kn->kn_sfflags & NOTE_ABSOLUTE)
1540 kn->kn_flags |= EV_ONESHOT;
1541
1542 boolean_t timer_ready = FALSE;
1543
1544 if ((timer_ready = filt_timer_is_ready(kn))) {
1545 /* cause immediate expiration */
1546 kn->kn_data = 1;
1547 } else {
1548 filt_timerarm(kn);
1549 }
1550
1551 filt_timerunlock();
1552
1553 return timer_ready;
1554 }
1555
1556 /*
1557 * Shut down the timer if it's running, and free the callout.
1558 */
1559 static void
1560 filt_timerdetach(struct knote *kn)
1561 {
1562 thread_call_t callout;
1563
1564 filt_timerlock();
1565
1566 callout = (thread_call_t)kn->kn_hook;
1567 filt_timercancel(kn);
1568
1569 filt_timerunlock();
1570
1571 __assert_only boolean_t freed = thread_call_free(callout);
1572 assert(freed);
1573 }
1574
1575 /*
1576 * filt_timerevent - post events to a timer knote
1577 *
1578 * Called in the context of filt_timerexpire with
1579 * the filt_timerlock held
1580 */
1581 static int
1582 filt_timerevent(struct knote *kn, __unused long hint)
1583 {
1584 filt_timer_assert_locked();
1585
1586 kn->kn_data = 1;
1587 return (1);
1588 }
1589
1590 /*
1591 * filt_timertouch - update timer knote with new user input
1592 *
1593 * Cancel and restart the timer based on new user data. When
1594 * the user picks up a knote, clear the count of how many timer
1595 * pops have gone off (in kn_data).
1596 */
1597 static int
1598 filt_timertouch(
1599 struct knote *kn,
1600 struct kevent_internal_s *kev)
1601 {
1602 int error;
1603
1604 filt_timerlock();
1605
1606 /*
1607 * cancel current call - drops and retakes lock
1608 * TODO: not safe against concurrent touches?
1609 */
1610 filt_timercancel(kn);
1611
1612 /* clear if the timer had previously fired, the user no longer wants to see it */
1613 kn->kn_data = 0;
1614
1615 /* capture the new values used to compute deadline */
1616 kn->kn_sdata = kev->data;
1617 kn->kn_sfflags = kev->fflags;
1618 kn->kn_ext[0] = kev->ext[0];
1619 kn->kn_ext[1] = kev->ext[1];
1620
1621 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1622 kn->kn_udata = kev->udata;
1623
1624 /* recalculate deadline */
1625 error = filt_timervalidate(kn);
1626 if (error) {
1627 /* no way to report error, so mark it in the knote */
1628 kn->kn_flags |= EV_ERROR;
1629 kn->kn_data = error;
1630 filt_timerunlock();
1631 return 1;
1632 }
1633
1634 boolean_t timer_ready = FALSE;
1635
1636 if ((timer_ready = filt_timer_is_ready(kn))) {
1637 /* cause immediate expiration */
1638 kn->kn_data = 1;
1639 } else {
1640 filt_timerarm(kn);
1641 }
1642
1643 filt_timerunlock();
1644
1645 return timer_ready;
1646 }
1647
1648 /*
1649 * filt_timerprocess - query state of knote and snapshot event data
1650 *
1651 * Determine if the timer has fired in the past, snapshot the state
1652 * of the kevent for returning to user-space, and clear pending event
1653 * counters for the next time.
1654 */
1655 static int
1656 filt_timerprocess(
1657 struct knote *kn,
1658 __unused struct filt_process_s *data,
1659 struct kevent_internal_s *kev)
1660 {
1661 filt_timerlock();
1662
1663 if (kn->kn_data == 0 || (kn->kn_hookid & TIMER_CANCELWAIT)) {
1664 /*
1665 * kn_data = 0:
1666 * The timer hasn't yet fired, so there's nothing to deliver
1667 * TIMER_CANCELWAIT:
1668 * touch is in the middle of canceling the timer,
1669 * so don't deliver or re-arm anything
1670 *
1671 * This can happen if a touch resets a timer that had fired
1672 * without being processed
1673 */
1674 filt_timerunlock();
1675 return 0;
1676 }
1677
1678 if (kn->kn_sdata != 0 && ((kn->kn_flags & EV_ERROR) == 0)) {
1679 /*
1680 * This is a 'repeating' timer, so we have to emit
1681 * how many intervals expired between the arm
1682 * and the process.
1683 *
1684 * A very strange style of interface, because
1685 * this could easily be done in the client...
1686 */
1687
1688 		/* The timer had better have expired... */
1689 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1690
1691 uint64_t now;
1692
1693 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1694 now = mach_continuous_time();
1695 else
1696 now = mach_absolute_time();
1697
1698 uint64_t first_deadline = kn->kn_ext[0];
1699 uint64_t interval_abs = kn->kn_sdata;
1700 uint64_t orig_arm_time = first_deadline - interval_abs;
1701
1702 assert(now > orig_arm_time);
1703 assert(now > first_deadline);
1704
1705 uint64_t elapsed = now - orig_arm_time;
1706
1707 uint64_t num_fired = elapsed / interval_abs;
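		/*
		 * Worked example with illustrative numbers: a 100ms repeating
		 * timer armed at time T has first_deadline = T + 100ms.  If it
		 * is processed at now = T + 350ms, then elapsed = 350ms,
		 * num_fired = 3, and the re-arm below computes
		 * new_deadline = T + 400ms.
		 */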
1708
1709 /*
1710 * To reach this code, we must have seen the timer pop
1711 * and be in repeating mode, so therefore it must have been
1712 * more than 'interval' time since the attach or last
1713 * successful touch.
1714 *
1715 * An unsuccessful touch would:
1716 * disarm the timer
1717 * clear kn_data
1718 * clear kn_sdata
1719 * set EV_ERROR
1720 * all of which will prevent this code from running.
1721 */
1722 assert(num_fired > 0);
1723
1724 /* report how many intervals have elapsed to the user */
1725 kn->kn_data = (int64_t) num_fired;
1726
1727 /* We only need to re-arm the timer if it's not about to be destroyed */
1728 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1729 /* fire at the end of the next interval */
1730 uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1731
1732 assert(new_deadline > now);
1733
1734 kn->kn_ext[0] = new_deadline;
1735
1736 filt_timerarm(kn);
1737 }
1738 }
1739
1740 /*
1741 * Copy out the interesting kevent state,
1742 * but don't leak out the raw time calculations.
1743 *
1744 * TODO: potential enhancements - tell the user about:
1745 * - deadline to which this timer thought it was expiring
1746 * - return kn_sfflags in the fflags field so the client can know
1747 * under what flags the timer fired
1748 */
1749 *kev = kn->kn_kevent;
1750 kev->ext[0] = 0;
1751 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1752
1753 /* we have delivered the event, reset the timer pop count */
1754 kn->kn_data = 0;
1755
1756 filt_timerunlock();
1757 return 1;
1758 }
1759
1760 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1761 .f_attach = filt_timerattach,
1762 .f_detach = filt_timerdetach,
1763 .f_event = filt_timerevent,
1764 .f_touch = filt_timertouch,
1765 .f_process = filt_timerprocess,
1766 };
1767
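/*
 * Illustrative user-space use of this filter (a sketch; error handling
 * omitted).  Arms a repeating 5-second timer and waits for it to fire:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);    // register; filt_timerattach runs
 *	kevent(kq, NULL, 0, &out, 1, NULL);    // returns after ~5s; out.data is
 *	                                       // the pop count since last read
 *
 * With NOTE_ABSOLUTE the data field is a deadline rather than an interval
 * and the event is forced EV_ONESHOT, as set up in filt_timerattach above.
 */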
1768
1769 #pragma mark EVFILT_USER
1770
1771
1772 static void
1773 filt_userlock(void)
1774 {
1775 lck_spin_lock(&_filt_userlock);
1776 }
1777
1778 static void
1779 filt_userunlock(void)
1780 {
1781 lck_spin_unlock(&_filt_userlock);
1782 }
1783
1784 static int
1785 filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
1786 {
1787 /* EVFILT_USER knotes are not attached to anything in the kernel */
1788 	/* Can't discover this knote until after attach - so no lock needed */
1789 kn->kn_hook = NULL;
1790 if (kn->kn_sfflags & NOTE_TRIGGER) {
1791 kn->kn_hookid = 1;
1792 } else {
1793 kn->kn_hookid = 0;
1794 }
1795 return (kn->kn_hookid);
1796 }
1797
1798 static void
1799 filt_userdetach(__unused struct knote *kn)
1800 {
1801 /* EVFILT_USER knotes are not attached to anything in the kernel */
1802 }
1803
1804 static int
1805 filt_user(
1806 __unused struct knote *kn,
1807 __unused long hint)
1808 {
1809 panic("filt_user");
1810 return 0;
1811 }
1812
1813 static int
1814 filt_usertouch(
1815 struct knote *kn,
1816 struct kevent_internal_s *kev)
1817 {
1818 uint32_t ffctrl;
1819 int fflags;
1820 int active;
1821
1822 filt_userlock();
1823
1824 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1825 fflags = kev->fflags & NOTE_FFLAGSMASK;
1826 switch (ffctrl) {
1827 case NOTE_FFNOP:
1828 break;
1829 case NOTE_FFAND:
1830 kn->kn_sfflags &= fflags;
1831 break;
1832 case NOTE_FFOR:
1833 kn->kn_sfflags |= fflags;
1834 break;
1835 case NOTE_FFCOPY:
1836 kn->kn_sfflags = fflags;
1837 break;
1838 }
1839 kn->kn_sdata = kev->data;
1840
1841 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1842 kn->kn_udata = kev->udata;
1843
1844 if (kev->fflags & NOTE_TRIGGER) {
1845 kn->kn_hookid = 1;
1846 }
1847 active = kn->kn_hookid;
1848
1849 filt_userunlock();
1850
1851 return (active);
1852 }
1853
1854 static int
1855 filt_userprocess(
1856 struct knote *kn,
1857 __unused struct filt_process_s *data,
1858 struct kevent_internal_s *kev)
1859 {
1860 filt_userlock();
1861
1862 if (kn->kn_hookid == 0) {
1863 filt_userunlock();
1864 return 0;
1865 }
1866
1867 *kev = kn->kn_kevent;
1868 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1869 kev->data = kn->kn_sdata;
1870 if (kn->kn_flags & EV_CLEAR) {
1871 kn->kn_hookid = 0;
1872 kn->kn_data = 0;
1873 kn->kn_fflags = 0;
1874 }
1875 filt_userunlock();
1876
1877 return 1;
1878 }
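/*
 * Illustrative userspace sketch (not part of the kernel build): an
 * EVFILT_USER knote is registered once and later fired with
 * NOTE_TRIGGER; the fflags arithmetic follows the NOTE_FFCTRLMASK
 * handling in filt_usertouch() above.  Values are examples only.
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register, not yet active
 *	EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// OR in 0x1 and activate
 */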
1879
1880 #pragma mark EVFILT_WORKLOOP
1881
1882 #if DEBUG || DEVELOPMENT
1883 /*
1884 * see src/queue_internal.h in libdispatch
1885 */
1886 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
1887 #endif
1888
1889 static inline void
1890 filt_wllock(struct kqworkloop *kqwl)
1891 {
1892 lck_mtx_lock(&kqwl->kqwl_statelock);
1893 }
1894
1895 static inline void
1896 filt_wlunlock(struct kqworkloop *kqwl)
1897 {
1898 lck_mtx_unlock(&kqwl->kqwl_statelock);
1899 }
1900
1901 static inline void
1902 filt_wlheld(__assert_only struct kqworkloop *kqwl)
1903 {
1904 LCK_MTX_ASSERT(&kqwl->kqwl_statelock, LCK_MTX_ASSERT_OWNED);
1905 }
1906
1907 #define WL_OWNER_SUSPENDED ((thread_t)(~0ull)) /* special owner when suspended */
1908
1909 static inline bool
1910 filt_wlowner_is_valid(thread_t owner)
1911 {
1912 return owner != THREAD_NULL && owner != WL_OWNER_SUSPENDED;
1913 }
1914
1915 static inline bool
1916 filt_wlshould_end_ownership(struct kqworkloop *kqwl,
1917 struct kevent_internal_s *kev, int error)
1918 {
1919 thread_t owner = kqwl->kqwl_owner;
1920 return (error == 0 || error == ESTALE) &&
1921 (kev->fflags & NOTE_WL_END_OWNERSHIP) &&
1922 (owner == current_thread() || owner == WL_OWNER_SUSPENDED);
1923 }
1924
1925 static inline bool
1926 filt_wlshould_update_ownership(struct kevent_internal_s *kev, int error)
1927 {
1928 return error == 0 && (kev->fflags & NOTE_WL_DISCOVER_OWNER) &&
1929 kev->ext[EV_EXTIDX_WL_ADDR];
1930 }
1931
1932 static inline bool
1933 filt_wlshould_set_async_qos(struct kevent_internal_s *kev, int error,
1934 kq_index_t async_qos)
1935 {
1936 if (error != 0) {
1937 return false;
1938 }
1939 if (async_qos != THREAD_QOS_UNSPECIFIED) {
1940 return true;
1941 }
1942 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
1943 /* see filt_wlprocess() */
1944 return true;
1945 }
1946 return false;
1947 }
1948
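/*
 * filt_wlupdateowner - apply the ownership side effects of a workloop kevent
 *
 * Ends or discovers ownership as requested by the kevent fflags, moves
 * any IPC overrides from the previous owner to the new one, and redrives
 * the thread request when needed.  Returns the error it was passed, or
 * EOWNERDEAD if the discovered owner port no longer names a thread.
 * Called with the filt_wllock held.
 */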
1949 __result_use_check
1950 static int
1951 filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev,
1952 int error, kq_index_t async_qos)
1953 {
1954 struct kqrequest *kqr = &kqwl->kqwl_request;
1955 thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
1956 kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
1957 kq_index_t old_owner_override = THREAD_QOS_UNSPECIFIED;
1958 boolean_t ipc_override_is_sync = false;
1959 boolean_t old_owner_override_is_sync = false;
1960 int action = KQWL_UTQ_NONE;
1961
1962 filt_wlheld(kqwl);
1963
1964 /*
1965 * The owner is only changed under both the filt_wllock and the
1966 * kqwl_req_lock. Looking at it with either one held is fine.
1967 */
1968 cur_owner = kqwl->kqwl_owner;
1969 if (filt_wlshould_end_ownership(kqwl, kev, error)) {
1970 new_owner = THREAD_NULL;
1971 } else if (filt_wlshould_update_ownership(kev, error)) {
1972 /*
1973 * Decipher the owner port name, and translate accordingly.
1974 * The low 2 bits were borrowed for other flags, so mask them off.
1975 */
1976 uint64_t udata = kev->ext[EV_EXTIDX_WL_VALUE];
1977 mach_port_name_t new_owner_name = (mach_port_name_t)udata & ~0x3;
1978 if (new_owner_name != MACH_PORT_NULL) {
1979 new_owner_name = ipc_entry_name_mask(new_owner_name);
1980 }
1981
1982 if (MACH_PORT_VALID(new_owner_name)) {
1983 new_owner = port_name_to_thread(new_owner_name);
1984 if (new_owner == THREAD_NULL)
1985 return EOWNERDEAD;
1986 extra_thread_ref = new_owner;
1987 } else if (new_owner_name == MACH_PORT_DEAD) {
1988 new_owner = WL_OWNER_SUSPENDED;
1989 } else {
1990 /*
1991 * We never want to learn a new owner that is NULL.
1992 * Ownership should be ended with END_OWNERSHIP.
1993 */
1994 new_owner = cur_owner;
1995 }
1996 } else {
1997 new_owner = cur_owner;
1998 }
1999
2000 if (filt_wlshould_set_async_qos(kev, error, async_qos)) {
2001 action = KQWL_UTQ_SET_ASYNC_QOS;
2002 }
2003 if (cur_owner == new_owner && action == KQWL_UTQ_NONE) {
2004 goto out;
2005 }
2006
2007 kqwl_req_lock(kqwl);
2008
2009 /* If already tracked as servicer, don't track as owner */
2010 if ((kqr->kqr_state & KQR_BOUND) && new_owner == kqr->kqr_thread) {
2011 kqwl->kqwl_owner = new_owner = THREAD_NULL;
2012 }
2013
2014 if (cur_owner != new_owner) {
2015 kqwl->kqwl_owner = new_owner;
2016 if (new_owner == extra_thread_ref) {
2017 /* we just transferred this ref to kqwl_owner */
2018 extra_thread_ref = THREAD_NULL;
2019 }
2020 cur_override = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
2021 old_owner_override = kqr->kqr_dsync_owner_qos;
2022 old_owner_override_is_sync = kqr->kqr_owner_override_is_sync;
2023
2024 if (filt_wlowner_is_valid(new_owner)) {
2025 /* override it before we drop the old */
2026 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2027 thread_add_ipc_override(new_owner, cur_override);
2028 }
2029 if (ipc_override_is_sync) {
2030 thread_add_sync_ipc_override(new_owner);
2031 }
2032 /* Update the kqr to indicate that owner has sync ipc override */
2033 kqr->kqr_dsync_owner_qos = cur_override;
2034 kqr->kqr_owner_override_is_sync = ipc_override_is_sync;
2035 thread_starts_owning_workloop(new_owner);
2036 if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED) {
2037 if (action == KQWL_UTQ_NONE) {
2038 action = KQWL_UTQ_REDRIVE_EVENTS;
2039 }
2040 }
2041 } else if (new_owner == THREAD_NULL) {
2042 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
2043 kqr->kqr_owner_override_is_sync = false;
2044 if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
2045 if (action == KQWL_UTQ_NONE) {
2046 action = KQWL_UTQ_REDRIVE_EVENTS;
2047 }
2048 }
2049 }
2050 }
2051
2052 if (action != KQWL_UTQ_NONE) {
2053 kqworkloop_update_threads_qos(kqwl, action, async_qos);
2054 }
2055
2056 kqwl_req_unlock(kqwl);
2057
2058 /* Now that we are unlocked, drop the override and ref on old owner */
2059 if (new_owner != cur_owner && filt_wlowner_is_valid(cur_owner)) {
2060 if (old_owner_override != THREAD_QOS_UNSPECIFIED) {
2061 thread_drop_ipc_override(cur_owner);
2062 }
2063 if (old_owner_override_is_sync) {
2064 thread_drop_sync_ipc_override(cur_owner);
2065 }
2066 thread_ends_owning_workloop(cur_owner);
2067 thread_deallocate(cur_owner);
2068 }
2069
2070 out:
2071 if (extra_thread_ref) {
2072 thread_deallocate(extra_thread_ref);
2073 }
2074 return error;
2075 }
2076
2077 static int
2078 filt_wldebounce(
2079 struct kqworkloop *kqwl,
2080 struct kevent_internal_s *kev,
2081 int default_result)
2082 {
2083 user_addr_t addr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2084 uint64_t udata;
2085 int error;
2086
2087 /* we must have the workloop state mutex held */
2088 filt_wlheld(kqwl);
2089
2090 /* Do we have a debounce address to work with? */
2091 if (addr) {
2092 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2093 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2094
2095 error = copyin_word(addr, &udata, sizeof(udata));
2096 if (error) {
2097 return error;
2098 }
2099
2100 /* update state as copied in */
2101 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2102
2103 /* If the masked bits don't match, reject it as stale */
2104 if ((udata & mask) != (kdata & mask)) {
2105 return ESTALE;
2106 }
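/*
 * Example with hypothetical values: if userspace believed the word at
 * 'addr' was kdata == 0x5 under mask == 0xf, but the load above
 * returned udata == 0x8, the masked values differ and ESTALE is
 * returned; ext[EV_EXTIDX_WL_VALUE] already holds the freshly loaded
 * 0x8 so userspace can inspect what the kernel saw.
 */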
2107
2108 #if DEBUG || DEVELOPMENT
2109 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && !(kev->flags & EV_DELETE)) {
2110 if ((udata & DISPATCH_QUEUE_ENQUEUED) == 0) {
2111 panic("kevent: workloop %#016llx is not enqueued "
2112 "(kev:%p dq_state:%#016llx)", kev->udata, kev, udata);
2113 }
2114 }
2115 #endif
2116 }
2117
2118 return default_result;
2119 }
2120
2121 /*
2122 * Remembers the last update that came in from userspace for debugging reasons.
2123 * - fflags is mirrored from the userspace kevent
2124 * - ext[i, i != VALUE] is mirrored from the userspace kevent
2125 * - ext[VALUE] is set to what the kernel loaded atomically
2126 * - data is set to the error if any
2127 */
2128 static inline void
2129 filt_wlremember_last_update(
2130 __assert_only struct kqworkloop *kqwl,
2131 struct knote *kn,
2132 struct kevent_internal_s *kev,
2133 int error)
2134 {
2135 filt_wlheld(kqwl);
2136 kn->kn_fflags = kev->fflags;
2137 kn->kn_data = error;
2138 memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2139 }
2140
2141 /*
2142 * Return which operations on EVFILT_WORKLOOP need to be protected against
2143 * knoteusewait() causing priority inversions.
2144 */
2145 static bool
2146 filt_wlneeds_boost(struct kevent_internal_s *kev)
2147 {
2148 if (kev == NULL) {
2149 /*
2150 * this is an f_process() usecount, and it can cause a drop to wait
2151 */
2152 return true;
2153 }
2154 if (kev->fflags & NOTE_WL_THREAD_REQUEST) {
2155 /*
2156 * All operations on thread requests may starve drops or re-attach of
2157 * the same knote, so all of them need boosts. None of what we do while
2158 * holding the thread-request usecount blocks anyway.
2159 */
2160 return true;
2161 }
2162 if (kev->fflags & NOTE_WL_SYNC_WAIT) {
2163 /*
2164 * this may call filt_wlwait() and we don't want to hold any boost when
2165 * woken up, this would cause background threads contending on
2166 * dispatch_sync() to wake up at 64 and be preempted immediately when
2167 * this drops.
2168 */
2169 return false;
2170 }
2171
2172 /*
2173 * SYNC_WAIT knotes don't need to be rushed when deleted: there's never a
2174 * detach/reattach race with these. In addition, when the SYNC_WAIT knote
2175 * is dropped, the caller is no longer receiving the workloop overrides,
2176 * if any, and we'd rather schedule other threads than that one, as it
2177 * cannot possibly be stalling anything anymore.
2178 */
2179 return (kev->flags & EV_DELETE) == 0;
2180 }
2181
2182 static int
2183 filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
2184 {
2185 struct kqueue *kq = knote_get_kq(kn);
2186 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2187 int error = 0;
2188 kq_index_t qos_index = 0;
2189
2190 if ((kq->kq_state & KQ_WORKLOOP) == 0) {
2191 error = ENOTSUP;
2192 goto out;
2193 }
2194
2195 #if DEVELOPMENT || DEBUG
2196 if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) {
2197 struct kqrequest *kqr = &kqwl->kqwl_request;
2198
2199 kqwl_req_lock(kqwl);
2200 kev->fflags = 0;
2201 if (kqr->kqr_dsync_waiters) {
2202 kev->fflags |= NOTE_WL_SYNC_WAIT;
2203 }
2204 if (kqr->kqr_qos_index) {
2205 kev->fflags |= NOTE_WL_THREAD_REQUEST;
2206 }
2207 if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
2208 kev->ext[0] = ~0ull;
2209 } else {
2210 kev->ext[0] = thread_tid(kqwl->kqwl_owner);
2211 }
2212 kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread);
2213 kev->ext[2] = thread_owned_workloops_count(current_thread());
2214 kev->ext[3] = kn->kn_kevent.ext[3];
2215 kqwl_req_unlock(kqwl);
2216 error = EBUSY;
2217 goto out;
2218 }
2219 #endif
2220
2221 /* Some simple validation */
2222 int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2223 switch (command) {
2224 case NOTE_WL_THREAD_REQUEST:
2225 if (kn->kn_id != kqwl->kqwl_dynamicid) {
2226 error = EINVAL;
2227 goto out;
2228 }
2229 qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
2230 if (qos_index < THREAD_QOS_MAINTENANCE ||
2231 qos_index > THREAD_QOS_USER_INTERACTIVE) {
2232 error = ERANGE;
2233 goto out;
2234 }
2235 break;
2236 case NOTE_WL_SYNC_WAIT:
2237 case NOTE_WL_SYNC_WAKE:
2238 if (kq->kq_state & KQ_NO_WQ_THREAD) {
2239 error = ENOTSUP;
2240 goto out;
2241 }
2242 if (kn->kn_id == kqwl->kqwl_dynamicid) {
2243 error = EINVAL;
2244 goto out;
2245 }
2246 if ((kn->kn_flags & EV_DISABLE) == 0) {
2247 error = EINVAL;
2248 goto out;
2249 }
2250 if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2251 error = EINVAL;
2252 goto out;
2253 }
2254 break;
2255 default:
2256 error = EINVAL;
2257 goto out;
2258 }
2259
2260 filt_wllock(kqwl);
2261 kn->kn_hook = NULL;
2262
2263 if (command == NOTE_WL_THREAD_REQUEST && kqwl->kqwl_request.kqr_qos_index) {
2264 /*
2265 * There already is a thread request, and well, you're only allowed
2266 * one per workloop, so fail the attach.
2267 *
2268 * Note: kqr_qos_index is always set with the wllock held, so we
2269 * don't need to take the kqr lock.
2270 */
2271 error = EALREADY;
2272 } else {
2273 /* Make sure user and kernel are in agreement on important state */
2274 error = filt_wldebounce(kqwl, kev, 0);
2275 }
2276
2277 error = filt_wlupdateowner(kqwl, kev, error, qos_index);
2278 filt_wlunlock(kqwl);
2279 out:
2280 if (error) {
2281 kn->kn_flags |= EV_ERROR;
2282 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2283 if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2284 error = 0;
2285 }
2286 kn->kn_data = error;
2287 return 0;
2288 }
2289
2290 /* Just attaching the thread request successfully will fire it */
2291 return command == NOTE_WL_THREAD_REQUEST;
2292 }
2293
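/*
 * filt_wlwait - block on a NOTE_WL_SYNC_WAIT knote until it is woken
 *
 * Prefers a direct handoff to the workloop owner or, failing that, the
 * bound servicer thread, and keeps the sync-waiter QoS counts up to
 * date around the block.  Returns 0 on wakeup, EINTR or ECANCELED if
 * the wait was aborted.
 */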
2294 __attribute__((noinline,not_tail_called))
2295 static int
2296 filt_wlwait(struct kqworkloop *kqwl,
2297 struct knote *kn,
2298 struct kevent_internal_s *kev)
2299 {
2300 filt_wlheld(kqwl);
2301 assert((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0);
2302
2303 /*
2304 * Hint to the wakeup side that this thread is waiting. Also used by
2305 * stackshot for waitinfo.
2306 */
2307 kn->kn_hook = current_thread();
2308
2309 thread_set_pending_block_hint(current_thread(), kThreadWaitWorkloopSyncWait);
2310
2311 wait_result_t wr = assert_wait(kn, THREAD_ABORTSAFE);
2312
2313 if (wr == THREAD_WAITING) {
2314 kq_index_t qos_index = qos_index_from_qos(kn, kev->qos, TRUE);
2315 struct kqrequest *kqr = &kqwl->kqwl_request;
2316
2317 thread_t thread_to_handoff = THREAD_NULL; /* holds +1 thread ref */
2318
2319 thread_t kqwl_owner = kqwl->kqwl_owner;
2320 if (filt_wlowner_is_valid(kqwl_owner)) {
2321 thread_reference(kqwl_owner);
2322 thread_to_handoff = kqwl_owner;
2323 }
2324
2325 kqwl_req_lock(kqwl);
2326
2327 if (qos_index) {
2328 assert(kqr->kqr_dsync_waiters < UINT16_MAX);
2329 kqr->kqr_dsync_waiters++;
2330 if (qos_index > kqr->kqr_dsync_waiters_qos) {
2331 kqworkloop_update_threads_qos(kqwl,
2332 KQWL_UTQ_SET_SYNC_WAITERS_QOS, qos_index);
2333 }
2334 }
2335
2336 if ((kqr->kqr_state & KQR_BOUND) && thread_to_handoff == THREAD_NULL) {
2337 assert(kqr->kqr_thread != THREAD_NULL);
2338 thread_t servicer = kqr->kqr_thread;
2339
2340 thread_reference(servicer);
2341 thread_to_handoff = servicer;
2342 }
2343
2344 kqwl_req_unlock(kqwl);
2345
2346 filt_wlunlock(kqwl);
2347
2348 /* TODO: use continuation based blocking <rdar://problem/31299584> */
2349
2350 /* consume a refcount on thread_to_handoff, then thread_block() */
2351 wr = thread_handoff(thread_to_handoff);
2352 thread_to_handoff = THREAD_NULL;
2353
2354 filt_wllock(kqwl);
2355
2356 /* clear waiting state (only one waiting thread - so no race) */
2357 assert(kn->kn_hook == current_thread());
2358
2359 if (qos_index) {
2360 kqwl_req_lock(kqwl);
2361 assert(kqr->kqr_dsync_waiters > 0);
2362 if (--kqr->kqr_dsync_waiters == 0) {
2363 assert(kqr->kqr_dsync_waiters_qos);
2364 kqworkloop_update_threads_qos(kqwl,
2365 KQWL_UTQ_SET_SYNC_WAITERS_QOS, 0);
2366 }
2367 kqwl_req_unlock(kqwl);
2368 }
2369 }
2370
2371 kn->kn_hook = NULL;
2372
2373 switch (wr) {
2374 case THREAD_AWAKENED:
2375 return 0;
2376 case THREAD_INTERRUPTED:
2377 return EINTR;
2378 case THREAD_RESTART:
2379 return ECANCELED;
2380 default:
2381 panic("filt_wlattach: unexpected wait result %d", wr);
2382 return EINVAL;
2383 }
2384 }
2385
2386 /* called in stackshot context to report the thread responsible for blocking this thread */
2387 void
2388 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2389 event64_t event,
2390 thread_waitinfo_t *waitinfo)
2391 {
2392 struct knote *kn = (struct knote*) event;
2393 assert(kdp_is_in_zone(kn, "knote zone"));
2394
2395 assert(kn->kn_hook == thread);
2396
2397 struct kqueue *kq = knote_get_kq(kn);
2398 assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
2399 assert(kq->kq_state & KQ_WORKLOOP);
2400
2401 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2402 struct kqrequest *kqr = &kqwl->kqwl_request;
2403
2404 thread_t kqwl_owner = kqwl->kqwl_owner;
2405 thread_t servicer = kqr->kqr_thread;
2406
2407 if (kqwl_owner == WL_OWNER_SUSPENDED) {
2408 waitinfo->owner = STACKSHOT_WAITOWNER_SUSPENDED;
2409 } else if (kqwl_owner != THREAD_NULL) {
2410 assert(kdp_is_in_zone(kqwl_owner, "threads"));
2411
2412 waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2413 } else if (servicer != THREAD_NULL) {
2414 assert(kdp_is_in_zone(servicer, "threads"));
2415
2416 waitinfo->owner = thread_tid(servicer);
2417 } else if (kqr->kqr_state & KQR_THREQUESTED) {
2418 waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2419 } else {
2420 waitinfo->owner = 0;
2421 }
2422
2423 waitinfo->context = kqwl->kqwl_dynamicid;
2424
2425 return;
2426 }
2427
2428 /*
2429 * Takes kqueue locked, returns locked, may drop in the middle and/or block for a while
2430 */
2431 static int
2432 filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev)
2433 {
2434 struct kqueue *kq = knote_get_kq(kn);
2435 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2436 int error = 0;
2437
2438 if (kev->fflags & NOTE_WL_SYNC_WAIT) {
2439 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
2440 filt_wllock(kqwl);
2441 /* if the wake has already preposted, don't wait */
2442 if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
2443 error = filt_wlwait(kqwl, kn, kev);
2444 filt_wlunlock(kqwl);
2445 knoteuse2kqlock(kq, kn, KNUSE_NONE);
2446 }
2447 }
2448 return error;
2449 }
2450
2451 static void
2452 filt_wldetach(__assert_only struct knote *kn)
2453 {
2454 assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
2455
2456 /*
2457 * Thread requests have nothing to detach.
2458 * Sync waiters should have been aborted out
2459 * and dropped their refs before we could drop/
2460 * detach their knotes.
2461 */
2462 assert(kn->kn_hook == NULL);
2463 }
2464
2465 static int
2466 filt_wlevent(
2467 __unused struct knote *kn,
2468 __unused long hint)
2469 {
2470 panic("filt_wlevent");
2471 return 0;
2472 }
2473
2474 static int
2475 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev)
2476 {
2477 int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2478 int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2479 int error = 0;
2480
2481 switch (new_commands) {
2482 case NOTE_WL_THREAD_REQUEST:
2483 /* thread requests can only update themselves */
2484 if (sav_commands != new_commands)
2485 error = EINVAL;
2486 break;
2487
2488 case NOTE_WL_SYNC_WAIT:
2489 if (kev->fflags & NOTE_WL_END_OWNERSHIP)
2490 error = EINVAL;
2491 /* FALLTHROUGH */
2492 case NOTE_WL_SYNC_WAKE:
2493 /* waits and wakes can update themselves or their counterparts */
2494 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)))
2495 error = EINVAL;
2496 if (kev->fflags & NOTE_WL_UPDATE_QOS)
2497 error = EINVAL;
2498 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE)
2499 error = EINVAL;
2500 if (kev->flags & EV_DELETE) {
2501 /*
2502 * Really this is not supported: there is absolutely no reason
2503 * whatsoever to want to fail the drop of a NOTE_WL_SYNC_WAIT knote.
2504 */
2505 if (kev->ext[EV_EXTIDX_WL_ADDR] && kev->ext[EV_EXTIDX_WL_MASK]) {
2506 error = EINVAL;
2507 }
2508 }
2509 break;
2510
2511 default:
2512 error = EINVAL;
2513 }
2514 if ((kev->flags & EV_DELETE) && (kev->fflags & NOTE_WL_DISCOVER_OWNER)) {
2515 error = EINVAL;
2516 }
2517 return error;
2518 }
2519
2520 static int
2521 filt_wltouch(
2522 struct knote *kn,
2523 struct kevent_internal_s *kev)
2524 {
2525 struct kqueue *kq = knote_get_kq(kn);
2526 int error = 0;
2527 struct kqworkloop *kqwl;
2528
2529 assert(kq->kq_state & KQ_WORKLOOP);
2530 kqwl = (struct kqworkloop *)kq;
2531
2532 error = filt_wlvalidate_kev_flags(kn, kev);
2533 if (error) {
2534 goto out;
2535 }
2536
2537 filt_wllock(kqwl);
2538
2539 /* Make sure user and kernel are in agreement on important state */
2540 error = filt_wldebounce(kqwl, kev, 0);
2541 if (error) {
2542 error = filt_wlupdateowner(kqwl, kev, error, 0);
2543 goto out_unlock;
2544 }
2545
2546 int new_command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2547 switch (new_command) {
2548 case NOTE_WL_THREAD_REQUEST:
2549 assert(kqwl->kqwl_request.kqr_qos_index != THREAD_QOS_UNSPECIFIED);
2550 break;
2551
2552 case NOTE_WL_SYNC_WAIT:
2553 /*
2554 * we need to allow waiting several times on the same knote because
2555 * of EINTR. If it's already woken though, it won't block.
2556 */
2557 break;
2558
2559 case NOTE_WL_SYNC_WAKE:
2560 if (kn->kn_sfflags & NOTE_WL_SYNC_WAKE) {
2561 /* disallow waking the same knote twice */
2562 error = EALREADY;
2563 goto out_unlock;
2564 }
2565 if (kn->kn_hook) {
2566 thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
2567 }
2568 break;
2569
2570 default:
2571 error = EINVAL;
2572 goto out_unlock;
2573 }
2574
2575 /*
2576 * Save off any additional fflags/data we just accepted,
2577 * but only keep the last round of "update" bits we acted on, which helps
2578 * debugging a lot.
2579 */
2580 kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2581 kn->kn_sfflags |= kev->fflags;
2582 kn->kn_sdata = kev->data;
2583
2584 kq_index_t qos_index = THREAD_QOS_UNSPECIFIED;
2585
2586 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2587 qos_t qos = pthread_priority_canonicalize(kev->qos, FALSE);
2588
2589 if (kn->kn_qos != qos) {
2590 qos_index = qos_index_from_qos(kn, qos, FALSE);
2591 if (qos_index == THREAD_QOS_UNSPECIFIED) {
2592 error = ERANGE;
2593 goto out_unlock;
2594 }
2595 kqlock(kq);
2596 if (kn->kn_status & KN_QUEUED) {
2597 knote_dequeue(kn);
2598 knote_set_qos_index(kn, qos_index);
2599 knote_enqueue(kn);
2600 knote_wakeup(kn);
2601 } else {
2602 knote_set_qos_index(kn, qos_index);
2603 }
2604 kn->kn_qos = qos;
2605 kqunlock(kq);
2606 }
2607 }
2608
2609 error = filt_wlupdateowner(kqwl, kev, 0, qos_index);
2610 if (error) {
2611 goto out_unlock;
2612 }
2613
2614 if (new_command == NOTE_WL_SYNC_WAIT) {
2615 /* if the wake has already preposted, don't wait */
2616 if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
2617 error = filt_wlwait(kqwl, kn, kev);
2618 }
2619
2620 out_unlock:
2621 filt_wlremember_last_update(kqwl, kn, kev, error);
2622 filt_wlunlock(kqwl);
2623 out:
2624 if (error) {
2625 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2626 /* If userland wants ESTALE to be hidden, do not activate */
2627 return 0;
2628 }
2629 kev->flags |= EV_ERROR;
2630 kev->data = error;
2631 return 0;
2632 }
2633 /* Just touching the thread request successfully will fire it */
2634 return new_command == NOTE_WL_THREAD_REQUEST;
2635 }
2636
2637 static int
2638 filt_wldrop_and_unlock(
2639 struct knote *kn,
2640 struct kevent_internal_s *kev)
2641 {
2642 struct kqueue *kq = knote_get_kq(kn);
2643 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2644 int error = 0, knoteuse_flags = KNUSE_NONE;
2645
2646 kqlock_held(kq);
2647
2648 assert(kev->flags & EV_DELETE);
2649 assert(kq->kq_state & KQ_WORKLOOP);
2650
2651 error = filt_wlvalidate_kev_flags(kn, kev);
2652 if (error) {
2653 goto out;
2654 }
2655
2656 if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
2657 knoteuse_flags |= KNUSE_BOOST;
2658 }
2659
2660 /* take a usecount to allow taking the filt_wllock */
2661 if (!kqlock2knoteuse(kq, kn, knoteuse_flags)) {
2662 /* knote is being dropped already */
2663 error = EINPROGRESS;
2664 goto out;
2665 }
2666
2667 filt_wllock(kqwl);
2668
2669 /*
2670 * Make sure user and kernel are in agreement on important state
2671 *
2672 * Userland will modify bits to cause this to fail for the touch / drop
2673 * race case (when a drop for a quiescing thread request comes in late after
2674 * the workloop has been woken up again).
2675 */
2676 error = filt_wldebounce(kqwl, kev, 0);
2677
2678 if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
2679 /* knote is no longer alive */
2680 error = EINPROGRESS;
2681 goto out_unlock;
2682 }
2683
2684 if (!error && (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) && kn->kn_inuse) {
2685 /*
2686 * There is a concurrent drop or touch happening, we can't resolve this,
2687 * userland has to redrive.
2688 *
2689 * The race we're worried about here is the following:
2690 *
2691 * f_touch | f_drop_and_unlock
2692 * ------------------------+--------------------------------------------
2693 * | kqlock()
2694 * | kqlock2knoteuse()
2695 * | filt_wllock()
2696 * | debounces successfully
2697 * kqlock() |
2698 * kqlock2knoteuse |
2699 * filt_wllock() <BLOCKS> |
2700 * | knoteuse2kqlock()
2701 * | filt_wlunlock()
2702 * | kqlock2knotedrop() <BLOCKS, WAKES f_touch>
2703 * debounces successfully |
2704 * filt_wlunlock() |
2705 * caller WAKES f_drop |
2706 * | performs drop, but f_touch should have won
2707 *
2708 * So if the usecount is not 0 here, we need to wait for it to drop and
2709 * redrive the whole logic (including looking up the knote again).
2710 */
2711 filt_wlunlock(kqwl);
2712 knoteusewait(kq, kn);
2713 return ERESTART;
2714 }
2715
2716 /*
2717 * If error is 0 this will set kqr_qos_index to THREAD_QOS_UNSPECIFIED
2718 *
2719 * If error is 0 or ESTALE this may drop ownership and cause a thread
2720 * request redrive; however, the kqlock is held, which prevents f_process()
2721 * from running until we have done the drop for real.
2722 */
2723 error = filt_wlupdateowner(kqwl, kev, error, 0);
2724 if (error) {
2725 goto out_unlock;
2726 }
2727
2728 if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2729 NOTE_WL_SYNC_WAIT) {
2730 /*
2731 * When deleting a SYNC_WAIT knote that hasn't been woken up
2732 * explicitly, issue a wake up.
2733 */
2734 kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2735 if (kn->kn_hook) {
2736 thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
2737 }
2738 }
2739
2740 out_unlock:
2741 filt_wlremember_last_update(kqwl, kn, kev, error);
2742 filt_wlunlock(kqwl);
2743
2744 out:
2745 if (error == 0) {
2746 /* If nothing failed, do the regular knote drop. */
2747 if (kqlock2knotedrop(kq, kn)) {
2748 knote_drop(kn, current_proc());
2749 } else {
2750 error = EINPROGRESS;
2751 }
2752 } else {
2753 kqunlock(kq);
2754 }
2755 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2756 error = 0;
2757 }
2758 if (error == EINPROGRESS) {
2759 /*
2760 * filt_wlprocess() makes sure that no event can be delivered for
2761 * NOTE_WL_THREAD_REQUEST knotes once a drop is happening, and
2762 * NOTE_WL_SYNC_* knotes are never fired.
2763 *
2764 * It means that EINPROGRESS is about a state that userland cannot
2765 * observe for this filter (an event being delivered concurrently from
2766 * a drop), so silence the error.
2767 */
2768 error = 0;
2769 }
2770 return error;
2771 }
2772
2773 static int
2774 filt_wlprocess(
2775 struct knote *kn,
2776 __unused struct filt_process_s *data,
2777 struct kevent_internal_s *kev)
2778 {
2779 struct kqueue *kq = knote_get_kq(kn);
2780 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2781 struct kqrequest *kqr = &kqwl->kqwl_request;
2782 int rc = 0;
2783
2784 assert(kq->kq_state & KQ_WORKLOOP);
2785
2786 /* only thread requests should get here */
2787 assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2788 if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
2789 filt_wllock(kqwl);
2790 assert(kqr->kqr_qos_index != THREAD_QOS_UNSPECIFIED);
2791 if (kqwl->kqwl_owner) {
2792 /*
2793 * <rdar://problem/33584321> userspace can sometimes, when events are
2794 * delivered without triggering a drain session, cause the thread
2795 * request knote to be processed.
2796 *
2797 * When that happens, the automatic deactivation due to process
2798 * would swallow the event, so we have to activate the knote again.
2799 */
2800 kqlock(kq);
2801 knote_activate(kn);
2802 kqunlock(kq);
2803 } else if (kqr->kqr_qos_index) {
2804 #if DEBUG || DEVELOPMENT
2805 user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2806 task_t t = current_task();
2807 uint64_t val;
2808 if (addr && task_is_active(t) && !task_is_halting(t) &&
2809 copyin_word(addr, &val, sizeof(val)) == 0 &&
2810 val && (val & DISPATCH_QUEUE_ENQUEUED) == 0) {
2811 panic("kevent: workloop %#016llx is not enqueued "
2812 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2813 kn->kn_udata, kn, val,
2814 kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2815 }
2816 #endif
2817 *kev = kn->kn_kevent;
2818 kev->fflags = kn->kn_sfflags;
2819 kev->data = kn->kn_sdata;
2820 kev->qos = kn->kn_qos;
2821 rc = 1;
2822 }
2823 filt_wlunlock(kqwl);
2824 }
2825 return rc;
2826 }
2827
2828 #pragma mark kevent / knotes
2829
2830 /*
2831 * JMM - placeholder for not-yet-implemented filters
2832 */
2833 static int
2834 filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
2835 {
2836 kn->kn_flags |= EV_ERROR;
2837 kn->kn_data = ENOTSUP;
2838 return 0;
2839 }
2840
2841 struct kqueue *
2842 kqueue_alloc(struct proc *p, unsigned int flags)
2843 {
2844 struct filedesc *fdp = p->p_fd;
2845 struct kqueue *kq = NULL;
2846 int policy;
2847 void *hook = NULL;
2848 uint64_t kq_addr_offset;
2849
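/*
 * Three flavors of kqueue are allocated here: a kqworkq for the workq
 * kqueue (KEVENT_FLAG_WORKQ), a dynamically retained kqworkloop
 * (KEVENT_FLAG_WORKLOOP), or a plain file-backed kqfile.  Each gets its
 * own zone allocation and lock setup below, followed by the shared
 * waitq-set and kq_lock initialization.
 */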
2850 if (flags & KEVENT_FLAG_WORKQ) {
2851 struct kqworkq *kqwq;
2852 int i;
2853
2854 kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
2855 if (kqwq == NULL)
2856 return NULL;
2857
2858 kq = &kqwq->kqwq_kqueue;
2859 bzero(kqwq, sizeof (struct kqworkq));
2860
2861 kqwq->kqwq_state = KQ_WORKQ;
2862
2863 for (i = 0; i < KQWQ_NBUCKETS; i++) {
2864 TAILQ_INIT(&kq->kq_queue[i]);
2865 }
2866 for (i = 0; i < KQWQ_NQOS; i++) {
2867 kqwq->kqwq_request[i].kqr_qos_index = i;
2868 }
2869
2870 lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr);
2871 policy = SYNC_POLICY_FIFO;
2872 hook = (void *)kqwq;
2873
2874 } else if (flags & KEVENT_FLAG_WORKLOOP) {
2875 struct kqworkloop *kqwl;
2876 int i;
2877
2878 kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone);
2879 if (kqwl == NULL)
2880 return NULL;
2881
2882 bzero(kqwl, sizeof (struct kqworkloop));
2883
2884 kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC;
2885 kqwl->kqwl_retains = 1; /* donate a retain to creator */
2886
2887 kq = &kqwl->kqwl_kqueue;
2888 for (i = 0; i < KQWL_NBUCKETS; i++) {
2889 TAILQ_INIT(&kq->kq_queue[i]);
2890 }
2891 TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed);
2892
2893 lck_spin_init(&kqwl->kqwl_reqlock, kq_lck_grp, kq_lck_attr);
2894 lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
2895
2896 policy = SYNC_POLICY_FIFO;
2897 if (flags & KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD) {
2898 policy |= SYNC_POLICY_PREPOST;
2899 kq->kq_state |= KQ_NO_WQ_THREAD;
2900 } else {
2901 hook = (void *)kqwl;
2902 }
2903
2904 } else {
2905 struct kqfile *kqf;
2906
2907 kqf = (struct kqfile *)zalloc(kqfile_zone);
2908 if (kqf == NULL)
2909 return NULL;
2910
2911 kq = &kqf->kqf_kqueue;
2912 bzero(kqf, sizeof (struct kqfile));
2913 TAILQ_INIT(&kq->kq_queue[0]);
2914 TAILQ_INIT(&kqf->kqf_suppressed);
2915
2916 policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
2917 }
2918
2919 waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
2920 lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
2921 kq->kq_p = p;
2922
2923 if (fdp->fd_knlistsize < 0) {
2924 proc_fdlock(p);
2925 if (fdp->fd_knlistsize < 0)
2926 fdp->fd_knlistsize = 0; /* this process has had a kq */
2927 proc_fdunlock(p);
2928 }
2929
2930 kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
2931 /* Assert that the address can be pointer compacted for use with knote */
2932 assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE));
2933 return (kq);
2934 }
2935
2936 /*
2937 * knotes_dealloc - detach all knotes for the process and drop them
2938 *
2939 * Called with proc_fdlock held.
2940 * Returns with it locked.
2941 * May drop it temporarily.
2942 * Process is in such a state that it will not try to allocate
2943 * any more knotes during this teardown (stopped for exit or exec).
2944 */
2945 void
2946 knotes_dealloc(proc_t p)
2947 {
2948 struct filedesc *fdp = p->p_fd;
2949 struct kqueue *kq;
2950 struct knote *kn;
2951 struct klist *kn_hash = NULL;
2952 int i;
2953
2954 /* Close all the fd-indexed knotes up front */
2955 if (fdp->fd_knlistsize > 0) {
2956 for (i = 0; i < fdp->fd_knlistsize; i++) {
2957 while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
2958 kq = knote_get_kq(kn);
2959 kqlock(kq);
2960 proc_fdunlock(p);
2961 /* drop it ourselves or wait */
2962 if (kqlock2knotedrop(kq, kn)) {
2963 knote_drop(kn, p);
2964 }
2965 proc_fdlock(p);
2966 }
2967 }
2968 /* free the table */
2969 FREE(fdp->fd_knlist, M_KQUEUE);
2970 fdp->fd_knlist = NULL;
2971 }
2972 fdp->fd_knlistsize = -1;
2973
2974 knhash_lock(p);
2975 proc_fdunlock(p);
2976
2977 /* Clean out all the hashed knotes as well */
2978 if (fdp->fd_knhashmask != 0) {
2979 for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
2980 while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
2981 kq = knote_get_kq(kn);
2982 kqlock(kq);
2983 knhash_unlock(p);
2984 /* drop it ourselves or wait */
2985 if (kqlock2knotedrop(kq, kn)) {
2986 knote_drop(kn, p);
2987 }
2988 knhash_lock(p);
2989 }
2990 }
2991 kn_hash = fdp->fd_knhash;
2992 fdp->fd_knhashmask = 0;
2993 fdp->fd_knhash = NULL;
2994 }
2995
2996 knhash_unlock(p);
2997
2998 /* free the kn_hash table */
2999 if (kn_hash)
3000 FREE(kn_hash, M_KQUEUE);
3001
3002 proc_fdlock(p);
3003 }
3004
3005
3006 /*
3007 * kqueue_dealloc - detach all knotes from a kqueue and free it
3008 *
3009 * We walk each list looking for knotes referencing
3010 * this kqueue. If we find one, we try to drop it. But
3011 * if we fail to get a drop reference, that will wait
3012 * until it is dropped. So, we can just restart again
3013 * safe in the assumption that the list will eventually
3014 * not contain any more references to this kqueue (either
3015 * we dropped them all, or someone else did).
3016 *
3017 * Assumes no new events are being added to the kqueue.
3018 * Nothing locked on entry or exit.
3019 *
3020 * Workloop kqueues can't get here unless all the knotes
3021 * are already gone and all requested threads have come
3022 * and gone (cancelled or arrived).
3023 */
3024 void
3025 kqueue_dealloc(struct kqueue *kq)
3026 {
3027 struct proc *p;
3028 struct filedesc *fdp;
3029 struct knote *kn;
3030 int i;
3031
3032 if (kq == NULL)
3033 return;
3034
3035 p = kq->kq_p;
3036 fdp = p->p_fd;
3037
3038 proc_fdlock(p);
3039 for (i = 0; i < fdp->fd_knlistsize; i++) {
3040 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3041 while (kn != NULL) {
3042 if (kq == knote_get_kq(kn)) {
3043 assert((kq->kq_state & KQ_WORKLOOP) == 0);
3044 kqlock(kq);
3045 proc_fdunlock(p);
3046 /* drop it ourselves or wait */
3047 if (kqlock2knotedrop(kq, kn)) {
3048 knote_drop(kn, p);
3049 }
3050 proc_fdlock(p);
3051 /* start over at beginning of list */
3052 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3053 continue;
3054 }
3055 kn = SLIST_NEXT(kn, kn_link);
3056 }
3057 }
3058 knhash_lock(p);
3059 proc_fdunlock(p);
3060
3061 if (fdp->fd_knhashmask != 0) {
3062 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
3063 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3064 while (kn != NULL) {
3065 if (kq == knote_get_kq(kn)) {
3066 assert((kq->kq_state & KQ_WORKLOOP) == 0);
3067 kqlock(kq);
3068 knhash_unlock(p);
3069 /* drop it ourselves or wait */
3070 if (kqlock2knotedrop(kq, kn)) {
3071 knote_drop(kn, p);
3072 }
3073 knhash_lock(p);
3074 /* start over at beginning of list */
3075 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3076 continue;
3077 }
3078 kn = SLIST_NEXT(kn, kn_link);
3079 }
3080 }
3081 }
3082 knhash_unlock(p);
3083
3084 if (kq->kq_state & KQ_WORKLOOP) {
3085 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3086 struct kqrequest *kqr = &kqwl->kqwl_request;
3087 thread_t cur_owner = kqwl->kqwl_owner;
3088
3089 assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
3090 if (filt_wlowner_is_valid(cur_owner)) {
3091 /*
3092 * If the kqueue had an owner that prevented the thread request to
3093 * go through, then no unbind happened, and we may have lingering
3094 * overrides to drop.
3095 */
3096 if (kqr->kqr_dsync_owner_qos != THREAD_QOS_UNSPECIFIED) {
3097 thread_drop_ipc_override(cur_owner);
3098 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
3099 }
3100
3101 if (kqr->kqr_owner_override_is_sync) {
3102 thread_drop_sync_ipc_override(cur_owner);
3103 kqr->kqr_owner_override_is_sync = 0;
3104 }
3105 thread_ends_owning_workloop(cur_owner);
3106 thread_deallocate(cur_owner);
3107 kqwl->kqwl_owner = THREAD_NULL;
3108 }
3109 }
3110
3111 /*
3112 * waitq_set_deinit() removes the KQ's waitq set from
3113 * any select sets to which it may belong.
3114 */
3115 waitq_set_deinit(&kq->kq_wqs);
3116 lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
3117
3118 if (kq->kq_state & KQ_WORKQ) {
3119 struct kqworkq *kqwq = (struct kqworkq *)kq;
3120
3121 lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp);
3122 zfree(kqworkq_zone, kqwq);
3123 } else if (kq->kq_state & KQ_WORKLOOP) {
3124 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3125
3126 assert(kqwl->kqwl_retains == 0);
3127 lck_spin_destroy(&kqwl->kqwl_reqlock, kq_lck_grp);
3128 lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
3129 zfree(kqworkloop_zone, kqwl);
3130 } else {
3131 struct kqfile *kqf = (struct kqfile *)kq;
3132
3133 zfree(kqfile_zone, kqf);
3134 }
3135 }
3136
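/*
 * Only dynamically allocated (workloop) kqueues are reference counted;
 * kqueue_retain() and kqueue_release() are no-ops for file-backed and
 * workq kqueues, which do not carry KQ_DYNAMIC.
 */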
3137 static inline void
3138 kqueue_retain(struct kqueue *kq)
3139 {
3140 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3141 uint32_t previous;
3142
3143 if ((kq->kq_state & KQ_DYNAMIC) == 0)
3144 return;
3145
3146 previous = OSIncrementAtomic(&kqwl->kqwl_retains);
3147 if (previous == KQ_WORKLOOP_RETAINS_MAX)
3148 panic("kq(%p) retain overflow", kq);
3149
3150 if (previous == 0)
3151 panic("kq(%p) resurrection", kq);
3152 }
3153
3154 #define KQUEUE_CANT_BE_LAST_REF 0
3155 #define KQUEUE_MIGHT_BE_LAST_REF 1
3156
3157 static inline int
3158 kqueue_release(struct kqueue *kq, __assert_only int possibly_last)
3159 {
3160 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3161
3162 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3163 return 0;
3164 }
3165
3166 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3167 uint32_t refs = OSDecrementAtomic(&kqwl->kqwl_retains);
3168 if (__improbable(refs == 0)) {
3169 panic("kq(%p) over-release", kq);
3170 }
3171 if (refs == 1) {
3172 assert(possibly_last);
3173 }
3174 return refs == 1;
3175 }
3176
3177 int
3178 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
3179 {
3180 struct kqueue *kq;
3181 struct fileproc *fp;
3182 int fd, error;
3183
3184 error = falloc_withalloc(p,
3185 &fp, &fd, vfs_context_current(), fp_zalloc, cra);
3186 if (error) {
3187 return (error);
3188 }
3189
3190 kq = kqueue_alloc(p, 0);
3191 if (kq == NULL) {
3192 fp_free(p, fd, fp);
3193 return (ENOMEM);
3194 }
3195
3196 fp->f_flag = FREAD | FWRITE;
3197 fp->f_ops = &kqueueops;
3198 fp->f_data = kq;
3199
3200 proc_fdlock(p);
3201 *fdflags(p, fd) |= UF_EXCLOSE;
3202 procfdtbl_releasefd(p, fd, NULL);
3203 fp_drop(p, fd, fp, 1);
3204 proc_fdunlock(p);
3205
3206 *retval = fd;
3207 return (error);
3208 }
3209
3210 int
3211 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3212 {
3213 return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
3214 }
3215
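/*
 * kevent_copyin - copy in and decode one changelist entry
 *
 * Converts whichever user-visible layout was requested (legacy 32-bit,
 * legacy 64-bit, or kevent_qos) into the internal representation and
 * advances *addrp past the entry on success.
 */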
3216 static int
3217 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
3218 unsigned int flags)
3219 {
3220 int advance;
3221 int error;
3222
3223 if (flags & KEVENT_FLAG_LEGACY32) {
3224 bzero(kevp, sizeof (*kevp));
3225
3226 if (IS_64BIT_PROCESS(p)) {
3227 struct user64_kevent kev64;
3228
3229 advance = sizeof (kev64);
3230 error = copyin(*addrp, (caddr_t)&kev64, advance);
3231 if (error)
3232 return (error);
3233 kevp->ident = kev64.ident;
3234 kevp->filter = kev64.filter;
3235 kevp->flags = kev64.flags;
3236 kevp->udata = kev64.udata;
3237 kevp->fflags = kev64.fflags;
3238 kevp->data = kev64.data;
3239 } else {
3240 struct user32_kevent kev32;
3241
3242 advance = sizeof (kev32);
3243 error = copyin(*addrp, (caddr_t)&kev32, advance);
3244 if (error)
3245 return (error);
3246 kevp->ident = (uintptr_t)kev32.ident;
3247 kevp->filter = kev32.filter;
3248 kevp->flags = kev32.flags;
3249 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
3250 kevp->fflags = kev32.fflags;
3251 kevp->data = (intptr_t)kev32.data;
3252 }
3253 } else if (flags & KEVENT_FLAG_LEGACY64) {
3254 struct kevent64_s kev64;
3255
3256 bzero(kevp, sizeof (*kevp));
3257
3258 advance = sizeof (struct kevent64_s);
3259 error = copyin(*addrp, (caddr_t)&kev64, advance);
3260 if (error)
3261 return(error);
3262 kevp->ident = kev64.ident;
3263 kevp->filter = kev64.filter;
3264 kevp->flags = kev64.flags;
3265 kevp->udata = kev64.udata;
3266 kevp->fflags = kev64.fflags;
3267 kevp->data = kev64.data;
3268 kevp->ext[0] = kev64.ext[0];
3269 kevp->ext[1] = kev64.ext[1];
3270
3271 } else {
3272 struct kevent_qos_s kevqos;
3273
3274 bzero(kevp, sizeof (*kevp));
3275
3276 advance = sizeof (struct kevent_qos_s);
3277 error = copyin(*addrp, (caddr_t)&kevqos, advance);
3278 if (error)
3279 return error;
3280 kevp->ident = kevqos.ident;
3281 kevp->filter = kevqos.filter;
3282 kevp->flags = kevqos.flags;
3283 kevp->qos = kevqos.qos;
3284 // kevp->xflags = kevqos.xflags;
3285 kevp->udata = kevqos.udata;
3286 kevp->fflags = kevqos.fflags;
3287 kevp->data = kevqos.data;
3288 kevp->ext[0] = kevqos.ext[0];
3289 kevp->ext[1] = kevqos.ext[1];
3290 kevp->ext[2] = kevqos.ext[2];
3291 kevp->ext[3] = kevqos.ext[3];
3292 }
3293 if (!error)
3294 *addrp += advance;
3295 return (error);
3296 }
3297
3298 static int
3299 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
3300 unsigned int flags)
3301 {
3302 user_addr_t addr = *addrp;
3303 int advance;
3304 int error;
3305
3306 /*
3307 * fully initialize the different output event structure
3308 * types from the internal kevent (and some universal
3309 * defaults for fields not represented in the internal
3310 * form).
3311 */
3312 if (flags & KEVENT_FLAG_LEGACY32) {
3313 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
3314
3315 if (IS_64BIT_PROCESS(p)) {
3316 struct user64_kevent kev64;
3317
3318 advance = sizeof (kev64);
3319 bzero(&kev64, advance);
3320
3321 /*
3322 * deal with the special case of a user-supplied
3323 * value of (uintptr_t)-1.
3324 */
3325 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
3326 (uint64_t)-1LL : (uint64_t)kevp->ident;
3327
3328 kev64.filter = kevp->filter;
3329 kev64.flags = kevp->flags;
3330 kev64.fflags = kevp->fflags;
3331 kev64.data = (int64_t) kevp->data;
3332 kev64.udata = kevp->udata;
3333 error = copyout((caddr_t)&kev64, addr, advance);
3334 } else {
3335 struct user32_kevent kev32;
3336
3337 advance = sizeof (kev32);
3338 bzero(&kev32, advance);
3339 kev32.ident = (uint32_t)kevp->ident;
3340 kev32.filter = kevp->filter;
3341 kev32.flags = kevp->flags;
3342 kev32.fflags = kevp->fflags;
3343 kev32.data = (int32_t)kevp->data;
3344 kev32.udata = kevp->udata;
3345 error = copyout((caddr_t)&kev32, addr, advance);
3346 }
3347 } else if (flags & KEVENT_FLAG_LEGACY64) {
3348 struct kevent64_s kev64;
3349
3350 advance = sizeof (struct kevent64_s);
3351 if (flags & KEVENT_FLAG_STACK_EVENTS) {
3352 addr -= advance;
3353 }
3354 bzero(&kev64, advance);
3355 kev64.ident = kevp->ident;
3356 kev64.filter = kevp->filter;
3357 kev64.flags = kevp->flags;
3358 kev64.fflags = kevp->fflags;
3359 kev64.data = (int64_t) kevp->data;
3360 kev64.udata = kevp->udata;
3361 kev64.ext[0] = kevp->ext[0];
3362 kev64.ext[1] = kevp->ext[1];
3363 error = copyout((caddr_t)&kev64, addr, advance);
3364 } else {
3365 struct kevent_qos_s kevqos;
3366
3367 advance = sizeof (struct kevent_qos_s);
3368 if (flags & KEVENT_FLAG_STACK_EVENTS) {
3369 addr -= advance;
3370 }
3371 bzero(&kevqos, advance);
3372 kevqos.ident = kevp->ident;
3373 kevqos.filter = kevp->filter;
3374 kevqos.flags = kevp->flags;
3375 kevqos.qos = kevp->qos;
3376 kevqos.udata = kevp->udata;
3377 kevqos.fflags = kevp->fflags;
3378 kevqos.xflags = 0;
3379 kevqos.data = (int64_t) kevp->data;
3380 kevqos.ext[0] = kevp->ext[0];
3381 kevqos.ext[1] = kevp->ext[1];
3382 kevqos.ext[2] = kevp->ext[2];
3383 kevqos.ext[3] = kevp->ext[3];
3384 error = copyout((caddr_t)&kevqos, addr, advance);
3385 }
3386 if (!error) {
3387 if (flags & KEVENT_FLAG_STACK_EVENTS)
3388 *addrp = addr;
3389 else
3390 *addrp = addr + advance;
3391 }
3392 return (error);
3393 }
3394
3395 static int
3396 kevent_get_data_size(struct proc *p,
3397 uint64_t data_available,
3398 unsigned int flags,
3399 user_size_t *residp)
3400 {
3401 user_size_t resid;
3402 int error = 0;
3403
3404 if (data_available != USER_ADDR_NULL) {
3405 if (flags & KEVENT_FLAG_KERNEL) {
3406 resid = *(user_size_t *)(uintptr_t)data_available;
3407 } else if (IS_64BIT_PROCESS(p)) {
3408 user64_size_t usize;
3409 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3410 resid = (user_size_t)usize;
3411 } else {
3412 user32_size_t usize;
3413 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3414 resid = (user_size_t)usize;
3415 }
3416 if (error)
3417 return(error);
3418 } else {
3419 resid = 0;
3420 }
3421 *residp = resid;
3422 return 0;
3423 }
3424
3425 static int
3426 kevent_put_data_size(struct proc *p,
3427 uint64_t data_available,
3428 unsigned int flags,
3429 user_size_t resid)
3430 {
3431 int error = 0;
3432
3433 if (data_available) {
3434 if (flags & KEVENT_FLAG_KERNEL) {
3435 *(user_size_t *)(uintptr_t)data_available = resid;
3436 } else if (IS_64BIT_PROCESS(p)) {
3437 user64_size_t usize = (user64_size_t)resid;
3438 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3439 } else {
3440 user32_size_t usize = (user32_size_t)resid;
3441 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3442 }
3443 }
3444 return error;
3445 }
3446
3447 /*
3448 * kevent_continue - continue a kevent syscall after blocking
3449 *
3450 * assume we inherit a use count on the kq fileglob.
3451 */
3452
3453 __attribute__((noreturn))
3454 static void
3455 kevent_continue(__unused struct kqueue *kq, void *data, int error)
3456 {
3457 struct _kevent *cont_args;
3458 struct fileproc *fp;
3459 uint64_t data_available;
3460 user_size_t data_size;
3461 user_size_t data_resid;
3462 unsigned int flags;
3463 int32_t *retval;
3464 int noutputs;
3465 int fd;
3466 struct proc *p = current_proc();
3467
3468 cont_args = (struct _kevent *)data;
3469 data_available = cont_args->data_available;
3470 flags = cont_args->process_data.fp_flags;
3471 data_size = cont_args->process_data.fp_data_size;
3472 data_resid = cont_args->process_data.fp_data_resid;
3473 noutputs = cont_args->eventout;
3474 retval = cont_args->retval;
3475 fd = cont_args->fd;
3476 fp = cont_args->fp;
3477
3478 kevent_put_kq(p, fd, fp, kq);
3479
3480 /* don't abandon other output just because of residual copyout failures */
3481 if (error == 0 && data_available && data_resid != data_size) {
3482 (void)kevent_put_data_size(p, data_available, flags, data_resid);
3483 }
3484
3485 /* don't restart after signals... */
3486 if (error == ERESTART)
3487 error = EINTR;
3488 else if (error == EWOULDBLOCK)
3489 error = 0;
3490 if (error == 0)
3491 *retval = noutputs;
3492 unix_syscall_return(error);
3493 }
3494
3495 /*
3496 * kevent - [syscall] register and wait for kernel events
3497 *
3498 */
3499 int
3500 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
3501 {
3502 unsigned int flags = KEVENT_FLAG_LEGACY32;
3503
3504 return kevent_internal(p,
3505 (kqueue_id_t)uap->fd, NULL,
3506 uap->changelist, uap->nchanges,
3507 uap->eventlist, uap->nevents,
3508 0ULL, 0ULL,
3509 flags,
3510 uap->timeout,
3511 kevent_continue,
3512 retval);
3513 }
3514
3515 int
3516 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
3517 {
3518 unsigned int flags;
3519
3520 /* restrict to user flags and set legacy64 */
3521 flags = uap->flags & KEVENT_FLAG_USER;
3522 flags |= KEVENT_FLAG_LEGACY64;
3523
3524 return kevent_internal(p,
3525 (kqueue_id_t)uap->fd, NULL,
3526 uap->changelist, uap->nchanges,
3527 uap->eventlist, uap->nevents,
3528 0ULL, 0ULL,
3529 flags,
3530 uap->timeout,
3531 kevent_continue,
3532 retval);
3533 }
3534
3535 int
3536 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
3537 {
3538 /* restrict to user flags */
3539 uap->flags &= KEVENT_FLAG_USER;
3540
3541 return kevent_internal(p,
3542 (kqueue_id_t)uap->fd, NULL,
3543 uap->changelist, uap->nchanges,
3544 uap->eventlist, uap->nevents,
3545 uap->data_out, (uint64_t)uap->data_available,
3546 uap->flags,
3547 0ULL,
3548 kevent_continue,
3549 retval);
3550 }
3551
3552 int
3553 kevent_qos_internal(struct proc *p, int fd,
3554 user_addr_t changelist, int nchanges,
3555 user_addr_t eventlist, int nevents,
3556 user_addr_t data_out, user_size_t *data_available,
3557 unsigned int flags,
3558 int32_t *retval)
3559 {
3560 return kevent_internal(p,
3561 (kqueue_id_t)fd, NULL,
3562 changelist, nchanges,
3563 eventlist, nevents,
3564 data_out, (uint64_t)data_available,
3565 (flags | KEVENT_FLAG_KERNEL),
3566 0ULL,
3567 NULL,
3568 retval);
3569 }
3570
3571 int
3572 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
3573 {
3574 /* restrict to user flags */
3575 uap->flags &= KEVENT_FLAG_USER;
3576
3577 return kevent_internal(p,
3578 (kqueue_id_t)uap->id, NULL,
3579 uap->changelist, uap->nchanges,
3580 uap->eventlist, uap->nevents,
3581 uap->data_out, (uint64_t)uap->data_available,
3582 (uap->flags | KEVENT_FLAG_DYNAMIC_KQUEUE),
3583 0ULL,
3584 kevent_continue,
3585 retval);
3586 }
3587
3588 int
3589 kevent_id_internal(struct proc *p, kqueue_id_t *id,
3590 user_addr_t changelist, int nchanges,
3591 user_addr_t eventlist, int nevents,
3592 user_addr_t data_out, user_size_t *data_available,
3593 unsigned int flags,
3594 int32_t *retval)
3595 {
3596 return kevent_internal(p,
3597 *id, id,
3598 changelist, nchanges,
3599 eventlist, nevents,
3600 data_out, (uint64_t)data_available,
3601 (flags | KEVENT_FLAG_KERNEL | KEVENT_FLAG_DYNAMIC_KQUEUE),
3602 0ULL,
3603 NULL,
3604 retval);
3605 }
3606
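/*
 * kevent_get_timeout - convert the user timeout into an absolute deadline
 *
 * KEVENT_FLAG_IMMEDIATE yields a deadline that has already passed, a
 * supplied timespec is added to the current uptime, and a zero timeval
 * in *atvp means "wait forever".
 */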
3607 static int
3608 kevent_get_timeout(struct proc *p,
3609 user_addr_t utimeout,
3610 unsigned int flags,
3611 struct timeval *atvp)
3612 {
3613 struct timeval atv;
3614 int error = 0;
3615
3616 if (flags & KEVENT_FLAG_IMMEDIATE) {
3617 getmicrouptime(&atv);
3618 } else if (utimeout != USER_ADDR_NULL) {
3619 struct timeval rtv;
3620 if (flags & KEVENT_FLAG_KERNEL) {
3621 struct timespec *tsp = (struct timespec *)utimeout;
3622 TIMESPEC_TO_TIMEVAL(&rtv, tsp);
3623 } else if (IS_64BIT_PROCESS(p)) {
3624 struct user64_timespec ts;
3625 error = copyin(utimeout, &ts, sizeof(ts));
3626 if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
3627 error = EINVAL;
3628 else
3629 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3630 } else {
3631 struct user32_timespec ts;
3632 error = copyin(utimeout, &ts, sizeof(ts));
3633 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3634 }
3635 if (error)
3636 return (error);
3637 if (itimerfix(&rtv))
3638 return (EINVAL);
3639 getmicrouptime(&atv);
3640 timevaladd(&atv, &rtv);
3641 } else {
3642 /* wait forever value */
3643 atv.tv_sec = 0;
3644 atv.tv_usec = 0;
3645 }
3646 *atvp = atv;
3647 return 0;
3648 }
3649
3650 static int
3651 kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
3652 {
3653 /* each kq should only be used for events of one type */
3654 kqlock(kq);
3655 if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
3656 if (flags & KEVENT_FLAG_LEGACY32) {
3657 if ((kq->kq_state & KQ_KEV32) == 0) {
3658 kqunlock(kq);
3659 return EINVAL;
3660 }
3661 } else if (kq->kq_state & KQ_KEV32) {
3662 kqunlock(kq);
3663 return EINVAL;
3664 }
3665 } else if (flags & KEVENT_FLAG_LEGACY32) {
3666 kq->kq_state |= KQ_KEV32;
3667 } else if (flags & KEVENT_FLAG_LEGACY64) {
3668 kq->kq_state |= KQ_KEV64;
3669 } else {
3670 kq->kq_state |= KQ_KEV_QOS;
3671 }
3672 kqunlock(kq);
3673 return 0;
3674 }
3675
3676 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3677 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
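/*
 * KQ_HASH folds the second byte of the id into the low bits before
 * masking, e.g. with a hypothetical mask of 0xff:
 *	KQ_HASH(0x1234, 0xff) == ((0x1234 ^ 0x12) & 0xff) == 0x26
 */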
3678
3679 static inline void
3680 kqhash_lock(proc_t p)
3681 {
3682 lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
3683 }
3684
3685 static inline void
3686 kqhash_lock_held(__assert_only proc_t p)
3687 {
3688 LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
3689 }
3690
3691 static inline void
3692 kqhash_unlock(proc_t p)
3693 {
3694 lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
3695 }
3696
3697 static void
3698 kqueue_hash_init_if_needed(proc_t p)
3699 {
3700 struct filedesc *fdp = p->p_fd;
3701
3702 kqhash_lock_held(p);
3703
3704 if (__improbable(fdp->fd_kqhash == NULL)) {
3705 struct kqlist *alloc_hash;
3706 u_long alloc_mask;
3707
3708 kqhash_unlock(p);
3709 alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3710 kqhash_lock(p);
3711
3712 /* See if we won the race */
3713 if (fdp->fd_kqhashmask == 0) {
3714 fdp->fd_kqhash = alloc_hash;
3715 fdp->fd_kqhashmask = alloc_mask;
3716 } else {
3717 kqhash_unlock(p);
3718 FREE(alloc_hash, M_KQUEUE);
3719 kqhash_lock(p);
3720 }
3721 }
3722 }
3723
3724 /*
3725 * Called with the kqhash_lock() held
3726 */
3727 static void
3728 kqueue_hash_insert(
3729 struct proc *p,
3730 kqueue_id_t id,
3731 struct kqueue *kq)
3732 {
3733 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3734 struct filedesc *fdp = p->p_fd;
3735 struct kqlist *list;
3736
3737 /* should hold the kq hash lock */
3738 kqhash_lock_held(p);
3739
3740 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3741 assert(kq->kq_state & KQ_DYNAMIC);
3742 return;
3743 }
3744
3745 /* only dynamically allocate workloop kqs for now */
3746 assert(kq->kq_state & KQ_WORKLOOP);
3747 assert(fdp->fd_kqhash);
3748
3749 kqwl->kqwl_dynamicid = id;
3750
3751 list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3752 SLIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3753 }
3754
3755 /* Called with kqhash_lock held */
3756 static void
3757 kqueue_hash_remove(
3758 struct proc *p,
3759 struct kqueue *kq)
3760 {
3761 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3762 struct filedesc *fdp = p->p_fd;
3763 struct kqlist *list;
3764
3765 /* should hold the kq hash lock */
3766 kqhash_lock_held(p);
3767
3768 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3769 assert(kq->kq_state & KQ_DYNAMIC);
3770 return;
3771 }
3772 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3773 list = &fdp->fd_kqhash[KQ_HASH(kqwl->kqwl_dynamicid, fdp->fd_kqhashmask)];
3774 SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
3775 }
3776
3777 /* Called with kqhash_lock held */
3778 static struct kqueue *
3779 kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
3780 {
3781 struct filedesc *fdp = p->p_fd;
3782 struct kqlist *list;
3783 struct kqworkloop *kqwl;
3784
3785 /* should hold the kq hash lock */
3786 kqhash_lock_held(p);
3787
3788 if (fdp->fd_kqhashmask == 0) return NULL;
3789
3790 list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3791 SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
3792 if (kqwl->kqwl_dynamicid == id) {
3793 struct kqueue *kq = (struct kqueue *)kqwl;
3794
3795 assert(kq->kq_state & KQ_DYNAMIC);
3796 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3797 return kq;
3798 }
3799 }
3800 return NULL;
3801 }
3802
3803 static inline void
3804 kqueue_release_last(struct proc *p, struct kqueue *kq)
3805 {
3806 if (kq->kq_state & KQ_DYNAMIC) {
3807 kqhash_lock(p);
3808 if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
3809 kqueue_hash_remove(p, kq);
3810 kqhash_unlock(p);
3811 kqueue_dealloc(kq);
3812 } else {
3813 kqhash_unlock(p);
3814 }
3815 }
3816 }
3817
3818 static struct kqueue *
3819 kevent_get_bound_kq(__assert_only struct proc *p, thread_t thread,
3820 unsigned int kev_flags, unsigned int kq_flags)
3821 {
3822 struct kqueue *kq;
3823 struct uthread *ut = get_bsdthread_info(thread);
3824
3825 assert(p == get_bsdthreadtask_info(thread));
3826
3827 if (!(ut->uu_kqueue_flags & kev_flags))
3828 return NULL;
3829
3830 kq = ut->uu_kqueue_bound;
3831 if (!kq)
3832 return NULL;
3833
3834 if (!(kq->kq_state & kq_flags))
3835 return NULL;
3836
3837 return kq;
3838 }
3839
3840 static int
3841 kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct fileproc **fpp, int *fdp, struct kqueue **kqp)
3842 {
3843 struct filedesc *descp = p->p_fd;
3844 struct fileproc *fp = NULL;
3845 struct kqueue *kq;
3846 int fd = 0;
3847 int error = 0;
3848
3849 /* Was the dynamic kqueue flag passed? Then this is definitely a workloop */
3850 if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) {
3851 assert(flags & KEVENT_FLAG_WORKLOOP);
3852 if (id == (kqueue_id_t)-1 &&
3853 (flags & KEVENT_FLAG_KERNEL) &&
3854 (flags & KEVENT_FLAG_WORKLOOP)) {
3855
3856 assert(is_workqueue_thread(current_thread()));
3857
3858 /*
3859 * When kevent_id_internal is called from within the
3860 * kernel and the passed 'id' value is '-1', we look
3861 * for the currently bound workloop kq.
3862 *
3863 * Until the pthread kext stops calling in to kevent_id_internal
3864 * for threads whose thread-request fulfillment was canceled,
3865 * calling in unbound can't be treated as fatal.
3866 */
3867 kq = kevent_get_bound_kq(p, current_thread(),
3868 KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
3869 if (kq) {
3870 kqueue_retain(kq);
3871 } else {
3872 struct uthread *ut = get_bsdthread_info(current_thread());
3873
3874 /* If thread is unbound due to cancel, just return an error */
3875 if (ut->uu_kqueue_flags == KEVENT_FLAG_WORKLOOP_CANCELED) {
3876 ut->uu_kqueue_flags = 0;
3877 error = ECANCELED;
3878 } else {
3879 panic("Unbound thread called kevent_internal with id=-1"
3880 " uu_kqueue_flags:0x%x, uu_kqueue_bound:%p",
3881 ut->uu_kqueue_flags, ut->uu_kqueue_bound);
3882 }
3883 }
3884
3885 *fpp = NULL;
3886 *fdp = 0;
3887 *kqp = kq;
3888 return error;
3889 }
3890
3891 /* try shortcut on kq lookup for bound threads */
3892 kq = kevent_get_bound_kq(p, current_thread(), KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
3893 if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) {
3894
3895 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3896 error = EEXIST;
3897 kq = NULL;
3898 goto out;
3899 }
3900
3901 /* retain a reference while working with this kq. */
3902 assert(kq->kq_state & KQ_DYNAMIC);
3903 kqueue_retain(kq);
3904 error = 0;
3905 goto out;
3906 }
3907
3908 /* look for the kq in the hash table */
3909 kqhash_lock(p);
3910 kq = kqueue_hash_lookup(p, id);
3911 if (kq == NULL) {
3912 kqhash_unlock(p);
3913
3914 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) {
3915 error = ENOENT;
3916 goto out;
3917 }
3918
3919 struct kqueue *alloc_kq;
3920 alloc_kq = kqueue_alloc(p, flags);
3921 if (alloc_kq) {
3922 kqhash_lock(p);
3923 kqueue_hash_init_if_needed(p);
3924 kq = kqueue_hash_lookup(p, id);
3925 if (kq == NULL) {
3926 /* insert our new one */
3927 kq = alloc_kq;
3928 kqueue_hash_insert(p, id, kq);
3929 kqhash_unlock(p);
3930 } else {
3931 /* lost race, retain existing workloop */
3932 kqueue_retain(kq);
3933 kqhash_unlock(p);
3934 kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
3935 kqueue_dealloc(alloc_kq);
3936 }
3937 } else {
3938 error = ENOMEM;
3939 goto out;
3940 }
3941 } else {
3942
3943 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3944 kqhash_unlock(p);
3945 kq = NULL;
3946 error = EEXIST;
3947 goto out;
3948 }
3949
3950 /* retain a reference while working with this kq. */
3951 assert(kq->kq_state & KQ_DYNAMIC);
3952 kqueue_retain(kq);
3953 kqhash_unlock(p);
3954 }
3955
3956 } else if (flags & KEVENT_FLAG_WORKQ) {
3957 /* must already exist for bound threads. */
3958 if (flags & KEVENT_FLAG_KERNEL) {
3959 assert(descp->fd_wqkqueue != NULL);
3960 }
3961
3962 /*
3963 * use the private kq associated with the proc workq.
3964 * Just being a thread within the process (and not
3965 * being the exit/exec thread) is enough to hold a
3966 * reference on this special kq.
3967 */
3968 kq = descp->fd_wqkqueue;
3969 if (kq == NULL) {
3970 struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
3971 if (alloc_kq == NULL)
3972 return ENOMEM;
3973
3974 knhash_lock(p);
3975 if (descp->fd_wqkqueue == NULL) {
3976 kq = descp->fd_wqkqueue = alloc_kq;
3977 knhash_unlock(p);
3978 } else {
3979 knhash_unlock(p);
3980 kq = descp->fd_wqkqueue;
3981 kqueue_dealloc(alloc_kq);
3982 }
3983 }
3984 } else {
3985 /* get a usecount for the kq itself */
3986 fd = (int)id;
3987 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
3988 return (error);
3989 }
3990 if ((error = kevent_set_kq_mode(kq, flags)) != 0) {
3991 /* drop the usecount */
3992 if (fp != NULL)
3993 fp_drop(p, fd, fp, 0);
3994 return error;
3995 }
3996
3997 out:
3998 *fpp = fp;
3999 *fdp = fd;
4000 *kqp = kq;
4001
4002 return error;
4003 }
4004
4005 static void
4006 kevent_put_kq(
4007 struct proc *p,
4008 kqueue_id_t id,
4009 struct fileproc *fp,
4010 struct kqueue *kq)
4011 {
4012 kqueue_release_last(p, kq);
4013 if (fp != NULL) {
4014 assert((kq->kq_state & KQ_WORKQ) == 0);
4015 fp_drop(p, (int)id, fp, 0);
4016 }
4017 }
4018
4019 static uint64_t
4020 kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id)
4021 {
4022 uint64_t serial_no = 0;
4023 user_addr_t addr;
4024 int rc;
4025
4026 if (workloop_id == 0 || p->p_dispatchqueue_serialno_offset == 0) {
4027 return 0;
4028 }
4029 addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset);
4030
4031 if (proc_is64bit(p)) {
4032 rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no));
4033 } else {
4034 uint32_t serial_no32 = 0;
4035 rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32));
4036 serial_no = serial_no32;
4037 }
4038 return rc == 0 ? serial_no : 0;
4039 }
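
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * kevent_workloop_serial_no_copyin() above reads a serial number from
 * user memory at (workloop_id + p_dispatchqueue_serialno_offset): the
 * workloop id doubles as the address of a user-space object, and the
 * process has registered the offset of a serial-number field inside it.
 * The shape of that arrangement, seen from user space, is sketched
 * below; struct user_workloop and register_serialno_offset() are
 * hypothetical names, not libdispatch's real layout or interface.
 */
#if 0	/* example only, never compiled */
#include <stddef.h>
#include <stdint.h>

/* hypothetical user-space object whose address is used as the workloop id */
struct user_workloop {
	uint64_t wl_flags;
	uint64_t wl_serialno;	/* the field the kernel copies in */
};

static size_t
register_serialno_offset(void)
{
	/*
	 * The process would tell the kernel, once, where the serial number
	 * lives relative to the workloop id; the kernel stores that as
	 * p_dispatchqueue_serialno_offset and later performs
	 * copyin(workloop_id + offset, &serial_no, sizeof(serial_no)).
	 */
	return offsetof(struct user_workloop, wl_serialno);
}
#endif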
4040
4041 int
4042 kevent_exit_on_workloop_ownership_leak(thread_t thread)
4043 {
4044 proc_t p = current_proc();
4045 struct filedesc *fdp = p->p_fd;
4046 kqueue_id_t workloop_id = 0;
4047 os_reason_t reason;
4048 mach_vm_address_t addr;
4049 uint32_t reason_size;
4050
4051 kqhash_lock(p);
4052 if (fdp->fd_kqhashmask > 0) {
4053 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
4054 struct kqworkloop *kqwl;
4055
4056 SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
4057 struct kqueue *kq = &kqwl->kqwl_kqueue;
4058 if ((kq->kq_state & KQ_DYNAMIC) && kqwl->kqwl_owner == thread) {
4059 workloop_id = kqwl->kqwl_dynamicid;
4060 break;
4061 }
4062 }
4063 }
4064 }
4065 kqhash_unlock(p);
4066 assert(workloop_id);
4067
4068 reason = os_reason_create(OS_REASON_LIBSYSTEM,
4069 OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK);
4070 if (reason == OS_REASON_NULL) {
4071 goto out;
4072 }
4073
4074 reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
4075 reason_size = 2 * sizeof(uint64_t);
4076 reason_size = kcdata_estimate_required_buffer_size(2, reason_size);
4077 if (os_reason_alloc_buffer(reason, reason_size) != 0) {
4078 goto out;
4079 }
4080
4081 struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
4082
4083 if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
4084 sizeof(workloop_id), &addr) == KERN_SUCCESS) {
4085 kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
4086 }
4087
4088 uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
4089 if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
4090 sizeof(serial_no), &addr) == KERN_SUCCESS) {
4091 kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
4092 }
4093
4094 out:
4095 #if DEVELOPMENT || DEBUG
4096 psignal_try_thread_with_reason(p, thread, SIGABRT, reason);
4097 return 0;
4098 #else
4099 return exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL,
4100 FALSE, FALSE, 0, reason);
4101 #endif
4102 }
4103
4104
4105 static int
4106 kevent_servicer_detach_preflight(thread_t thread, unsigned int flags, struct kqueue *kq)
4107 {
4108 int error = 0;
4109 struct kqworkloop *kqwl;
4110 struct uthread *ut;
4111 struct kqrequest *kqr;
4112
4113 if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
4114 return EINVAL;
4115
4116 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4117 if (!(kq->kq_state & KQ_NO_WQ_THREAD))
4118 return EINVAL;
4119
4120 /* allow detach only on non-workqueue threads */
4121 if (is_workqueue_thread(thread))
4122 return EINVAL;
4123
4124 /* check that the current thread is bound to the requested kq */
4125 ut = get_bsdthread_info(thread);
4126 if (ut->uu_kqueue_bound != kq)
4127 return EINVAL;
4128
4129 kqwl = (struct kqworkloop *)kq;
4130 kqwl_req_lock(kqwl);
4131 kqr = &kqwl->kqwl_request;
4132
4133 /* check that the kq is bound to the thread */
4134 if ((kqr->kqr_state & KQR_BOUND) == 0 || (kqr->kqr_thread != thread))
4135 error = EINVAL;
4136
4137 kqwl_req_unlock(kqwl);
4138
4139 return error;
4140 }
4141
4142 static void
4143 kevent_servicer_detach_thread(struct proc *p, kqueue_id_t id, thread_t thread,
4144 unsigned int flags, struct kqueue *kq)
4145 {
4146 struct kqworkloop *kqwl;
4147 struct uthread *ut;
4148
4149 assert((flags & KEVENT_FLAG_WORKLOOP) && (kq->kq_state & KQ_WORKLOOP));
4150
4151 /* allow detach only on non-workqueue threads */
4152 assert(!is_workqueue_thread(thread));
4153
4154 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4155 assert(kq->kq_state & KQ_NO_WQ_THREAD);
4156
4157 /* check that the current thread is bound to the requested kq */
4158 ut = get_bsdthread_info(thread);
4159 assert(ut->uu_kqueue_bound == kq);
4160
4161 kqwl = (struct kqworkloop *)kq;
4162
4163 kqlock(kq);
4164
4165 /* unbind the thread.
4166 * The unbind itself checks whether the thread is still processing and ends it.
4167 */
4168 kqworkloop_unbind_thread(kqwl, thread, flags);
4169
4170 kqunlock(kq);
4171
4172 kevent_put_kq(p, id, NULL, kq);
4173
4174 return;
4175 }
4176
4177 static int
4178 kevent_servicer_attach_thread(thread_t thread, unsigned int flags, struct kqueue *kq)
4179 {
4180 int error = 0;
4181 struct kqworkloop *kqwl;
4182 struct uthread *ut;
4183 struct kqrequest *kqr;
4184
4185 if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
4186 return EINVAL;
4187
4188 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4189 if (!(kq->kq_state & KQ_NO_WQ_THREAD))
4190 return EINVAL;
4191
4192 /* allow attach only on non-workqueue threads */
4193 if (is_workqueue_thread(thread))
4194 return EINVAL;
4195
4196 /* check that the thread is not already bound */
4197 ut = get_bsdthread_info(thread);
4198 if (ut->uu_kqueue_bound != NULL)
4199 return EINVAL;
4200
4201 assert(ut->uu_kqueue_flags == 0);
4202
4203 kqlock(kq);
4204 kqwl = (struct kqworkloop *)kq;
4205 kqwl_req_lock(kqwl);
4206 kqr = &kqwl->kqwl_request;
4207
4208 /* check that the kqueue is not already bound */
4209 if (kqr->kqr_state & (KQR_BOUND | KQR_THREQUESTED | KQR_DRAIN)) {
4210 error = EINVAL;
4211 goto out;
4212 }
4213
4214 assert(kqr->kqr_thread == NULL);
4215 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
4216
4217 kqr->kqr_state |= KQR_THREQUESTED;
4218 kqr->kqr_qos_index = THREAD_QOS_UNSPECIFIED;
4219 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
4220 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
4221 kqr->kqr_owner_override_is_sync = 0;
4222
4223 kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
4224
4225 /* get a ref on the wlkq on behalf of the attached thread */
4226 kqueue_retain(kq);
4227
4228 out:
4229 kqwl_req_unlock(kqwl);
4230 kqunlock(kq);
4231
4232 return error;
4233 }
4234
4235 static inline
4236 boolean_t kevent_args_requesting_events(unsigned int flags, int nevents)
4237 {
4238 return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0);
4239 }
4240
4241 static int
4242 kevent_internal(struct proc *p,
4243 kqueue_id_t id, kqueue_id_t *id_out,
4244 user_addr_t changelist, int nchanges,
4245 user_addr_t ueventlist, int nevents,
4246 user_addr_t data_out, uint64_t data_available,
4247 unsigned int flags,
4248 user_addr_t utimeout,
4249 kqueue_continue_t continuation,
4250 int32_t *retval)
4251 {
4252 struct _kevent *cont_args;
4253 uthread_t ut;
4254 struct kqueue *kq;
4255 struct fileproc *fp = NULL;
4256 int fd = 0;
4257 struct kevent_internal_s kev;
4258 int error, noutputs;
4259 struct timeval atv;
4260 user_size_t data_size;
4261 user_size_t data_resid;
4262 thread_t thread = current_thread();
4263
4264 /* Don't allow user-space threads to process output events from the workq kqs */
4265 if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) &&
4266 kevent_args_requesting_events(flags, nevents))
4267 return EINVAL;
4268
4269 /* restrict dynamic kqueue allocation to workloops (for now) */
4270 if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE)
4271 return EINVAL;
4272
4273 if (flags & (KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH |
4274 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD)) {
4275
4276 /* allowed only on workloops when calling kevent_id from user-space */
4277 if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE))
4278 return EINVAL;
4279
4280 /* cannot attach and detach simultaneously */
4281 if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) && (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH))
4282 return EINVAL;
4283
4284 /* cannot ask for events and detach */
4285 if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) && kevent_args_requesting_events(flags, nevents))
4286 return EINVAL;
4287
4288 }
4289
4290 /* prepare to deal with stack-wise allocation of out events */
4291 if (flags & KEVENT_FLAG_STACK_EVENTS) {
4292 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
4293 (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
4294 sizeof(struct user32_kevent)) :
4295 ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
4296 sizeof(struct kevent_qos_s)));
4297 ueventlist += nevents * scale;
4298 }
4299
4300 /* convert timeout to absolute - if we have one (and not immediate) */
4301 error = kevent_get_timeout(p, utimeout, flags, &atv);
4302 if (error)
4303 return error;
4304
4305 /* copyin initial value of data residual from data_available */
4306 error = kevent_get_data_size(p, data_available, flags, &data_size);
4307 if (error)
4308 return error;
4309
4310 /* get the kq we are going to be working on */
4311 error = kevent_get_kq(p, id, flags, &fp, &fd, &kq);
4312 if (error)
4313 return error;
4314
4315 /* only bound threads can receive events on workloops */
4316 if ((flags & KEVENT_FLAG_WORKLOOP) && kevent_args_requesting_events(flags, nevents)) {
4317 ut = (uthread_t)get_bsdthread_info(thread);
4318 if (ut->uu_kqueue_bound != kq) {
4319 error = EXDEV;
4320 goto out;
4321 }
4322
4323 }
4324
4325 /* attach the current thread if necessary */
4326 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) {
4327 error = kevent_servicer_attach_thread(thread, flags, kq);
4328 if (error)
4329 goto out;
4330 }
4331 else {
4332 /* before processing events and committing to the system call, return an error if the thread cannot be detached when requested */
4333 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
4334 error = kevent_servicer_detach_preflight(thread, flags, kq);
4335 if (error)
4336 goto out;
4337 }
4338 }
4339
4340 if (id_out && kq && (flags & KEVENT_FLAG_WORKLOOP)) {
4341 assert(kq->kq_state & KQ_WORKLOOP);
4342 struct kqworkloop *kqwl;
4343 kqwl = (struct kqworkloop *)kq;
4344 *id_out = kqwl->kqwl_dynamicid;
4345 }
4346
4347 /* register all the change requests the user provided... */
4348 noutputs = 0;
4349 while (nchanges > 0 && error == 0) {
4350 error = kevent_copyin(&changelist, &kev, p, flags);
4351 if (error)
4352 break;
4353
4354 /* Make sure user doesn't pass in any system flags */
4355 kev.flags &= ~EV_SYSFLAGS;
4356
4357 kevent_register(kq, &kev, p);
4358
4359 if (nevents > 0 &&
4360 ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) {
4361 if (kev.flags & EV_RECEIPT) {
4362 kev.flags |= EV_ERROR;
4363 kev.data = 0;
4364 }
4365 error = kevent_copyout(&kev, &ueventlist, p, flags);
4366 if (error == 0) {
4367 nevents--;
4368 noutputs++;
4369 }
4370 } else if (kev.flags & EV_ERROR) {
4371 error = kev.data;
4372 }
4373 nchanges--;
4374 }
4375
4376 /* short-circuit the scan if we only want error events */
4377 if (flags & KEVENT_FLAG_ERROR_EVENTS)
4378 nevents = 0;
4379
4380 /* process pending events */
4381 if (nevents > 0 && noutputs == 0 && error == 0) {
4382 /* store the continuation/completion data in the uthread */
4383 ut = (uthread_t)get_bsdthread_info(thread);
4384 cont_args = &ut->uu_kevent.ss_kevent;
4385 cont_args->fp = fp;
4386 cont_args->fd = fd;
4387 cont_args->retval = retval;
4388 cont_args->eventlist = ueventlist;
4389 cont_args->eventcount = nevents;
4390 cont_args->eventout = noutputs;
4391 cont_args->data_available = data_available;
4392 cont_args->process_data.fp_fd = (int)id;
4393 cont_args->process_data.fp_flags = flags;
4394 cont_args->process_data.fp_data_out = data_out;
4395 cont_args->process_data.fp_data_size = data_size;
4396 cont_args->process_data.fp_data_resid = data_size;
4397
4398 error = kqueue_scan(kq, kevent_callback,
4399 continuation, cont_args,
4400 &cont_args->process_data,
4401 &atv, p);
4402
4403 /* process remaining outputs */
4404 noutputs = cont_args->eventout;
4405 data_resid = cont_args->process_data.fp_data_resid;
4406
4407 /* copyout residual data size value (if it needs to be copied out) */
4408 /* don't abandon other output just because of residual copyout failures */
4409 if (error == 0 && data_available && data_resid != data_size) {
4410 (void)kevent_put_data_size(p, data_available, flags, data_resid);
4411 }
4412 }
4413
4414 /* detach the current thread if necessary */
4415 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
4416 assert(fp == NULL);
4417 kevent_servicer_detach_thread(p, id, thread, flags, kq);
4418 }
4419
4420 out:
4421 kevent_put_kq(p, id, fp, kq);
4422
4423 /* don't restart after signals... */
4424 if (error == ERESTART)
4425 error = EINTR;
4426 else if (error == EWOULDBLOCK)
4427 error = 0;
4428 if (error == 0)
4429 *retval = noutputs;
4430 return (error);
4431 }
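
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The registration loop in kevent_internal() above reports per-change
 * failures back through the event list (EV_ERROR, with the errno in
 * 'data') rather than failing the whole call, and EV_RECEIPT forces a
 * result entry even on success. A minimal user-space caller observing
 * that protocol through the public kqueue()/kevent() interface:
 */
#if 0	/* example only, never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

static void
register_with_receipt(int fd)
{
	int kq = kqueue();
	struct kevent change, result;

	/* EV_RECEIPT: always return the outcome of this change as an EV_ERROR entry */
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);

	int n = kevent(kq, &change, 1, &result, 1, NULL);
	if (n == 1 && (result.flags & EV_ERROR)) {
		if (result.data == 0)
			printf("fd %d registered\n", fd);
		else
			printf("fd %d failed: errno %ld\n", fd, (long)result.data);
	}
	close(kq);
}
#endif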
4432
4433
4434 /*
4435 * kevent_callback - callback for each individual event
4436 *
4437 * called with nothing locked
4438 * caller holds a reference on the kqueue
4439 */
4440 static int
4441 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
4442 void *data)
4443 {
4444 struct _kevent *cont_args;
4445 int error;
4446
4447 cont_args = (struct _kevent *)data;
4448 assert(cont_args->eventout < cont_args->eventcount);
4449
4450 /*
4451 * Copy out the appropriate amount of event data for this user.
4452 */
4453 error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
4454 cont_args->process_data.fp_flags);
4455
4456 /*
4457 * If there isn't space for additional events, return
4458 * a harmless error to stop the processing here
4459 */
4460 if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
4461 error = EWOULDBLOCK;
4462 return (error);
4463 }
4464
4465 /*
4466 * kevent_description - format a description of a kevent for diagnostic output
4467 *
4468 * called with a 256-byte string buffer
4469 */
4470
4471 char *
4472 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
4473 {
4474 snprintf(s, n,
4475 "kevent="
4476 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
4477 kevp->ident,
4478 kevp->filter,
4479 kevp->flags,
4480 kevp->udata,
4481 kevp->fflags,
4482 kevp->data,
4483 kevp->ext[0],
4484 kevp->ext[1] );
4485
4486 return (s);
4487 }
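
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * Per the comment above, callers of kevent_description() supply a
 * 256-byte buffer. A hypothetical diagnostic call site could look like:
 */
#if 0	/* example only, never compiled */
static void
debug_log_kevent(struct kevent_internal_s *kevp)
{
	char buf[256];	/* sized per the "256-byte string buffer" convention above */

	printf("kevent debug: %s\n", kevent_description(kevp, buf, sizeof(buf)));
}
#endif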
4488
4489 /*
4490 * kevent_register - add a new event to a kqueue
4491 *
4492 * Creates a mapping between the event source and
4493 * the kqueue via a knote data structure.
4494 *
4495 * Because many/most of the event sources are file
4496 * descriptor related, the knote is linked off
4497 * the file descriptor table for quick access.
4498 *
4499 * called with nothing locked
4500 * caller holds a reference on the kqueue
4501 */
4502
4503 void
4504 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
4505 __unused struct proc *ctxp)
4506 {
4507 struct proc *p = kq->kq_p;
4508 const struct filterops *fops;
4509 struct knote *kn = NULL;
4510 int result = 0;
4511 int error = 0;
4512 unsigned short kev_flags = kev->flags;
4513 int knoteuse_flags = KNUSE_NONE;
4514
4515 if (kev->filter < 0) {
4516 if (kev->filter + EVFILT_SYSCOUNT < 0) {
4517 error = EINVAL;
4518 goto out;
4519 }
4520 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
4521 } else {
4522 error = EINVAL;
4523 goto out;
4524 }
4525
4526 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4527 if ((kev->flags & EV_VANISHED) &&
4528 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
4529 error = EINVAL;
4530 goto out;
4531 }
4532
4533 /* Simplify the flags - delete and disable overrule */
4534 if (kev->flags & EV_DELETE)
4535 kev->flags &= ~EV_ADD;
4536 if (kev->flags & EV_DISABLE)
4537 kev->flags &= ~EV_ENABLE;
4538
4539 if (kq->kq_state & KQ_WORKLOOP) {
4540 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
4541 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4542 kev->udata, kev->flags, kev->filter);
4543 } else if (kq->kq_state & KQ_WORKQ) {
4544 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
4545 0, kev->udata, kev->flags, kev->filter);
4546 } else {
4547 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
4548 VM_KERNEL_UNSLIDE_OR_PERM(kq),
4549 kev->udata, kev->flags, kev->filter);
4550 }
4551
4552 restart:
4553
4554 /* find the matching knote from the fd tables/hashes */
4555 kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
4556
4557 if (kn == NULL) {
4558 if (kev->flags & EV_ADD) {
4559 struct fileproc *knote_fp = NULL;
4560
4561 /* grab a file reference for the new knote */
4562 if (fops->f_isfd) {
4563 if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
4564 goto out;
4565 }
4566 }
4567
4568 kn = knote_alloc();
4569 if (kn == NULL) {
4570 error = ENOMEM;
4571 if (knote_fp != NULL)
4572 fp_drop(p, kev->ident, knote_fp, 0);
4573 goto out;
4574 }
4575
4576 kn->kn_fp = knote_fp;
4577 knote_set_kq(kn, kq);
4578 kqueue_retain(kq); /* retain a kq ref */
4579 kn->kn_filtid = ~kev->filter;
4580 kn->kn_inuse = 1; /* for f_attach() */
4581 kn->kn_status = KN_ATTACHING | KN_ATTACHED;
4582
4583 /* was vanish support requested */
4584 if (kev->flags & EV_VANISHED) {
4585 kev->flags &= ~EV_VANISHED;
4586 kn->kn_status |= KN_REQVANISH;
4587 }
4588
4589 /* snapshot matching/dispatching protocol flags into the knote */
4590 if (kev->flags & EV_DISPATCH)
4591 kn->kn_status |= KN_DISPATCH;
4592 if (kev->flags & EV_UDATA_SPECIFIC)
4593 kn->kn_status |= KN_UDATA_SPECIFIC;
4594
4595 /*
4596 * copy the kevent state into knote
4597 * protocol is that fflags and data
4598 * are saved off, and cleared before
4599 * calling the attach routine.
4600 */
4601 kn->kn_kevent = *kev;
4602 kn->kn_sfflags = kev->fflags;
4603 kn->kn_sdata = kev->data;
4604 kn->kn_fflags = 0;
4605 kn->kn_data = 0;
4606
4607 /* invoke pthread kext to convert kevent qos to thread qos */
4608 knote_canonicalize_kevent_qos(kn);
4609 knote_set_qos_index(kn, qos_index_from_qos(kn, kn->kn_qos, FALSE));
4610
4611 /* before anyone can find it */
4612 if (kev->flags & EV_DISABLE) {
4613 /*
4614 * do this before anyone can find it;
4615 * we can't call knote_disable() here because it expects
4616 * the kqlock to be held
4617 */
4618 kn->kn_status |= KN_DISABLED;
4619 }
4620
4621 /* Add the knote for lookup through the fd table */
4622 error = kq_add_knote(kq, kn, kev, p, &knoteuse_flags);
4623 if (error) {
4624 (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
4625 knote_free(kn);
4626 if (knote_fp != NULL)
4627 fp_drop(p, kev->ident, knote_fp, 0);
4628
4629 if (error == ERESTART) {
4630 error = 0;
4631 goto restart;
4632 }
4633 goto out;
4634 }
4635
4636 /* fp reference count now applies to knote */
4637 /* rwlock boost is now held */
4638
4639 /* call filter attach routine */
4640 result = fops->f_attach(kn, kev);
4641
4642 /*
4643 * Trade knote use count for kq lock.
4644 * Cannot be dropped because we held
4645 * KN_ATTACHING throughout.
4646 */
4647 knoteuse2kqlock(kq, kn, KNUSE_STEAL_DROP | knoteuse_flags);
4648
4649 if (kn->kn_flags & EV_ERROR) {
4650 /*
4651 * Failed to attach correctly, so drop.
4652 * All other possible users/droppers
4653 * have deferred to us. Save the error
4654 * to return to our caller.
4655 */
4656 kn->kn_status &= ~KN_ATTACHED;
4657 kn->kn_status |= KN_DROPPING;
4658 error = kn->kn_data;
4659 kqunlock(kq);
4660 knote_drop(kn, p);
4661 goto out;
4662 }
4663
4664 /* end "attaching" phase - now just attached */
4665 kn->kn_status &= ~KN_ATTACHING;
4666
4667 if (kn->kn_status & KN_DROPPING) {
4668 /*
4669 * Attach succeeded, but someone else
4670 * deferred their drop - now we have
4671 * to do it for them.
4672 */
4673 kqunlock(kq);
4674 knote_drop(kn, p);
4675 goto out;
4676 }
4677
4678 /* Mark the thread request overcommit, if appropriate */
4679 knote_set_qos_overcommit(kn);
4680
4681 /*
4682 * If the attach routine indicated that an
4683 * event is already fired, activate the knote.
4684 */
4685 if (result)
4686 knote_activate(kn);
4687
4688 if (knote_fops(kn)->f_post_attach) {
4689 error = knote_fops(kn)->f_post_attach(kn, kev);
4690 if (error) {
4691 kqunlock(kq);
4692 goto out;
4693 }
4694 }
4695
4696 } else {
4697 if ((kev_flags & (EV_ADD | EV_DELETE)) == (EV_ADD | EV_DELETE) &&
4698 (kq->kq_state & KQ_WORKLOOP)) {
4699 /*
4700 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
4701 * that doesn't care about ENOENT, so just pretend the deletion
4702 * happened.
4703 */
4704 } else {
4705 error = ENOENT;
4706 }
4707 goto out;
4708 }
4709
4710 } else {
4711 /* existing knote: kqueue lock already taken by kq_find_knote_and_kq_lock */
4712
4713 if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
4714 /*
4715 * The knote is not in a stable state, wait for that
4716 * transition to complete and then redrive the lookup.
4717 */
4718 knoteusewait(kq, kn);
4719 goto restart;
4720 }
4721
4722 if (kev->flags & EV_DELETE) {
4723
4724 /*
4725 * If attempting to delete a disabled dispatch2 knote,
4726 * we must wait for the knote to be re-enabled (unless
4727 * it is being re-enabled atomically here).
4728 */
4729 if ((kev->flags & EV_ENABLE) == 0 &&
4730 (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
4731 (KN_DISPATCH2 | KN_DISABLED)) {
4732 kn->kn_status |= KN_DEFERDELETE;
4733 kqunlock(kq);
4734 error = EINPROGRESS;
4735 } else if (knote_fops(kn)->f_drop_and_unlock) {
4736 /*
4737 * The filter has requested to handle EV_DELETE events
4738 *
4739 * ERESTART means the kevent has to be re-evaluated
4740 */
4741 error = knote_fops(kn)->f_drop_and_unlock(kn, kev);
4742 if (error == ERESTART) {
4743 error = 0;
4744 goto restart;
4745 }
4746 } else if (kqlock2knotedrop(kq, kn)) {
4747 /* standard/default EV_DELETE path */
4748 knote_drop(kn, p);
4749 } else {
4750 /*
4751 * The kqueue is unlocked, it's not being
4752 * dropped, and kqlock2knotedrop returned 0:
4753 * this means that someone stole the drop of
4754 * the knote from us.
4755 */
4756 error = EINPROGRESS;
4757 }
4758 goto out;
4759 }
4760
4761 /*
4762 * If we are re-enabling a deferred-delete knote,
4763 * just enable it now and avoid calling the
4764 * filter touch routine (it has delivered its
4765 * last event already).
4766 */
4767 if ((kev->flags & EV_ENABLE) &&
4768 (kn->kn_status & KN_DEFERDELETE)) {
4769 assert(kn->kn_status & KN_DISABLED);
4770 knote_activate(kn);
4771 knote_enable(kn);
4772 kqunlock(kq);
4773 goto out;
4774 }
4775
4776 /*
4777 * If we are disabling, do it before unlocking and
4778 * calling the touch routine (so no processing can
4779 * see the new kevent state before the disable is
4780 * applied).
4781 */
4782 if (kev->flags & EV_DISABLE)
4783 knote_disable(kn);
4784
4785 /*
4786 * Convert the kqlock to a use reference on the
4787 * knote so we can call the filter touch routine.
4788 */
4789 if (knoteuse_needs_boost(kn, kev)) {
4790 knoteuse_flags |= KNUSE_BOOST;
4791 }
4792 if (kqlock2knoteuse(kq, kn, knoteuse_flags)) {
4793 /*
4794 * Call touch routine to notify filter of changes
4795 * in filter values (and to re-determine if any
4796 * events are fired).
4797 */
4798 result = knote_fops(kn)->f_touch(kn, kev);
4799
4800 /* Get the kq lock back (don't defer droppers). */
4801 if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
4802 kqunlock(kq);
4803 goto out;
4804 }
4805
4806 /* Handle errors during touch routine */
4807 if (kev->flags & EV_ERROR) {
4808 error = kev->data;
4809 kqunlock(kq);
4810 goto out;
4811 }
4812
4813 /* Activate it if the touch routine said to */
4814 if (result)
4815 knote_activate(kn);
4816 }
4817
4818 /* Enable the knote if called for */
4819 if (kev->flags & EV_ENABLE)
4820 knote_enable(kn);
4821
4822 }
4823
4824 /* still have kqlock held and knote is valid */
4825 kqunlock(kq);
4826
4827 out:
4828 /* output local errors through the kevent */
4829 if (error) {
4830 kev->flags |= EV_ERROR;
4831 kev->data = error;
4832 }
4833 }
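
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * From user space, the registration path handled by kevent_register()
 * above is reached through the public kevent(2) interface. A minimal
 * caller that creates a knote watching a socket for readability
 * (watch_readable() is a hypothetical helper name):
 */
#if 0	/* example only, never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
watch_readable(int sock)
{
	int kq = kqueue();
	struct kevent ev;

	/* EV_ADD creates (or updates) the knote; EV_CLEAR makes it edge-triggered */
	EV_SET(&ev, sock, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1) {
		close(kq);
		return -1;
	}
	return kq;	/* later kevent() calls on kq report readability events */
}
#endif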
4834
4835
4836 /*
4837 * knote_process - process a triggered event
4838 *
4839 * Validate that it is really still a triggered event
4840 * by calling the filter routines (if necessary). Hold
4841 * a use reference on the knote to avoid it being detached.
4842 *
4843 * If it is still considered triggered, we will have taken
4844 * a copy of the state under the filter lock. We use that
4845 * snapshot to dispatch the knote for future processing (or
4846 * not, if this was a lost event).
4847 *
4848 * Our caller assures us that nobody else can be processing
4849 * events from this knote during the whole operation. But
4850 * others can be touching or posting events to the knote
4851 * interspersed with our processing it.
4852 *
4853 * caller holds a reference on the kqueue.
4854 * kqueue locked on entry and exit - but may be dropped
4855 */
4856 static int
4857 knote_process(struct knote *kn,
4858 kevent_callback_t callback,
4859 void *callback_data,
4860 struct filt_process_s *process_data,
4861 struct proc *p)
4862 {
4863 struct kevent_internal_s kev;
4864 struct kqueue *kq = knote_get_kq(kn);
4865 int result = 0;
4866 int error = 0;
4867
4868 bzero(&kev, sizeof(kev));
4869
4870 /*
4871 * Must be active or stayactive
4872 * Must be queued and not disabled/suppressed
4873 */
4874 assert(kn->kn_status & KN_QUEUED);
4875 assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE));
4876 assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING)));
4877
4878 if (kq->kq_state & KQ_WORKLOOP) {
4879 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4880 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4881 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4882 kn->kn_filtid);
4883 } else if (kq->kq_state & KQ_WORKQ) {
4884 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4885 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4886 kn->kn_filtid);
4887 } else {
4888 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4889 VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4890 kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4891 }
4892
4893 /*
4894 * For deferred-drop or vanished events, we just create a fake
4895 * event to acknowledge end-of-life. Otherwise, we call the
4896 * filter's process routine to snapshot the kevent state under
4897 * the filter's locking protocol.
4898 */
4899 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4900 /* create fake event */
4901 kev.filter = kn->kn_filter;
4902 kev.ident = kn->kn_id;
4903 kev.qos = kn->kn_qos;
4904 kev.flags = (kn->kn_status & KN_DEFERDELETE) ?
4905 EV_DELETE : EV_VANISHED;
4906 kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
4907 kev.udata = kn->kn_udata;
4908 result = 1;
4909
4910 knote_suppress(kn);
4911 } else {
4912 int flags = KNUSE_NONE;
4913 /* deactivate - so new activations indicate a wakeup */
4914 knote_deactivate(kn);
4915
4916 /* suppress knotes to avoid returning the same event multiple times in a single call. */
4917 knote_suppress(kn);
4918
4919 if (knoteuse_needs_boost(kn, NULL)) {
4920 flags |= KNUSE_BOOST;
4921 }
4922 /* convert lock to a knote use reference */
4923 if (!kqlock2knoteuse(kq, kn, flags))
4924 panic("dropping knote found on queue\n");
4925
4926 /* call out to the filter to process with just a ref */
4927 result = knote_fops(kn)->f_process(kn, process_data, &kev);
4928 if (result) flags |= KNUSE_STEAL_DROP;
4929
4930 /*
4931 * convert our reference back to a lock. accept drop
4932 * responsibility from others if we've committed to
4933 * delivering event data.
4934 */
4935 if (!knoteuse2kqlock(kq, kn, flags)) {
4936 /* knote dropped */
4937 kn = NULL;
4938 }
4939 }
4940
4941 if (kn != NULL) {
4942 /*
4943 * Determine how to dispatch the knote for future event handling.
4944 * not-fired: just return (do not call out, leave deactivated).
4945 * One-shot: If dispatch2, enter deferred-delete mode (unless this
4946 * is the deferred delete event delivery itself). Otherwise,
4947 * drop it.
4948 * stolendrop: We took responsibility for someone else's drop attempt.
4949 * treat this just like one-shot and prepare to turn it back
4950 * into a deferred delete if required.
4951 * Dispatch: don't clear state, just mark it disabled.
4952 * Cleared: just leave it deactivated.
4953 * Others: re-activate as there may be more events to handle.
4954 * This will not wake up more handlers right now, but
4955 * at the completion of handling events it may trigger
4956 * more handler threads (TODO: optimize based on more than
4957 * just this one event being detected by the filter).
4958 */
4959
4960 if (result == 0)
4961 return (EJUSTRETURN);
4962
4963 if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) {
4964 if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
4965 /* defer dropping non-delete oneshot dispatch2 events */
4966 kn->kn_status |= KN_DEFERDELETE;
4967 knote_disable(kn);
4968
4969 /* if we took over another's drop clear those flags here */
4970 if (kn->kn_status & KN_STOLENDROP) {
4971 assert(kn->kn_status & KN_DROPPING);
4972 /*
4973 * the knote will be dropped when the
4974 * deferred deletion occurs
4975 */
4976 kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP);
4977 }
4978 } else if (kn->kn_status & KN_STOLENDROP) {
4979 /* We now own the drop of the knote. */
4980 assert(kn->kn_status & KN_DROPPING);
4981 knote_unsuppress(kn);
4982 kqunlock(kq);
4983 knote_drop(kn, p);
4984 kqlock(kq);
4985 } else if (kqlock2knotedrop(kq, kn)) {
4986 /* just EV_ONESHOT, _not_ DISPATCH2 */
4987 knote_drop(kn, p);
4988 kqlock(kq);
4989 }
4990 } else if (kn->kn_status & KN_DISPATCH) {
4991 /* disable all dispatch knotes */
4992 knote_disable(kn);
4993 } else if ((kev.flags & EV_CLEAR) == 0) {
4994 /* re-activate in case there are more events */
4995 knote_activate(kn);
4996 }
4997 }
4998
4999 /*
5000 * callback to handle each event as we find it.
5001 * If we have to detach and drop the knote, do
5002 * it while we have the kq unlocked.
5003 */
5004 if (result) {
5005 kqunlock(kq);
5006 error = (callback)(kq, &kev, callback_data);
5007 kqlock(kq);
5008 }
5009 return (error);
5010 }
5011
5012
5013 /*
5014 * Return 0 to indicate that processing should proceed,
5015 * -1 if there is nothing to process.
5016 *
5017 * Called with kqueue locked and returns the same way,
5018 * but may drop lock temporarily.
5019 */
5020 static int
5021 kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
5022 {
5023 struct kqrequest *kqr;
5024 thread_t self = current_thread();
5025 __assert_only struct uthread *ut = get_bsdthread_info(self);
5026
5027 assert(kqwq->kqwq_state & KQ_WORKQ);
5028 assert(qos_index < KQWQ_NQOS);
5029
5030 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
5031 flags, qos_index);
5032
5033 kqwq_req_lock(kqwq);
5034
5035 kqr = kqworkq_get_request(kqwq, qos_index);
5036
5037 /* manager skips buckets that haven't asked for its help */
5038 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5039
5040 /* If nothing for manager to do, just return */
5041 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
5042 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
5043 0, kqr->kqr_state);
5044 kqwq_req_unlock(kqwq);
5045 return -1;
5046 }
5047 /* bind manager thread from this time on */
5048 kqworkq_bind_thread_impl(kqwq, qos_index, self, flags);
5049
5050 } else {
5051 /* We should already be bound to this kqueue */
5052 assert(kqr->kqr_state & KQR_BOUND);
5053 assert(kqr->kqr_thread == self);
5054 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
5055 assert(ut->uu_kqueue_qos_index == qos_index);
5056 assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags);
5057 }
5058
5059 /*
5060 * we should have been requested to be here
5061 * and nobody else should still be processing
5062 */
5063 assert(kqr->kqr_state & KQR_WAKEUP);
5064 assert(kqr->kqr_state & KQR_THREQUESTED);
5065 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
5066
5067 /* reset wakeup trigger to catch new events after we start processing */
5068 kqr->kqr_state &= ~KQR_WAKEUP;
5069
5070 /* convert to processing mode */
5071 kqr->kqr_state |= KQR_PROCESSING;
5072
5073 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
5074 kqr_thread_id(kqr), kqr->kqr_state);
5075
5076 kqwq_req_unlock(kqwq);
5077 return 0;
5078 }
5079
5080 static inline bool
5081 kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
5082 {
5083 struct kqueue *kq = &kqwl->kqwl_kqueue;
5084
5085 kqlock_held(kq);
5086
5087 if (kq->kq_state & KQ_PROCESSING) {
5088 /*
5089 * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
5090 * never modified while KQ_PROCESSING is set, meaning that peeking at
5091 * its value is safe from this context.
5092 */
5093 return kqwl->kqwl_request.kqr_thread == current_thread();
5094 }
5095 return false;
5096 }
5097
5098 static void
5099 kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_override)
5100 {
5101 struct kqrequest *kqr = &kqwl->kqwl_request;
5102 struct knote *kn, *tmp;
5103
5104 kqlock_held(&kqwl->kqwl_kqueue);
5105
5106 TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
5107 /*
5108 * If a knote that can adjust QoS is disabled because of the automatic
5109 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
5110 * further overrides keep pushing.
5111 */
5112 if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
5113 (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
5114 (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
5115 /*
5116 * When called from unbind, clear the sync ipc override on the knote
5117 * for events which are delivered.
5118 */
5119 if (clear_ipc_override) {
5120 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
5121 }
5122 continue;
5123 }
5124 knote_unsuppress(kn);
5125 }
5126 }
5127
5128 static int
5129 kqworkloop_begin_processing(struct kqworkloop *kqwl,
5130 __assert_only unsigned int flags)
5131 {
5132 struct kqrequest *kqr = &kqwl->kqwl_request;
5133 struct kqueue *kq = &kqwl->kqwl_kqueue;
5134
5135 kqlock_held(kq);
5136
5137 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
5138 kqwl->kqwl_dynamicid, flags, 0);
5139
5140 kqwl_req_lock(kqwl);
5141
5142 /* nobody else should still be processing */
5143 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
5144 assert((kq->kq_state & KQ_PROCESSING) == 0);
5145
5146 kqr->kqr_state |= KQR_PROCESSING | KQR_R2K_NOTIF_ARMED;
5147 kq->kq_state |= KQ_PROCESSING;
5148
5149 kqwl_req_unlock(kqwl);
5150
5151 kqworkloop_acknowledge_events(kqwl, FALSE);
5152
5153 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
5154 kqwl->kqwl_dynamicid, flags, 0);
5155
5156 return 0;
5157 }
5158
5159 /*
5160 * Return 0 to indicate that processing should proceed,
5161 * -1 if there is nothing to process.
5162 *
5163 * Called with kqueue locked and returns the same way,
5164 * but may drop lock temporarily.
5165 * May block.
5166 */
5167 static int
5168 kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
5169 {
5170 struct kqtailq *suppressq;
5171
5172 kqlock_held(kq);
5173
5174 if (kq->kq_state & KQ_WORKQ) {
5175 return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags);
5176 } else if (kq->kq_state & KQ_WORKLOOP) {
5177 return kqworkloop_begin_processing((struct kqworkloop*)kq, flags);
5178 }
5179
5180 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
5181 VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
5182
5183 assert(qos_index == QOS_INDEX_KQFILE);
5184
5185 /* wait to become the exclusive processing thread */
5186 for (;;) {
5187 if (kq->kq_state & KQ_DRAIN) {
5188 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5189 VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
5190 return -1;
5191 }
5192
5193 if ((kq->kq_state & KQ_PROCESSING) == 0)
5194 break;
5195
5196 /* if someone else is processing the queue, wait */
5197 kq->kq_state |= KQ_PROCWAIT;
5198 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5199 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5200 CAST_EVENT64_T(suppressq),
5201 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
5202
5203 kqunlock(kq);
5204 thread_block(THREAD_CONTINUE_NULL);
5205 kqlock(kq);
5206 }
5207
5208 /* Nobody else processing */
5209
5210 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
5211 waitq_set_clear_preposts(&kq->kq_wqs);
5212 kq->kq_state &= ~KQ_WAKEUP;
5213
5214 /* anything left to process? */
5215 if (kqueue_queue_empty(kq, qos_index)) {
5216 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5217 VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
5218 return -1;
5219 }
5220
5221 /* convert to processing mode */
5222 kq->kq_state |= KQ_PROCESSING;
5223
5224 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5225 VM_KERNEL_UNSLIDE_OR_PERM(kq));
5226
5227 return 0;
5228 }
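
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * For plain kqueues, kqueue_begin_processing() above serializes
 * processors with the assert-wait-then-block idiom: mark that a waiter
 * exists, assert a wait, drop the lock, block, retake the lock, and
 * re-check in a loop. A condition-variable analogue of the same shape
 * (all names here are hypothetical):
 */
#if 0	/* example only, never compiled */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cv   = PTHREAD_COND_INITIALIZER;
static bool processing;	/* analogue of KQ_PROCESSING */

static void
begin_processing(void)
{
	pthread_mutex_lock(&q_lock);
	while (processing) {
		/* analogue of KQ_PROCWAIT + waitq_assert_wait64() + thread_block() */
		pthread_cond_wait(&q_cv, &q_lock);
	}
	processing = true;	/* we are now the exclusive processor */
	pthread_mutex_unlock(&q_lock);
}

static void
end_processing(void)
{
	pthread_mutex_lock(&q_lock);
	processing = false;
	pthread_cond_broadcast(&q_cv);	/* analogue of waitq_wakeup64_all() */
	pthread_mutex_unlock(&q_lock);
}
#endif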
5229
5230 /*
5231 * kqworkq_end_processing - Complete the processing of a workq kqueue
5232 *
5233 * We may have to request new threads.
5234 * This can happen when there are no waiting processing threads and:
5235 * - there were active events we never got to (count > 0)
5236 * - we pended waitq hook callouts during processing
5237 * - we pended wakeups while processing (or unsuppressing)
5238 *
5239 * Called with kqueue lock held.
5240 */
5241 static void
5242 kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
5243 {
5244 #pragma unused(flags)
5245
5246 struct kqueue *kq = &kqwq->kqwq_kqueue;
5247 struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5248
5249 thread_t self = current_thread();
5250 struct uthread *ut = get_bsdthread_info(self);
5251 struct knote *kn;
5252 struct kqrequest *kqr;
5253 thread_t thread;
5254
5255 assert(kqwq->kqwq_state & KQ_WORKQ);
5256 assert(qos_index < KQWQ_NQOS);
5257
5258 /* Are we really bound to this kqueue? */
5259 if (ut->uu_kqueue_bound != kq) {
5260 assert(ut->uu_kqueue_bound == kq);
5261 return;
5262 }
5263
5264 kqr = kqworkq_get_request(kqwq, qos_index);
5265
5266 kqwq_req_lock(kqwq);
5267
5268 /* Do we claim to be manager? */
5269 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5270
5271 /* bail if not bound that way */
5272 if (ut->uu_kqueue_qos_index != KQWQ_QOS_MANAGER ||
5273 (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0) {
5274 assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
5275 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
5276 kqwq_req_unlock(kqwq);
5277 return;
5278 }
5279
5280 /* bail if this request wasn't already getting manager help */
5281 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0 ||
5282 (kqr->kqr_state & KQR_PROCESSING) == 0) {
5283 kqwq_req_unlock(kqwq);
5284 return;
5285 }
5286 } else {
5287 if (ut->uu_kqueue_qos_index != qos_index ||
5288 (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER)) {
5289 assert(ut->uu_kqueue_qos_index == qos_index);
5290 assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0);
5291 kqwq_req_unlock(kqwq);
5292 return;
5293 }
5294 }
5295
5296 assert(kqr->kqr_state & KQR_BOUND);
5297 thread = kqr->kqr_thread;
5298 assert(thread == self);
5299
5300 assert(kqr->kqr_state & KQR_PROCESSING);
5301
5302 /* If we didn't drain the whole queue, re-mark a wakeup being needed */
5303 if (!kqueue_queue_empty(kq, qos_index))
5304 kqr->kqr_state |= KQR_WAKEUP;
5305
5306 kqwq_req_unlock(kqwq);
5307
5308 /*
5309 * Return suppressed knotes to their original state.
5310 * For workq kqueues, suppressed ones that are still
5311 * truly active (not just forced into the queue) will
5312 * set flags we check below to see if anything got
5313 * woken up.
5314 */
5315 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
5316 assert(kn->kn_status & KN_SUPPRESSED);
5317 knote_unsuppress(kn);
5318 }
5319
5320 kqwq_req_lock(kqwq);
5321
5322 /* Indicate that we are done processing this request */
5323 kqr->kqr_state &= ~KQR_PROCESSING;
5324
5325 /*
5326 * Drop our association with this one request and its
5327 * override on us.
5328 */
5329 kqworkq_unbind_thread(kqwq, qos_index, thread, flags);
5330
5331 /*
5332 * request a new thread if we didn't process the whole
5333 * queue or real events have happened (not just putting
5334 * stay-active events back).
5335 */
5336 if (kqr->kqr_state & KQR_WAKEUP) {
5337 if (kqueue_queue_empty(kq, qos_index)) {
5338 kqr->kqr_state &= ~KQR_WAKEUP;
5339 } else {
5340 kqworkq_request_thread(kqwq, qos_index);
5341 }
5342 }
5343 kqwq_req_unlock(kqwq);
5344 }
5345
5346 static void
5347 kqworkloop_end_processing(struct kqworkloop *kqwl, int nevents,
5348 unsigned int flags)
5349 {
5350 struct kqrequest *kqr = &kqwl->kqwl_request;
5351 struct kqueue *kq = &kqwl->kqwl_kqueue;
5352
5353 kqlock_held(kq);
5354
5355 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
5356 kqwl->kqwl_dynamicid, flags, 0);
5357
5358 if ((kq->kq_state & KQ_NO_WQ_THREAD) && nevents == 0 &&
5359 (flags & KEVENT_FLAG_IMMEDIATE) == 0) {
5360 /*
5361 * <rdar://problem/31634014> We may soon block, but have returned no
5362 * kevents that need to be kept suppressed for overriding purposes.
5363 *
5364 * It is hence safe to acknowledge events and unsuppress everything, so
5365 * that if we block we can observe all events firing.
5366 */
5367 kqworkloop_acknowledge_events(kqwl, TRUE);
5368 }
5369
5370 kqwl_req_lock(kqwl);
5371
5372 assert(kqr->kqr_state & KQR_PROCESSING);
5373 assert(kq->kq_state & KQ_PROCESSING);
5374
5375 kq->kq_state &= ~KQ_PROCESSING;
5376 kqr->kqr_state &= ~KQR_PROCESSING;
5377 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
5378
5379 kqwl_req_unlock(kqwl);
5380
5381 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
5382 kqwl->kqwl_dynamicid, flags, 0);
5383 }
5384
5385 /*
5386 * Called with kqueue lock held.
5387 */
5388 static void
5389 kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index,
5390 int nevents, unsigned int flags)
5391 {
5392 struct knote *kn;
5393 struct kqtailq *suppressq;
5394 int procwait;
5395
5396 kqlock_held(kq);
5397
5398 assert((kq->kq_state & KQ_WORKQ) == 0);
5399
5400 if (kq->kq_state & KQ_WORKLOOP) {
5401 return kqworkloop_end_processing((struct kqworkloop *)kq, nevents, flags);
5402 }
5403
5404 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
5405 VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
5406
5407 assert(qos_index == QOS_INDEX_KQFILE);
5408
5409 /*
5410 * Return suppressed knotes to their original state.
5411 */
5412 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5413 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
5414 assert(kn->kn_status & KN_SUPPRESSED);
5415 knote_unsuppress(kn);
5416 }
5417
5418 procwait = (kq->kq_state & KQ_PROCWAIT);
5419 kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
5420
5421 if (procwait) {
5422 /* first wake up any thread already waiting to process */
5423 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
5424 CAST_EVENT64_T(suppressq),
5425 THREAD_AWAKENED,
5426 WAITQ_ALL_PRIORITIES);
5427 }
5428 }
5429
5430 /*
5431 * kqwq_internal_bind - bind thread to processing workq kqueue
5432 *
5433 * Determines if the provided thread will be responsible for
5434 * servicing the particular QoS class index specified in the
5435 * parameters. Once the binding is done, any overrides that may
5436 * be associated with the cooresponding events can be applied.
5437 *
5438 * This should be called as soon as the thread identity is known,
5439 * preferably while still at high priority during creation.
5440 *
5441 * - caller holds a reference on the process (and workq kq)
5442 * - the thread MUST call kevent_qos_internal after being bound
5443 * or the bucket of events may never be delivered.
5444 * - Nothing locked
5445 * (unless this is a synchronous bind, then the request is locked)
5446 */
5447 static int
5448 kqworkq_internal_bind(
5449 struct proc *p,
5450 kq_index_t qos_index,
5451 thread_t thread,
5452 unsigned int flags)
5453 {
5454 struct kqueue *kq;
5455 struct kqworkq *kqwq;
5456 struct kqrequest *kqr;
5457 struct uthread *ut = get_bsdthread_info(thread);
5458
5459 /* If no process workq, can't be our thread. */
5460 kq = p->p_fd->fd_wqkqueue;
5461
5462 if (kq == NULL)
5463 return 0;
5464
5465 assert(kq->kq_state & KQ_WORKQ);
5466 kqwq = (struct kqworkq *)kq;
5467
5468 /*
5469 * No need to bind the manager thread to any specific
5470 * bucket, but still claim the thread.
5471 */
5472 if (qos_index == KQWQ_QOS_MANAGER) {
5473 assert(ut->uu_kqueue_bound == NULL);
5474 assert(flags & KEVENT_FLAG_WORKQ_MANAGER);
5475 ut->uu_kqueue_bound = kq;
5476 ut->uu_kqueue_qos_index = qos_index;
5477 ut->uu_kqueue_flags = flags;
5478
5479 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
5480 thread_tid(thread), flags, qos_index);
5481
5482 return 1;
5483 }
5484
5485 /*
5486 * If this is a synchronous bind callback, the request
5487 * lock is already held, so just do the bind.
5488 */
5489 if (flags & KEVENT_FLAG_SYNCHRONOUS_BIND) {
5490 kqwq_req_held(kqwq);
5491 /* strip out the synchronous bind flag */
5492 flags &= ~KEVENT_FLAG_SYNCHRONOUS_BIND;
5493 kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
5494 return 1;
5495 }
5496
5497 /*
5498 * check the request that corresponds to our qos_index
5499 * to see if there is an outstanding request.
5500 */
5501 kqr = kqworkq_get_request(kqwq, qos_index);
5502 assert(kqr->kqr_qos_index == qos_index);
5503 kqwq_req_lock(kqwq);
5504
5505 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
5506 thread_tid(thread), flags, qos_index, kqr->kqr_state);
5507
5508 if ((kqr->kqr_state & KQR_THREQUESTED) &&
5509 (kqr->kqr_state & KQR_PROCESSING) == 0) {
5510
5511 if ((kqr->kqr_state & KQR_BOUND) &&
5512 thread == kqr->kqr_thread) {
5513 /* duplicate bind - claim the thread */
5514 assert(ut->uu_kqueue_bound == kq);
5515 assert(ut->uu_kqueue_qos_index == qos_index);
5516 kqwq_req_unlock(kqwq);
5517 return 1;
5518 }
5519 if ((kqr->kqr_state & (KQR_BOUND | KQWQ_THMANAGER)) == 0) {
5520 /* ours to bind to */
5521 kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
5522 kqwq_req_unlock(kqwq);
5523 return 1;
5524 }
5525 }
5526 kqwq_req_unlock(kqwq);
5527 return 0;
5528 }
5529
5530 static void
5531 kqworkloop_bind_thread_impl(struct kqworkloop *kqwl,
5532 thread_t thread,
5533 __assert_only unsigned int flags)
5534 {
5535 assert(flags & KEVENT_FLAG_WORKLOOP);
5536
5537 /* the request object must be locked */
5538 kqwl_req_held(kqwl);
5539
5540 struct kqrequest *kqr = &kqwl->kqwl_request;
5541 struct uthread *ut = get_bsdthread_info(thread);
5542 boolean_t ipc_override_is_sync;
5543 kq_index_t qos_index = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
5544
5545 /* nobody else bound so finally bind (as a workloop) */
5546 assert(kqr->kqr_state & KQR_THREQUESTED);
5547 assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == 0);
5548 assert(thread != kqwl->kqwl_owner);
5549
5550 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND),
5551 kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
5552 qos_index,
5553 (uintptr_t)(((uintptr_t)kqr->kqr_override_index << 16) |
5554 (((uintptr_t)kqr->kqr_state) << 8) |
5555 ((uintptr_t)ipc_override_is_sync)));
5556
5557 kqr->kqr_state |= KQR_BOUND | KQR_R2K_NOTIF_ARMED;
5558 kqr->kqr_thread = thread;
5559
5560 /* bind the workloop to the uthread */
5561 ut->uu_kqueue_bound = (struct kqueue *)kqwl;
5562 ut->uu_kqueue_flags = flags;
5563 ut->uu_kqueue_qos_index = qos_index;
5564 assert(ut->uu_kqueue_override_is_sync == 0);
5565 ut->uu_kqueue_override_is_sync = ipc_override_is_sync;
5566 if (qos_index) {
5567 thread_add_ipc_override(thread, qos_index);
5568 }
5569 if (ipc_override_is_sync) {
5570 thread_add_sync_ipc_override(thread);
5571 }
5572 }
5573
5574 /*
5575 * workloop_fulfill_threadreq - bind thread to processing workloop
5576 *
5577 * The provided thread will be responsible for delivering events
5578 * associated with the given kqrequest. Bind it and get ready for
5579 * the thread to eventually arrive.
5580 *
5581 * If WORKLOOP_FULFILL_THREADREQ_SYNC is specified, the call is being
5582 * made within the context of the pthread_functions->workq_threadreq
5583 * callout. In this case, the request structure is already locked.
5584 */
5585 int
5586 workloop_fulfill_threadreq(struct proc *p,
5587 workq_threadreq_t req,
5588 thread_t thread,
5589 int flags)
5590 {
5591 int sync = (flags & WORKLOOP_FULFILL_THREADREQ_SYNC);
5592 int cancel = (flags & WORKLOOP_FULFILL_THREADREQ_CANCEL);
5593 struct kqrequest *kqr;
5594 struct kqworkloop *kqwl;
5595
5596 kqwl = (struct kqworkloop *)((uintptr_t)req -
5597 offsetof(struct kqworkloop, kqwl_request) -
5598 offsetof(struct kqrequest, kqr_req));
5599 kqr = &kqwl->kqwl_request;
5600
5601 /* sanity-check that we're looking at a valid workloop */
5602 if (kqwl->kqwl_p != p ||
5603 (kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
5604 assert(kqwl->kqwl_p == p);
5605 assert(kqwl->kqwl_state & KQ_WORKLOOP);
5606 return EINVAL;
5607 }
5608
5609 if (!sync)
5610 kqwl_req_lock(kqwl);
5611
5612 /* Should be a pending request */
5613 if ((kqr->kqr_state & KQR_BOUND) ||
5614 (kqr->kqr_state & KQR_THREQUESTED) == 0) {
5615
5616 assert((kqr->kqr_state & KQR_BOUND) == 0);
5617 assert(kqr->kqr_state & KQR_THREQUESTED);
5618 if (!sync)
5619 kqwl_req_unlock(kqwl);
5620 return EINPROGRESS;
5621 }
5622
5623 assert((kqr->kqr_state & KQR_DRAIN) == 0);
5624
5625 /*
5626 * Is this a cancel indication from pthread?
5627 * If so, we must be exiting/exec'ing. Forget
5628 * our pending request.
5629 */
5630 if (cancel) {
5631 kqr->kqr_state &= ~KQR_THREQUESTED;
5632 kqr->kqr_state |= KQR_DRAIN;
5633 } else {
5634 /* otherwise, do the actual bind */
5635 kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
5636 }
5637
5638 if (!sync)
5639 kqwl_req_unlock(kqwl);
5640
5641 if (cancel)
5642 kqueue_release_last(p, &kqwl->kqwl_kqueue); /* may dealloc kq */
5643
5644 return 0;
5645 }
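
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * workloop_fulfill_threadreq() above recovers the enclosing kqworkloop
 * from the embedded request pointer by subtracting the offsets of the
 * nested members -- the usual "container of" idiom. A generic form of
 * the same arithmetic (container_of here is a local illustrative macro,
 * not one defined in this file):
 */
#if 0	/* example only, never compiled */
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inner { int value; };
struct outer { int tag; struct inner in; };

static struct outer *
outer_from_inner(struct inner *ip)
{
	/* same arithmetic as the kqwl recovery above, with one level of nesting */
	return container_of(ip, struct outer, in);
}
#endif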
5646
5647
5648 /*
5649 * kevent_qos_internal_bind - bind thread to processing kqueue
5650 *
5651 * Indicates that the provided thread will be responsible for
5652 * servicing the particular QoS class index specified in the
5653 * parameters. Once the binding is done, any overrides that may
5654 * be associated with the corresponding events can be applied.
5655 *
5656 * This should be called as soon as the thread identity is known,
5657 * preferably while still at high priority during creation.
5658 *
5659 * - caller holds a reference on the kqueue.
5660 * - the thread MUST call kevent_qos_internal after being bound
5661 * or the bucket of events may never be delivered.
5662 * - Nothing locked (may take mutex or block).
5663 */
5664
5665 int
5666 kevent_qos_internal_bind(
5667 struct proc *p,
5668 int qos_class,
5669 thread_t thread,
5670 unsigned int flags)
5671 {
5672 kq_index_t qos_index;
5673
5674 assert(flags & KEVENT_FLAG_WORKQ);
5675
5676 if (thread == THREAD_NULL || (flags & KEVENT_FLAG_WORKQ) == 0) {
5677 return EINVAL;
5678 }
5679
5680 /* get the qos index we're going to service */
5681 qos_index = qos_index_for_servicer(qos_class, thread, flags);
5682
5683 if (kqworkq_internal_bind(p, qos_index, thread, flags))
5684 return 0;
5685
5686 return EINPROGRESS;
5687 }
5688
5689
5690 static void
5691 kqworkloop_internal_unbind(
5692 struct proc *p,
5693 thread_t thread,
5694 unsigned int flags)
5695 {
5696 struct kqueue *kq;
5697 struct kqworkloop *kqwl;
5698 struct uthread *ut = get_bsdthread_info(thread);
5699
5700 assert(ut->uu_kqueue_bound != NULL);
5701 kq = ut->uu_kqueue_bound;
5702 assert(kq->kq_state & KQ_WORKLOOP);
5703 kqwl = (struct kqworkloop *)kq;
5704
5705 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND),
5706 kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
5707 flags, 0);
5708
5709 if (!(kq->kq_state & KQ_NO_WQ_THREAD)) {
5710 assert(is_workqueue_thread(thread));
5711
5712 kqlock(kq);
5713 kqworkloop_unbind_thread(kqwl, thread, flags);
5714 kqunlock(kq);
5715
5716 /* If last reference, dealloc the workloop kq */
5717 kqueue_release_last(p, kq);
5718 } else {
5719 assert(!is_workqueue_thread(thread));
5720 kevent_servicer_detach_thread(p, kqwl->kqwl_dynamicid, thread, flags, kq);
5721 }
5722 }
5723
5724 static void
5725 kqworkq_internal_unbind(
5726 struct proc *p,
5727 kq_index_t qos_index,
5728 thread_t thread,
5729 unsigned int flags)
5730 {
5731 struct kqueue *kq;
5732 struct kqworkq *kqwq;
5733 struct uthread *ut;
5734 kq_index_t end_index;
5735
5736 assert(thread == current_thread());
5737 ut = get_bsdthread_info(thread);
5738
5739 kq = p->p_fd->fd_wqkqueue;
5740 assert(kq->kq_state & KQ_WORKQ);
5741 assert(ut->uu_kqueue_bound == kq);
5742
5743 kqwq = (struct kqworkq *)kq;
5744
5745 /* end servicing any requests we might own */
5746 end_index = (qos_index == KQWQ_QOS_MANAGER) ?
5747 0 : qos_index;
5748 kqlock(kq);
5749
5750 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND),
5751 (uintptr_t)thread_tid(thread), flags, qos_index);
5752
5753 do {
5754 kqworkq_end_processing(kqwq, qos_index, flags);
5755 } while (qos_index-- > end_index);
5756
5757 ut->uu_kqueue_bound = NULL;
5758 ut->uu_kqueue_qos_index = 0;
5759 ut->uu_kqueue_flags = 0;
5760
5761 kqunlock(kq);
5762 }
5763
5764 /*
5765 * kevent_qos_internal_unbind - unbind thread from processing kqueue
5766 *
5767 * End processing the per-QoS bucket of events and allow other threads
5768 * to be requested for future servicing.
5769 *
5770 * caller holds a reference on the kqueue.
5771 * thread is the current thread.
5772 */
5773
5774 int
5775 kevent_qos_internal_unbind(
5776 struct proc *p,
5777 int qos_class,
5778 thread_t thread,
5779 unsigned int flags)
5780 {
5781 #pragma unused(qos_class)
5782
5783 struct uthread *ut;
5784 struct kqueue *kq;
5785 unsigned int bound_flags;
5786 bool check_flags;
5787
5788 ut = get_bsdthread_info(thread);
5789 if (ut->uu_kqueue_bound == NULL) {
5790 /* early out if we are already unbound */
5791 assert(ut->uu_kqueue_flags == 0);
5792 assert(ut->uu_kqueue_qos_index == 0);
5793 assert(ut->uu_kqueue_override_is_sync == 0);
5794 return EALREADY;
5795 }
5796
5797 assert(flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP));
5798 assert(thread == current_thread());
5799
5800 check_flags = flags & KEVENT_FLAG_UNBIND_CHECK_FLAGS;
5801
5802 /* Get the kqueue we started with */
5803 kq = ut->uu_kqueue_bound;
5804 assert(kq != NULL);
5805 assert(kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
5806
5807 /* get flags and QoS parameters we started with */
5808 bound_flags = ut->uu_kqueue_flags;
5809
5810 /* Unbind from the class of workq */
5811 if (kq->kq_state & KQ_WORKQ) {
5812 if (check_flags && !(flags & KEVENT_FLAG_WORKQ)) {
5813 return EINVAL;
5814 }
5815
5816 kqworkq_internal_unbind(p, ut->uu_kqueue_qos_index, thread, bound_flags);
5817 } else {
5818 if (check_flags && !(flags & KEVENT_FLAG_WORKLOOP)) {
5819 return EINVAL;
5820 }
5821
5822 kqworkloop_internal_unbind(p, thread, bound_flags);
5823 }
5824
5825 return 0;
5826 }
5827
5828 /*
5829 * kqueue_process - process the triggered events in a kqueue
5830 *
5831 * Walk the queued knotes and validate that they are
5832 * really still triggered events by calling the filter
5833 * routines (if necessary). Hold a use reference on
5834 * the knote to avoid it being detached. For each event
5835 * that is still considered triggered, invoke the
5836 * callback routine provided.
5837 *
5838 * caller holds a reference on the kqueue.
5839 * kqueue locked on entry and exit - but may be dropped
5840 * kqueue list locked (held for duration of call)
5841 */
5842
5843 static int
5844 kqueue_process(struct kqueue *kq,
5845 kevent_callback_t callback,
5846 void *callback_data,
5847 struct filt_process_s *process_data,
5848 int *countp,
5849 struct proc *p)
5850 {
5851 unsigned int flags = process_data ? process_data->fp_flags : 0;
5852 struct uthread *ut = get_bsdthread_info(current_thread());
5853 kq_index_t start_index, end_index, i;
5854 struct knote *kn;
5855 int nevents = 0;
5856 int error = 0;
5857
5858 /*
5859 * Based on the mode of the kqueue and the bound QoS of the servicer,
5860 * determine the range of thread requests that need checking
5861 */
5862 if (kq->kq_state & KQ_WORKQ) {
5863 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5864 start_index = KQWQ_QOS_MANAGER;
5865 } else if (ut->uu_kqueue_bound != kq) {
5866 return EJUSTRETURN;
5867 } else {
5868 start_index = ut->uu_kqueue_qos_index;
5869 }
5870
5871 /* manager services every request in a workq kqueue */
5872 assert(start_index > 0 && start_index <= KQWQ_QOS_MANAGER);
5873 end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index;
5874
5875 } else if (kq->kq_state & KQ_WORKLOOP) {
5876 if (ut->uu_kqueue_bound != kq)
5877 return EJUSTRETURN;
5878
5879 /*
5880 * Single request servicing:
5881 * we want to deliver all events, regardless of the QoS
5882 */
5883 start_index = end_index = THREAD_QOS_UNSPECIFIED;
5884 } else {
5885 start_index = end_index = QOS_INDEX_KQFILE;
5886 }
5887
5888 i = start_index;
5889
5890 do {
5891 if (kqueue_begin_processing(kq, i, flags) == -1) {
5892 *countp = 0;
5893 /* Nothing to process */
5894 continue;
5895 }
5896
5897 /*
5898 * loop through the enqueued knotes associated with this request,
5899 * processing each one. Each request may have several queues
5900 * of knotes to process (depending on the type of kqueue) so we
5901 * have to loop through all the queues as long as we have additional
5902 * space.
5903 */
5904 error = 0;
5905
5906 struct kqtailq *base_queue = kqueue_get_base_queue(kq, i);
5907 struct kqtailq *queue = kqueue_get_high_queue(kq, i);
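/*
 * 'queue' starts at the highest queue for this request and the loop
 * condition below walks it down to 'base_queue', so knotes carrying a
 * higher override QoS in the same band are handled first.
 */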
5908 do {
5909 while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
5910 error = knote_process(kn, callback, callback_data, process_data, p);
5911 if (error == EJUSTRETURN) {
5912 error = 0;
5913 } else {
5914 nevents++;
5915 }
5916 /* error is EWOULDBLOCK when the out event array is full */
5917 }
5918 } while (error == 0 && queue-- > base_queue);
5919
5920 if ((kq->kq_state & KQ_WORKQ) == 0) {
5921 kqueue_end_processing(kq, i, nevents, flags);
5922 }
5923
5924 if (error == EWOULDBLOCK) {
5925 /* break out if no more space for additional events */
5926 error = 0;
5927 break;
5928 }
5929 } while (i-- > end_index);
5930
5931 *countp = nevents;
5932 return (error);
5933 }
5934
5935 static void
5936 kqueue_scan_continue(void *data, wait_result_t wait_result)
5937 {
5938 thread_t self = current_thread();
5939 uthread_t ut = (uthread_t)get_bsdthread_info(self);
5940 struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
5941 struct kqueue *kq = (struct kqueue *)data;
5942 struct filt_process_s *process_data = cont_args->process_data;
5943 int error;
5944 int count;
5945
5946 /* convert the (previous) wait_result to a proper error */
5947 switch (wait_result) {
5948 case THREAD_AWAKENED: {
5949 kqlock(kq);
5950 retry:
5951 error = kqueue_process(kq, cont_args->call, cont_args->data,
5952 process_data, &count, current_proc());
5953 if (error == 0 && count == 0) {
5954 if (kq->kq_state & KQ_DRAIN) {
5955 kqunlock(kq);
5956 goto drain;
5957 }
5958
5959 if (kq->kq_state & KQ_WAKEUP)
5960 goto retry;
5961
5962 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5963 KQ_EVENT, THREAD_ABORTSAFE,
5964 cont_args->deadline);
5965 kq->kq_state |= KQ_SLEEP;
5966 kqunlock(kq);
5967 thread_block_parameter(kqueue_scan_continue, kq);
5968 /* NOTREACHED */
5969 }
5970 kqunlock(kq);
5971 } break;
5972 case THREAD_TIMED_OUT:
5973 error = EWOULDBLOCK;
5974 break;
5975 case THREAD_INTERRUPTED:
5976 error = EINTR;
5977 break;
5978 case THREAD_RESTART:
5979 drain:
5980 error = EBADF;
5981 break;
5982 default:
5983 panic("%s: - invalid wait_result (%d)", __func__,
5984 wait_result);
5985 error = 0;
5986 }
5987
5988 /* call the continuation with the results */
5989 assert(cont_args->cont != NULL);
5990 (cont_args->cont)(kq, cont_args->data, error);
5991 }
5992
5993
5994 /*
5995 * kqueue_scan - scan and wait for events in a kqueue
5996 *
5997 * Process the triggered events in a kqueue.
5998 *
5999 * If there are no events triggered, arrange to
6000 * wait for them. If the caller provided a
6001 * continuation routine, the wait and the delivery
6002 * of results happen through that continuation.
6003 *
6004 * The callback routine must be valid.
6005 * The caller must hold a use-count reference on the kq.
6006 */
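
/*
 * Illustrative call shape only; the callback/continuation names and their
 * data below are hypothetical (the kevent paths supply their own):
 *
 *	struct timeval atv = { .tv_sec = 0, .tv_usec = 0 };	// wait forever
 *	error = kqueue_scan(kq, my_callback, my_continuation,
 *	    my_callback_data, &process_data, &atv, p);
 *
 * When a continuation is supplied and the scan has to block, results are
 * delivered through kqueue_scan_continue() and control never returns here.
 */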
6007
6008 int
6009 kqueue_scan(struct kqueue *kq,
6010 kevent_callback_t callback,
6011 kqueue_continue_t continuation,
6012 void *callback_data,
6013 struct filt_process_s *process_data,
6014 struct timeval *atvp,
6015 struct proc *p)
6016 {
6017 thread_continue_t cont = THREAD_CONTINUE_NULL;
6018 unsigned int flags;
6019 uint64_t deadline;
6020 int error;
6021 int first;
6022 int fd;
6023
6024 assert(callback != NULL);
6025
6026 /*
6027 * Determine which QoS index we are servicing
6028 */
6029 flags = (process_data) ? process_data->fp_flags : 0;
6030 fd = (process_data) ? process_data->fp_fd : -1;
6031
6032 first = 1;
6033 for (;;) {
6034 wait_result_t wait_result;
6035 int count;
6036
6037 /*
6038 * Make a pass through the kq to find events already
6039 * triggered.
6040 */
6041 kqlock(kq);
6042 error = kqueue_process(kq, callback, callback_data,
6043 process_data, &count, p);
6044 if (error || count)
6045 break; /* lock still held */
6046
6047 /* looks like we have to consider blocking */
6048 if (first) {
6049 first = 0;
6050 /* convert the timeout to a deadline once */
6051 if (atvp->tv_sec || atvp->tv_usec) {
6052 uint64_t now;
6053
6054 clock_get_uptime(&now);
6055 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
6056 atvp->tv_usec * (long)NSEC_PER_USEC,
6057 &deadline);
6058 if (now >= deadline) {
6059 /* non-blocking call */
6060 error = EWOULDBLOCK;
6061 break; /* lock still held */
6062 }
6063 deadline -= now;
6064 clock_absolutetime_interval_to_deadline(deadline, &deadline);
6065 } else {
6066 deadline = 0; /* block forever */
6067 }
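/*
 * e.g. { .tv_sec = 1, .tv_usec = 500000 } yields a deadline 1.5s from
 * now, expressed in mach absolute time units; a zero timeval leaves
 * deadline at 0, meaning wait with no timeout.
 */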
6068
6069 if (continuation) {
6070 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
6071 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
6072
6073 cont_args->call = callback;
6074 cont_args->cont = continuation;
6075 cont_args->deadline = deadline;
6076 cont_args->data = callback_data;
6077 cont_args->process_data = process_data;
6078 cont = kqueue_scan_continue;
6079 }
6080 }
6081
6082 if (kq->kq_state & KQ_DRAIN) {
6083 kqunlock(kq);
6084 return EBADF;
6085 }
6086
6087 /* If awakened during processing, try again */
6088 if (kq->kq_state & KQ_WAKEUP) {
6089 kqunlock(kq);
6090 continue;
6091 }
6092
6093 /* go ahead and wait */
6094 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
6095 KQ_EVENT, THREAD_ABORTSAFE,
6096 TIMEOUT_URGENCY_USER_NORMAL,
6097 deadline, TIMEOUT_NO_LEEWAY);
6098 kq->kq_state |= KQ_SLEEP;
6099 kqunlock(kq);
6100 wait_result = thread_block_parameter(cont, kq);
6101 /* NOTREACHED if (continuation != NULL) */
6102
6103 switch (wait_result) {
6104 case THREAD_AWAKENED:
6105 continue;
6106 case THREAD_TIMED_OUT:
6107 return EWOULDBLOCK;
6108 case THREAD_INTERRUPTED:
6109 return EINTR;
6110 case THREAD_RESTART:
6111 return EBADF;
6112 default:
6113 panic("%s: - bad wait_result (%d)", __func__,
6114 wait_result);
6115 error = 0;
6116 }
6117 }
6118 kqunlock(kq);
6119 return (error);
6120 }
6121
6122
6123 /*
6124 * XXX
6125 * This could be expanded to call kqueue_scan, if desired.
6126 */
6127 /*ARGSUSED*/
6128 static int
6129 kqueue_read(__unused struct fileproc *fp,
6130 __unused struct uio *uio,
6131 __unused int flags,
6132 __unused vfs_context_t ctx)
6133 {
6134 return (ENXIO);
6135 }
6136
6137 /*ARGSUSED*/
6138 static int
6139 kqueue_write(__unused struct fileproc *fp,
6140 __unused struct uio *uio,
6141 __unused int flags,
6142 __unused vfs_context_t ctx)
6143 {
6144 return (ENXIO);
6145 }
6146
6147 /*ARGSUSED*/
6148 static int
6149 kqueue_ioctl(__unused struct fileproc *fp,
6150 __unused u_long com,
6151 __unused caddr_t data,
6152 __unused vfs_context_t ctx)
6153 {
6154 return (ENOTTY);
6155 }
6156
6157 /*ARGSUSED*/
6158 static int
6159 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
6160 __unused vfs_context_t ctx)
6161 {
6162 struct kqueue *kq = (struct kqueue *)fp->f_data;
6163 struct kqtailq *queue;
6164 struct kqtailq *suppressq;
6165 struct knote *kn;
6166 int retnum = 0;
6167
6168 if (which != FREAD)
6169 return (0);
6170
6171 kqlock(kq);
6172
6173 assert((kq->kq_state & KQ_WORKQ) == 0);
6174
6175 /*
6176 * If this is the first pass, link the wait queue associated with the
6177 * kqueue onto the wait queue set for the select(). Normally we
6178 * use selrecord() for this, but it uses the wait queue within the
6179 * selinfo structure and we need to use the main one for the kqueue to
6180 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
6181 * (The select() call will unlink them when it ends).
6182 */
6183 if (wq_link_id != NULL) {
6184 thread_t cur_act = current_thread();
6185 struct uthread * ut = get_bsdthread_info(cur_act);
6186
6187 kq->kq_state |= KQ_SEL;
6188 waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
6189 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
6190
6191 /* always consume the reserved link object */
6192 waitq_link_release(*(uint64_t *)wq_link_id);
6193 *(uint64_t *)wq_link_id = 0;
6194
6195 /*
6196 * selprocess() is expecting that we send it back the waitq
6197 * that was just added to the thread's waitq set. In order
6198 * to not change the selrecord() API (which is exported to
6199 * kexts), we pass this value back through the
6200 * void *wq_link_id pointer we were passed. We need to use
6201 * memcpy here because the pointer may not be properly aligned
6202 * on 32-bit systems.
6203 */
6204 void *wqptr = &kq->kq_wqs;
6205 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
6206 }
6207
6208 if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) {
6209 kqunlock(kq);
6210 return (0);
6211 }
6212
6213 queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE);
6214 if (!TAILQ_EMPTY(queue)) {
6215 /*
6216 * there is something queued - but it might be a
6217 * KN_STAYACTIVE knote, which may or may not have
6218 * any events pending. So we have to walk
6219 * the list of knotes to see, and peek at the
6220 * (non-vanished) stay-active ones to be really sure.
6221 */
6222 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
6223 if (kn->kn_status & KN_ACTIVE) {
6224 retnum = 1;
6225 goto out;
6226 }
6227 assert(kn->kn_status & KN_STAYACTIVE);
6228 knote_suppress(kn);
6229 }
6230
6231 /*
6232 * There were no regular events on the queue, so take
6233 * a deeper look at the stay-queued ones we suppressed.
6234 */
6235 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
6236 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
6237 unsigned peek = 1;
6238
6239 assert(!knoteuse_needs_boost(kn, NULL));
6240
6241 /* If didn't vanish while suppressed - peek at it */
6242 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
6243 peek = knote_fops(kn)->f_peek(kn);
6244
6245 /* if it dropped while getting lock - move on */
6246 if (!knoteuse2kqlock(kq, kn, KNUSE_NONE))
6247 continue;
6248 }
6249
6250 /* unsuppress it */
6251 knote_unsuppress(kn);
6252
6253 /* has data or it has to report a vanish */
6254 if (peek > 0) {
6255 retnum = 1;
6256 goto out;
6257 }
6258 }
6259 }
6260
6261 out:
6262 kqueue_end_processing(kq, QOS_INDEX_KQFILE, retnum, 0);
6263 kqunlock(kq);
6264 return (retnum);
6265 }
6266
6267 /*
6268 * kqueue_close -
6269 */
6270 /*ARGSUSED*/
6271 static int
6272 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
6273 {
6274 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
6275
6276 assert((kqf->kqf_state & KQ_WORKQ) == 0);
6277 kqueue_dealloc(&kqf->kqf_kqueue);
6278 fg->fg_data = NULL;
6279 return (0);
6280 }
6281
6282 /*ARGSUSED*/
6283 /*
6284 * The caller has taken a use-count reference on this kqueue and will donate it
6285 * to the kqueue we are being added to. This keeps the kqueue from closing until
6286 * that relationship is torn down.
6287 */
6288 static int
6289 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
6290 __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
6291 {
6292 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
6293 struct kqueue *kq = &kqf->kqf_kqueue;
6294 struct kqueue *parentkq = knote_get_kq(kn);
6295
6296 assert((kqf->kqf_state & KQ_WORKQ) == 0);
6297
6298 if (parentkq == kq ||
6299 kn->kn_filter != EVFILT_READ) {
6300 kn->kn_flags = EV_ERROR;
6301 kn->kn_data = EINVAL;
6302 return 0;
6303 }
6304
6305 /*
6306 * We have to avoid creating a cycle when nesting kqueues
6307 * inside another. Rather than trying to walk the whole
6308 * potential DAG of nested kqueues, we just use a simple
6309 * ceiling protocol. When a kqueue is inserted into another,
6310 * we check that the (future) parent is not already nested
6311 * into another kqueue at a lower level than the potential
6312 * child (because it could indicate a cycle). If that test
6313 * passes, we just mark the nesting levels accordingly.
6314 */
6315
6316 kqlock(parentkq);
6317 if (parentkq->kq_level > 0 &&
6318 parentkq->kq_level < kq->kq_level)
6319 {
6320 kqunlock(parentkq);
6321 kn->kn_flags = EV_ERROR;
6322 kn->kn_data = EINVAL;
6323 return 0;
6324 } else {
6325 /* set parent level appropriately */
6326 if (parentkq->kq_level == 0)
6327 parentkq->kq_level = 2;
6328 if (parentkq->kq_level < kq->kq_level + 1)
6329 parentkq->kq_level = kq->kq_level + 1;
6330 kqunlock(parentkq);
6331
6332 kn->kn_filtid = EVFILTID_KQREAD;
6333 kqlock(kq);
6334 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
6335 /* indicate nesting in child, if needed */
6336 if (kq->kq_level == 0)
6337 kq->kq_level = 1;
6338
6339 int count = kq->kq_count;
6340 kqunlock(kq);
6341 return (count > 0);
6342 }
6343 }
6344
6345 /*
6346 * kqueue_drain - called when kq is closed
6347 */
6348 /*ARGSUSED*/
6349 static int
6350 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
6351 {
6352 struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
6353
6354 assert((kq->kq_state & KQ_WORKQ) == 0);
6355
6356 kqlock(kq);
6357 kq->kq_state |= KQ_DRAIN;
6358 kqueue_interrupt(kq);
6359 kqunlock(kq);
6360 return (0);
6361 }
6362
6363 /*ARGSUSED*/
6364 int
6365 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
6366 {
6367 assert((kq->kq_state & KQ_WORKQ) == 0);
6368
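/*
 * st_size reports the number of pending events and st_blksize the size
 * of the kevent structure flavor this kqueue uses (QoS, 64-bit, or the
 * user32/user64 legacy layouts).
 */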
6369 kqlock(kq);
6370 if (isstat64 != 0) {
6371 struct stat64 *sb64 = (struct stat64 *)ub;
6372
6373 bzero((void *)sb64, sizeof(*sb64));
6374 sb64->st_size = kq->kq_count;
6375 if (kq->kq_state & KQ_KEV_QOS)
6376 sb64->st_blksize = sizeof(struct kevent_qos_s);
6377 else if (kq->kq_state & KQ_KEV64)
6378 sb64->st_blksize = sizeof(struct kevent64_s);
6379 else if (IS_64BIT_PROCESS(p))
6380 sb64->st_blksize = sizeof(struct user64_kevent);
6381 else
6382 sb64->st_blksize = sizeof(struct user32_kevent);
6383 sb64->st_mode = S_IFIFO;
6384 } else {
6385 struct stat *sb = (struct stat *)ub;
6386
6387 bzero((void *)sb, sizeof(*sb));
6388 sb->st_size = kq->kq_count;
6389 if (kq->kq_state & KQ_KEV_QOS)
6390 sb->st_blksize = sizeof(struct kevent_qos_s);
6391 else if (kq->kq_state & KQ_KEV64)
6392 sb->st_blksize = sizeof(struct kevent64_s);
6393 else if (IS_64BIT_PROCESS(p))
6394 sb->st_blksize = sizeof(struct user64_kevent);
6395 else
6396 sb->st_blksize = sizeof(struct user32_kevent);
6397 sb->st_mode = S_IFIFO;
6398 }
6399 kqunlock(kq);
6400 return (0);
6401 }
6402
6403 /*
6404 * Interact with the pthread kext to request a servicing there.
6405 * Eventually, this will request threads at specific QoS levels.
6406 * For now, it only requests a dispatch-manager-QoS thread, and
6407 * only one-at-a-time.
6408 *
6409 * - Caller holds the workq request lock
6410 *
6411 * - May be called with the kqueue's wait queue set locked,
6412 * so cannot do anything that could recurse on that.
6413 */
6414 static void
6415 kqworkq_request_thread(
6416 struct kqworkq *kqwq,
6417 kq_index_t qos_index)
6418 {
6419 struct kqrequest *kqr;
6420
6421 assert(kqwq->kqwq_state & KQ_WORKQ);
6422 assert(qos_index < KQWQ_NQOS);
6423
6424 kqr = kqworkq_get_request(kqwq, qos_index);
6425
6426 assert(kqr->kqr_state & KQR_WAKEUP);
6427
6428 /*
6429 * If we have already requested a thread, and it hasn't
6430 * started processing yet, there's no use hammering away
6431 * on the pthread kext.
6432 */
6433 if (kqr->kqr_state & KQR_THREQUESTED)
6434 return;
6435
6436 assert((kqr->kqr_state & KQR_BOUND) == 0);
6437
6438 /* request additional workq threads if appropriate */
6439 if (pthread_functions != NULL &&
6440 pthread_functions->workq_reqthreads != NULL) {
6441 unsigned int flags = KEVENT_FLAG_WORKQ;
6442 unsigned long priority;
6443 thread_t wqthread;
6444
6445 /* Compute the appropriate pthread priority */
6446 priority = qos_from_qos_index(qos_index);
6447
6448 #if 0
6449 /* JMM - for now remain compatible with old invocations */
6450 /* set the over-commit flag on the request if needed */
6451 if (kqr->kqr_state & KQR_THOVERCOMMIT)
6452 priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
6453 #endif /* 0 */
6454
6455 /* Compute a priority based on qos_index. */
6456 struct workq_reqthreads_req_s request = {
6457 .priority = priority,
6458 .count = 1
6459 };
6460
6461 /* mark that we are making a request */
6462 kqr->kqr_state |= KQR_THREQUESTED;
6463 if (qos_index == KQWQ_QOS_MANAGER)
6464 kqr->kqr_state |= KQWQ_THMANAGER;
6465
6466 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
6467 0, qos_index,
6468 (((uintptr_t)kqr->kqr_override_index << 8) |
6469 (uintptr_t)kqr->kqr_state));
6470 wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request);
6471
6472 /* We've been switched to the emergency/manager thread */
6473 if (wqthread == (thread_t)-1) {
6474 assert(qos_index != KQWQ_QOS_MANAGER);
6475 kqr->kqr_state |= KQWQ_THMANAGER;
6476 return;
6477 }
6478
6479 /*
6480 * bind the returned thread identity
6481 * This goes away when we switch to synchronous callback
6482 * binding from the pthread kext.
6483 */
6484 if (wqthread != NULL) {
6485 kqworkq_bind_thread_impl(kqwq, qos_index, wqthread, flags);
6486 }
6487 }
6488 }
6489
6490 /*
6491 * If we aren't already busy processing events [for this QoS],
6492 * request workq thread support as appropriate.
6493 *
6494 * TBD - for now, we don't segregate out processing by QoS.
6495 *
6496 * - May be called with the kqueue's wait queue set locked,
6497 * so cannot do anything that could recurse on that.
6498 */
6499 static void
6500 kqworkq_request_help(
6501 struct kqworkq *kqwq,
6502 kq_index_t qos_index)
6503 {
6504 struct kqrequest *kqr;
6505
6506 /* convert to thread qos value */
6507 assert(qos_index < KQWQ_NQOS);
6508
6509 kqwq_req_lock(kqwq);
6510 kqr = kqworkq_get_request(kqwq, qos_index);
6511
6512 if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
6513 /* Indicate that we needed help from this request */
6514 kqr->kqr_state |= KQR_WAKEUP;
6515
6516 /* Go assure a thread request has been made */
6517 kqworkq_request_thread(kqwq, qos_index);
6518 }
6519 kqwq_req_unlock(kqwq);
6520 }
6521
6522 static void
6523 kqworkloop_threadreq_impl(struct kqworkloop *kqwl, kq_index_t qos_index)
6524 {
6525 struct kqrequest *kqr = &kqwl->kqwl_request;
6526 unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
6527 int op, ret;
6528
6529 assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
6530
6531 /*
6532 * New-style thread request supported. Provide
6533 * the pthread kext a pointer to a workq_threadreq_s
6534 * structure for its use until a corresponding
6535 * workloop_fulfill_threadreq callback.
6536 */
6537 if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
6538 op = WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL;
6539 } else {
6540 op = WORKQ_THREADREQ_WORKLOOP;
6541 }
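/*
 * The _NO_THREAD_CALL variant is chosen when we are already running in
 * the context of the target process; if pthread cannot honor the request
 * immediately it returns EAGAIN and we redrive it from the AST below,
 * while ENOTSUP makes us retry with the plain WORKLOOP op.
 */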
6542 again:
6543 ret = (*pthread_functions->workq_threadreq)(kqwl->kqwl_p, &kqr->kqr_req,
6544 op, pri, 0);
6545 switch (ret) {
6546 case ENOTSUP:
6547 assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
6548 op = WORKQ_THREADREQ_WORKLOOP;
6549 goto again;
6550
6551 case ECANCELED:
6552 case EINVAL:
6553 /*
6554 * Process is shutting down or exec'ing.
6555 * All the kqueues are going to be cleaned up
6556 * soon. Forget we even asked for a thread -
6557 * and make sure we don't ask for more.
6558 */
6559 kqueue_release((struct kqueue *)kqwl, KQUEUE_CANT_BE_LAST_REF);
6560 kqr->kqr_state &= ~KQR_THREQUESTED;
6561 kqr->kqr_state |= KQR_DRAIN;
6562 break;
6563
6564 case EAGAIN:
6565 assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
6566 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
6567 break;
6568
6569 default:
6570 assert(ret == 0);
6571 }
6572 }
6573
6574 static void
6575 kqworkloop_threadreq_modify(struct kqworkloop *kqwl, kq_index_t qos_index)
6576 {
6577 struct kqrequest *kqr = &kqwl->kqwl_request;
6578 unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
6579 int ret, op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
6580
6581 assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
6582
6583 if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
6584 op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
6585 } else {
6586 op = WORKQ_THREADREQ_CHANGE_PRI;
6587 }
6588 again:
6589 ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
6590 &kqr->kqr_req, op, pri, 0);
6591 switch (ret) {
6592 case ENOTSUP:
6593 assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL);
6594 op = WORKQ_THREADREQ_CHANGE_PRI;
6595 goto again;
6596
6597 case EAGAIN:
6598 assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL);
6599 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
6600 break;
6601
6602 case ECANCELED:
6603 case EINVAL:
6604 case 0:
6605 break;
6606
6607 default:
6608 assert(ret == 0);
6609 }
6610 }
6611
6612 /*
6613 * Interact with the pthread kext to request a servicing thread.
6614 * This will request a single thread at the highest QoS level
6615 * for which there is work (whether that was the requested QoS
6616 * for an event or an override applied to a lower-QoS request).
6617 *
6618 * - Caller holds the workloop request lock
6619 *
6620 * - May be called with the kqueue's wait queue set locked,
6621 * so cannot do anything that could recurse on that.
6622 */
6623 static void
6624 kqworkloop_request_thread(struct kqworkloop *kqwl, kq_index_t qos_index)
6625 {
6626 struct kqrequest *kqr;
6627
6628 assert(kqwl->kqwl_state & KQ_WORKLOOP);
6629
6630 kqr = &kqwl->kqwl_request;
6631
6632 assert(kqwl->kqwl_owner == THREAD_NULL);
6633 assert((kqr->kqr_state & KQR_BOUND) == 0);
6634 assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
6635 assert(!(kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD));
6636
6637 /* If we're draining thread requests, just bail */
6638 if (kqr->kqr_state & KQR_DRAIN)
6639 return;
6640
6641 if (pthread_functions != NULL &&
6642 pthread_functions->workq_threadreq != NULL) {
6643 /*
6644 * set request state flags, etc... before calling pthread
6645 * This assures they are set before a possible synchronous
6646 * callback to workloop_fulfill_threadreq().
6647 */
6648 kqr->kqr_state |= KQR_THREQUESTED;
6649
6650 /* Add a thread request reference on the kqueue. */
6651 kqueue_retain((struct kqueue *)kqwl);
6652
6653 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
6654 kqwl->kqwl_dynamicid,
6655 0, qos_index, kqr->kqr_state);
6656 kqworkloop_threadreq_impl(kqwl, qos_index);
6657 } else {
6658 panic("kqworkloop_request_thread");
6659 return;
6660 }
6661 }
6662
6663 static void
6664 kqworkloop_update_sync_override_state(struct kqworkloop *kqwl, boolean_t sync_ipc_override)
6665 {
6666 struct kqrequest *kqr = &kqwl->kqwl_request;
6667 kqwl_req_lock(kqwl);
6668 kqr->kqr_has_sync_override = sync_ipc_override;
6669 kqwl_req_unlock(kqwl);
6670
6671 }
6672
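/*
 * kqworkloop_combined_qos - compute the effective QoS for this workloop
 *
 * Returns the max of kqr_qos_index, kqr_override_index and
 * kqr_dsync_waiters_qos. If there are suppressed knotes holding a sync IPC
 * override, or a sync override is explicitly set, the result is promoted to
 * THREAD_QOS_USER_INTERACTIVE and *ipc_override_is_sync is set.
 */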
6673 static inline kq_index_t
6674 kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *ipc_override_is_sync)
6675 {
6676 struct kqrequest *kqr = &kqwl->kqwl_request;
6677 kq_index_t override;
6678
6679 *ipc_override_is_sync = FALSE;
6680 override = MAX(MAX(kqr->kqr_qos_index, kqr->kqr_override_index),
6681 kqr->kqr_dsync_waiters_qos);
6682
6683 if (kqr->kqr_sync_suppress_count > 0 || kqr->kqr_has_sync_override) {
6684 *ipc_override_is_sync = TRUE;
6685 override = THREAD_QOS_USER_INTERACTIVE;
6686 }
6687 return override;
6688 }
6689
6690 static inline void
6691 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
6692 {
6693 struct kqrequest *kqr = &kqwl->kqwl_request;
6694
6695 kqwl_req_held(kqwl);
6696
6697 if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
6698 assert(kqr->kqr_state & KQR_BOUND);
6699 assert(kqr->kqr_thread);
6700
6701 kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
6702 act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
6703 }
6704 }
6705
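/*
 * kqworkloop_update_threads_qos - central QoS bookkeeping for a workloop
 *
 * Update the request's wakeup/override state according to 'op', recompute
 * the combined QoS, and apply the result: adjust the IPC overrides on the
 * owner and/or bound servicer, wake a statically bound thread, issue a new
 * thread request, or modify/cancel one already in flight.
 */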
6706 static void
6707 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
6708 {
6709 const uint8_t KQWL_STAYACTIVE_FIRED_BIT = (1 << 0);
6710
6711 struct kqrequest *kqr = &kqwl->kqwl_request;
6712 boolean_t old_ipc_override_is_sync = FALSE;
6713 kq_index_t old_qos = kqworkloop_combined_qos(kqwl, &old_ipc_override_is_sync);
6714 struct kqueue *kq = &kqwl->kqwl_kqueue;
6715 bool static_thread = (kq->kq_state & KQ_NO_WQ_THREAD);
6716 kq_index_t i;
6717
6718 /* must hold the kqr lock */
6719 kqwl_req_held(kqwl);
6720
6721 switch (op) {
6722 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
6723 if (qos == KQWL_BUCKET_STAYACTIVE) {
6724 /*
6725 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
6726 * a high watermark (kqr_stayactive_qos) of any stay active knote
6727 * that was ever registered with this workloop.
6728 *
6729 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
6730 * knote, we use this high-watermark as a wakeup-index, and also set
6731 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
6732 * there is at least one stay active knote fired until the next full
6733 * processing of this bucket.
6734 */
6735 kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
6736 qos = kqr->kqr_stayactive_qos;
6737 assert(qos);
6738 assert(!static_thread);
6739 }
6740 if (kqr->kqr_wakeup_indexes & (1 << qos)) {
6741 assert(kqr->kqr_state & KQR_WAKEUP);
6742 break;
6743 }
6744
6745 kqr->kqr_wakeup_indexes |= (1 << qos);
6746 kqr->kqr_state |= KQR_WAKEUP;
6747 kqworkloop_request_fire_r2k_notification(kqwl);
6748 goto recompute_async;
6749
6750 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
6751 assert(qos);
6752 if (kqr->kqr_stayactive_qos < qos) {
6753 kqr->kqr_stayactive_qos = qos;
6754 if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
6755 assert(kqr->kqr_state & KQR_WAKEUP);
6756 kqr->kqr_wakeup_indexes |= (1 << qos);
6757 goto recompute_async;
6758 }
6759 }
6760 break;
6761
6762 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
6763 kqlock_held(kq); // to look at kq_queues
6764 kqr->kqr_has_sync_override = FALSE;
6765 i = KQWL_BUCKET_STAYACTIVE;
6766 if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
6767 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6768 }
6769 if (!TAILQ_EMPTY(&kq->kq_queue[i]) &&
6770 (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
6771 /*
6772 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
6773 * knote may have fired, so we need to merge in kqr_stayactive_qos.
6774 *
6775 * Unlike other buckets, this one is never empty but could be idle.
6776 */
6777 kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
6778 kqr->kqr_wakeup_indexes |= (1 << kqr->kqr_stayactive_qos);
6779 } else {
6780 kqr->kqr_wakeup_indexes = 0;
6781 }
6782 for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
6783 if (!TAILQ_EMPTY(&kq->kq_queue[i])) {
6784 kqr->kqr_wakeup_indexes |= (1 << i);
6785 struct knote *kn = TAILQ_FIRST(&kqwl->kqwl_kqueue.kq_queue[i]);
6786 if (i == THREAD_QOS_USER_INTERACTIVE &&
6787 kn->kn_qos_override_is_sync) {
6788 kqr->kqr_has_sync_override = TRUE;
6789 }
6790 }
6791 }
6792 if (kqr->kqr_wakeup_indexes) {
6793 kqr->kqr_state |= KQR_WAKEUP;
6794 kqworkloop_request_fire_r2k_notification(kqwl);
6795 } else {
6796 kqr->kqr_state &= ~KQR_WAKEUP;
6797 }
6798 assert(qos == THREAD_QOS_UNSPECIFIED);
6799 goto recompute_async;
6800
6801 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
6802 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6803 assert(qos == THREAD_QOS_UNSPECIFIED);
6804 goto recompute_async;
6805
6806 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
6807 recompute_async:
6808 /*
6809 * When modifying the wakeup QoS or the async override QoS, we always
6810 * need to maintain our invariant that kqr_override_index is at least as
6811 * large as the highest QoS for which an event is fired.
6812 *
6813 * However, this override index can be larger when there is an overridden
6814 * suppressed knote pushing on the kqueue.
6815 */
6816 if (kqr->kqr_wakeup_indexes > (1 << qos)) {
6817 qos = fls(kqr->kqr_wakeup_indexes) - 1; /* fls is 1-based */
6818 }
6819 if (kqr->kqr_override_index < qos) {
6820 kqr->kqr_override_index = qos;
6821 }
6822 break;
6823
6824 case KQWL_UTQ_REDRIVE_EVENTS:
6825 break;
6826
6827 case KQWL_UTQ_SET_ASYNC_QOS:
6828 filt_wlheld(kqwl);
6829 kqr->kqr_qos_index = qos;
6830 break;
6831
6832 case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
6833 filt_wlheld(kqwl);
6834 kqr->kqr_dsync_waiters_qos = qos;
6835 break;
6836
6837 default:
6838 panic("unknown kqwl thread qos update operation: %d", op);
6839 }
6840
6841 boolean_t new_ipc_override_is_sync = FALSE;
6842 kq_index_t new_qos = kqworkloop_combined_qos(kqwl, &new_ipc_override_is_sync);
6843 thread_t kqwl_owner = kqwl->kqwl_owner;
6844 thread_t servicer = kqr->kqr_thread;
6845 __assert_only int ret;
6846
6847 /*
6848 * Apply the diffs to the owner if applicable
6849 */
6850 if (filt_wlowner_is_valid(kqwl_owner)) {
6851 #if 0
6852 /* JMM - need new trace hooks for owner overrides */
6853 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
6854 kqwl->kqwl_dynamicid,
6855 (kqr->kqr_state & KQR_BOUND) ? thread_tid(kqwl_owner) : 0,
6856 (kqr->kqr_qos_index << 8) | new_qos,
6857 (kqr->kqr_override_index << 8) | kqr->kqr_state);
6858 #endif
6859 if (new_qos == kqr->kqr_dsync_owner_qos) {
6860 // nothing to do
6861 } else if (kqr->kqr_dsync_owner_qos == THREAD_QOS_UNSPECIFIED) {
6862 thread_add_ipc_override(kqwl_owner, new_qos);
6863 } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
6864 thread_drop_ipc_override(kqwl_owner);
6865 } else /* kqr->kqr_dsync_owner_qos != new_qos */ {
6866 thread_update_ipc_override(kqwl_owner, new_qos);
6867 }
6868 kqr->kqr_dsync_owner_qos = new_qos;
6869
6870 if (new_ipc_override_is_sync &&
6871 !kqr->kqr_owner_override_is_sync) {
6872 thread_add_sync_ipc_override(kqwl_owner);
6873 } else if (!new_ipc_override_is_sync &&
6874 kqr->kqr_owner_override_is_sync) {
6875 thread_drop_sync_ipc_override(kqwl_owner);
6876 }
6877 kqr->kqr_owner_override_is_sync = new_ipc_override_is_sync;
6878 }
6879
6880 /*
6881 * apply the diffs to the servicer
6882 */
6883 if (static_thread) {
6884 /*
6885 * Statically bound thread
6886 *
6887 * These threads don't participate in QoS overrides today; just wake up
6888 * the thread blocked on this kqueue if a new event arrived.
6889 */
6890
6891 switch (op) {
6892 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
6893 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
6894 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
6895 break;
6896
6897 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
6898 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
6899 case KQWL_UTQ_REDRIVE_EVENTS:
6900 case KQWL_UTQ_SET_ASYNC_QOS:
6901 case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
6902 panic("should never be called");
6903 break;
6904 }
6905
6906 kqlock_held(kq);
6907
6908 if ((kqr->kqr_state & KQR_BOUND) && (kqr->kqr_state & KQR_WAKEUP)) {
6909 assert(servicer && !is_workqueue_thread(servicer));
6910 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
6911 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
6912 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
6913 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
6914 }
6915 }
6916 } else if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
6917 /*
6918 * No servicer, nor thread-request
6919 *
6920 * Make a new thread request, unless there is an owner (or the workloop
6921 * is suspended in userland) or if there is no asynchronous work in the
6922 * first place.
6923 */
6924
6925 if (kqwl_owner == THREAD_NULL && (kqr->kqr_state & KQR_WAKEUP)) {
6926 kqworkloop_request_thread(kqwl, new_qos);
6927 }
6928 } else if ((kqr->kqr_state & KQR_BOUND) == 0 &&
6929 (kqwl_owner || (kqr->kqr_state & KQR_WAKEUP) == 0)) {
6930 /*
6931 * No servicer, but a thread request in flight that we want to cancel
6932 *
6933 * We just got rid of the last knote of the kqueue, or noticed an owner,
6934 * while a thread request is still in flight; take it back.
6935 */
6936 ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
6937 &kqr->kqr_req, WORKQ_THREADREQ_CANCEL, 0, 0);
6938 if (ret == 0) {
6939 kqr->kqr_state &= ~KQR_THREQUESTED;
6940 kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
6941 }
6942 } else {
6943 boolean_t qos_changed = FALSE;
6944
6945 /*
6946 * Servicer or request is in flight
6947 *
6948 * Just apply the diff to the servicer or the thread request
6949 */
6950 if (kqr->kqr_state & KQR_BOUND) {
6951 servicer = kqr->kqr_thread;
6952 struct uthread *ut = get_bsdthread_info(servicer);
6953 if (ut->uu_kqueue_qos_index != new_qos) {
6954 if (ut->uu_kqueue_qos_index == THREAD_QOS_UNSPECIFIED) {
6955 thread_add_ipc_override(servicer, new_qos);
6956 } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
6957 thread_drop_ipc_override(servicer);
6958 } else /* ut->uu_kqueue_qos_index != new_qos */ {
6959 thread_update_ipc_override(servicer, new_qos);
6960 }
6961 ut->uu_kqueue_qos_index = new_qos;
6962 qos_changed = TRUE;
6963 }
6964
6965 if (new_ipc_override_is_sync != ut->uu_kqueue_override_is_sync) {
6966 if (new_ipc_override_is_sync &&
6967 !ut->uu_kqueue_override_is_sync) {
6968 thread_add_sync_ipc_override(servicer);
6969 } else if (!new_ipc_override_is_sync &&
6970 ut->uu_kqueue_override_is_sync) {
6971 thread_drop_sync_ipc_override(servicer);
6972 }
6973 ut->uu_kqueue_override_is_sync = new_ipc_override_is_sync;
6974 qos_changed = TRUE;
6975 }
6976 } else if (old_qos != new_qos) {
6977 assert(new_qos);
6978 kqworkloop_threadreq_modify(kqwl, new_qos);
6979 qos_changed = TRUE;
6980 }
6981 if (qos_changed) {
6982 servicer = kqr->kqr_thread;
6983 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
6984 kqwl->kqwl_dynamicid,
6985 (kqr->kqr_state & KQR_BOUND) ? thread_tid(servicer) : 0,
6986 (kqr->kqr_qos_index << 16) | (new_qos << 8) | new_ipc_override_is_sync,
6987 (kqr->kqr_override_index << 8) | kqr->kqr_state);
6988 }
6989 }
6990 }
6991
6992 static void
6993 kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
6994 {
6995 /* convert to thread qos value */
6996 assert(qos_index < KQWL_NBUCKETS);
6997
6998 kqwl_req_lock(kqwl);
6999 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
7000 kqwl_req_unlock(kqwl);
7001 }
7002
7003 /*
7004 * These arrays describe the low and high qindexes for a given qos_index.
7005 * The values come from the chart in <sys/eventvar.h> (must stay in sync).
7006 */
7007 static kq_index_t _kqwq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21};
7008 static kq_index_t _kqwq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21};
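/*
 * Worked example: a knote requested at qos_index 2 is queued on
 * kq_queue[6] when un-overridden, and on kq_queue[7..10] when overridden
 * to qos 3..6 respectively (see knote_get_queue_index() below); the
 * KQWQ_QOS_MANAGER bucket owns the single queue 21.
 */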
7009
7010 static struct kqtailq *
7011 kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index)
7012 {
7013 if (kq->kq_state & KQ_WORKQ) {
7014 assert(qos_index < KQWQ_NQOS);
7015 return &kq->kq_queue[_kqwq_base_index[qos_index]];
7016 } else if (kq->kq_state & KQ_WORKLOOP) {
7017 assert(qos_index < KQWL_NBUCKETS);
7018 return &kq->kq_queue[qos_index];
7019 } else {
7020 assert(qos_index == QOS_INDEX_KQFILE);
7021 return &kq->kq_queue[QOS_INDEX_KQFILE];
7022 }
7023 }
7024
7025 static struct kqtailq *
7026 kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index)
7027 {
7028 if (kq->kq_state & KQ_WORKQ) {
7029 assert(qos_index < KQWQ_NQOS);
7030 return &kq->kq_queue[_kqwq_high_index[qos_index]];
7031 } else if (kq->kq_state & KQ_WORKLOOP) {
7032 assert(qos_index < KQWL_NBUCKETS);
7033 return &kq->kq_queue[KQWL_BUCKET_STAYACTIVE];
7034 } else {
7035 assert(qos_index == QOS_INDEX_KQFILE);
7036 return &kq->kq_queue[QOS_INDEX_KQFILE];
7037 }
7038 }
7039
7040 static int
7041 kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
7042 {
7043 struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index);
7044 struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index);
7045
7046 do {
7047 if (!TAILQ_EMPTY(queue))
7048 return 0;
7049 } while (queue-- > base_queue);
7050 return 1;
7051 }
7052
7053 static struct kqtailq *
7054 kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index)
7055 {
7056 struct kqtailq *res;
7057 struct kqrequest *kqr;
7058
7059 if (kq->kq_state & KQ_WORKQ) {
7060 struct kqworkq *kqwq = (struct kqworkq *)kq;
7061
7062 kqr = kqworkq_get_request(kqwq, qos_index);
7063 res = &kqr->kqr_suppressed;
7064 } else if (kq->kq_state & KQ_WORKLOOP) {
7065 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7066
7067 kqr = &kqwl->kqwl_request;
7068 res = &kqr->kqr_suppressed;
7069 } else {
7070 struct kqfile *kqf = (struct kqfile *)kq;
7071 res = &kqf->kqf_suppressed;
7072 }
7073 return res;
7074 }
7075
7076 static kq_index_t
7077 knote_get_queue_index(struct knote *kn)
7078 {
7079 kq_index_t override_index = knote_get_qos_override_index(kn);
7080 kq_index_t qos_index = knote_get_qos_index(kn);
7081 struct kqueue *kq = knote_get_kq(kn);
7082 kq_index_t res;
7083
7084 if (kq->kq_state & KQ_WORKQ) {
7085 res = _kqwq_base_index[qos_index];
7086 if (override_index > qos_index)
7087 res += override_index - qos_index;
7088 assert(res <= _kqwq_high_index[qos_index]);
7089 } else if (kq->kq_state & KQ_WORKLOOP) {
7090 res = MAX(override_index, qos_index);
7091 assert(res < KQWL_NBUCKETS);
7092 } else {
7093 assert(qos_index == QOS_INDEX_KQFILE);
7094 assert(override_index == QOS_INDEX_KQFILE);
7095 res = QOS_INDEX_KQFILE;
7096 }
7097 return res;
7098 }
7099
7100 static struct kqtailq *
7101 knote_get_queue(struct knote *kn)
7102 {
7103 kq_index_t qindex = knote_get_queue_index(kn);
7104
7105 return &(knote_get_kq(kn))->kq_queue[qindex];
7106 }
7107
7108 static kq_index_t
7109 knote_get_req_index(struct knote *kn)
7110 {
7111 return kn->kn_req_index;
7112 }
7113
7114 static kq_index_t
7115 knote_get_qos_index(struct knote *kn)
7116 {
7117 return kn->kn_qos_index;
7118 }
7119
7120 static void
7121 knote_set_qos_index(struct knote *kn, kq_index_t qos_index)
7122 {
7123 struct kqueue *kq = knote_get_kq(kn);
7124
7125 assert(qos_index < KQWQ_NQOS);
7126 assert((kn->kn_status & KN_QUEUED) == 0);
7127
7128 if (kq->kq_state & KQ_WORKQ) {
7129 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7130 } else if (kq->kq_state & KQ_WORKLOOP) {
7131 /* XXX this policy decision shouldn't be here */
7132 if (qos_index == THREAD_QOS_UNSPECIFIED)
7133 qos_index = THREAD_QOS_LEGACY;
7134 } else
7135 qos_index = QOS_INDEX_KQFILE;
7136
7137 /* always set requested */
7138 kn->kn_req_index = qos_index;
7139
7140 /* only adjust in-use qos index when not suppressed */
7141 if ((kn->kn_status & KN_SUPPRESSED) == 0)
7142 kn->kn_qos_index = qos_index;
7143 }
7144
7145 static void
7146 knote_set_qos_overcommit(struct knote *kn)
7147 {
7148 struct kqueue *kq = knote_get_kq(kn);
7149 struct kqrequest *kqr;
7150
7151 /* turn overcommit on for the appropriate thread request? */
7152 if (kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
7153 if (kq->kq_state & KQ_WORKQ) {
7154 kq_index_t qos_index = knote_get_qos_index(kn);
7155 struct kqworkq *kqwq = (struct kqworkq *)kq;
7156
7157 kqr = kqworkq_get_request(kqwq, qos_index);
7158
7159 kqwq_req_lock(kqwq);
7160 kqr->kqr_state |= KQR_THOVERCOMMIT;
7161 kqwq_req_unlock(kqwq);
7162 } else if (kq->kq_state & KQ_WORKLOOP) {
7163 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7164
7165 kqr = &kqwl->kqwl_request;
7166
7167 kqwl_req_lock(kqwl);
7168 kqr->kqr_state |= KQR_THOVERCOMMIT;
7169 kqwl_req_unlock(kqwl);
7170 }
7171 }
7172 }
7173
7174 static kq_index_t
7175 knote_get_qos_override_index(struct knote *kn)
7176 {
7177 return kn->kn_qos_override;
7178 }
7179
7180 static void
7181 knote_set_qos_override_index(struct knote *kn, kq_index_t override_index,
7182 boolean_t override_is_sync)
7183 {
7184 struct kqueue *kq = knote_get_kq(kn);
7185 kq_index_t qos_index = knote_get_qos_index(kn);
7186 kq_index_t old_override_index = knote_get_qos_override_index(kn);
7187 boolean_t old_override_is_sync = kn->kn_qos_override_is_sync;
7188 uint32_t flags = 0;
7189
7190 assert((kn->kn_status & KN_QUEUED) == 0);
7191
7192 if (override_index == KQWQ_QOS_MANAGER) {
7193 assert(qos_index == KQWQ_QOS_MANAGER);
7194 } else {
7195 assert(override_index < KQWQ_QOS_MANAGER);
7196 }
7197
7198 kn->kn_qos_override = override_index;
7199 kn->kn_qos_override_is_sync = override_is_sync;
7200
7201 /*
7202 * If this is a workq/workloop kqueue, apply the override to the
7203 * servicing thread.
7204 */
7205 if (kq->kq_state & KQ_WORKQ) {
7206 struct kqworkq *kqwq = (struct kqworkq *)kq;
7207
7208 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7209 kqworkq_update_override(kqwq, qos_index, override_index);
7210 } else if (kq->kq_state & KQ_WORKLOOP) {
7211 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7212
7213 if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
7214 flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
7215
7216 if (override_index == THREAD_QOS_USER_INTERACTIVE
7217 && override_is_sync) {
7218 flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
7219 }
7220
7221 if (old_override_index == THREAD_QOS_USER_INTERACTIVE
7222 && old_override_is_sync) {
7223 flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
7224 }
7225 }
7226
7227 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7228 kqworkloop_update_override(kqwl, qos_index, override_index, flags);
7229 }
7230 }
7231
7232 static kq_index_t
7233 knote_get_sync_qos_override_index(struct knote *kn)
7234 {
7235 return kn->kn_qos_sync_override;
7236 }
7237
7238 static void
7239 kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index)
7240 {
7241 struct kqrequest *kqr;
7242 kq_index_t old_override_index;
7243
7244 if (override_index <= qos_index) {
7245 return;
7246 }
7247
7248 kqr = kqworkq_get_request(kqwq, qos_index);
7249
7250 kqwq_req_lock(kqwq);
7251 old_override_index = kqr->kqr_override_index;
7252 if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
7253 kqr->kqr_override_index = override_index;
7254
7255 /* apply the override to [incoming?] servicing thread */
7256 if (kqr->kqr_state & KQR_BOUND) {
7257 thread_t wqthread = kqr->kqr_thread;
7258
7259 /* only apply if non-manager */
7260 assert(wqthread);
7261 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
7262 if (old_override_index)
7263 thread_update_ipc_override(wqthread, override_index);
7264 else
7265 thread_add_ipc_override(wqthread, override_index);
7266 }
7267 }
7268 }
7269 kqwq_req_unlock(kqwq);
7270 }
7271
7272 /* called with the kqworkq lock held */
7273 static void
7274 kqworkq_bind_thread_impl(
7275 struct kqworkq *kqwq,
7276 kq_index_t qos_index,
7277 thread_t thread,
7278 unsigned int flags)
7279 {
7280 /* request lock must be held */
7281 kqwq_req_held(kqwq);
7282
7283 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
7284 assert(kqr->kqr_state & KQR_THREQUESTED);
7285
7286 if (qos_index == KQWQ_QOS_MANAGER)
7287 flags |= KEVENT_FLAG_WORKQ_MANAGER;
7288
7289 struct uthread *ut = get_bsdthread_info(thread);
7290
7291 /*
7292 * If this is a manager, and the manager request bit is
7293 * not set, assure no other thread is bound. If the bit
7294 * is set, make sure the old thread is us (or not set).
7295 */
7296 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
7297 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7298 kqr->kqr_state |= (KQR_BOUND | KQWQ_THMANAGER);
7299 TAILQ_INIT(&kqr->kqr_suppressed);
7300 kqr->kqr_thread = thread;
7301 ut->uu_kqueue_bound = (struct kqueue *)kqwq;
7302 ut->uu_kqueue_qos_index = KQWQ_QOS_MANAGER;
7303 ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ |
7304 KEVENT_FLAG_WORKQ_MANAGER);
7305 } else {
7306 assert(kqr->kqr_state & KQR_BOUND);
7307 assert(thread == kqr->kqr_thread);
7308 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
7309 assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
7310 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
7311 }
7312 return;
7313 }
7314
7315 /* Just a normal one-queue servicing thread */
7316 assert(kqr->kqr_state & KQR_THREQUESTED);
7317 assert(kqr->kqr_qos_index == qos_index);
7318
7319 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7320 kqr->kqr_state |= KQR_BOUND;
7321 TAILQ_INIT(&kqr->kqr_suppressed);
7322 kqr->kqr_thread = thread;
7323
7324 /* apply an ipc QoS override if one is needed */
7325 if (kqr->kqr_override_index) {
7326 assert(kqr->kqr_qos_index);
7327 assert(kqr->kqr_override_index > kqr->kqr_qos_index);
7328 assert(thread_get_ipc_override(thread) == THREAD_QOS_UNSPECIFIED);
7329 thread_add_ipc_override(thread, kqr->kqr_override_index);
7330 }
7331
7332 /* indicate that we are processing in the uthread */
7333 ut->uu_kqueue_bound = (struct kqueue *)kqwq;
7334 ut->uu_kqueue_qos_index = qos_index;
7335 ut->uu_kqueue_flags = flags;
7336 } else {
7337 /*
7338 * probably synchronously bound AND post-request bound;
7339 * this logic can go away when we get rid of post-request bind
7340 */
7341 assert(kqr->kqr_state & KQR_BOUND);
7342 assert(thread == kqr->kqr_thread);
7343 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
7344 assert(ut->uu_kqueue_qos_index == qos_index);
7345 assert((ut->uu_kqueue_flags & flags) == flags);
7346 }
7347 }
7348
7349 static void
7350 kqworkloop_update_override(
7351 struct kqworkloop *kqwl,
7352 kq_index_t qos_index,
7353 kq_index_t override_index,
7354 uint32_t flags)
7355 {
7356 struct kqrequest *kqr = &kqwl->kqwl_request;
7357
7358 kqwl_req_lock(kqwl);
7359
7360 /* Do not override on attached threads */
7361 if (kqr->kqr_state & KQR_BOUND) {
7362 assert(kqr->kqr_thread);
7363
7364 if (kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD) {
7365 kqwl_req_unlock(kqwl);
7366 assert(!is_workqueue_thread(kqr->kqr_thread));
7367 return;
7368 }
7369 }
7370
7371 /* Update sync ipc counts on kqr for suppressed knotes */
7372 if (flags & KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS) {
7373 kqworkloop_update_suppress_sync_count(kqr, flags);
7374 }
7375
7376 if ((flags & KQWL_UO_UPDATE_OVERRIDE_LAZY) == 0) {
7377 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
7378 MAX(qos_index, override_index));
7379 }
7380 kqwl_req_unlock(kqwl);
7381 }
7382
7383 static void
7384 kqworkloop_update_suppress_sync_count(
7385 struct kqrequest *kqr,
7386 uint32_t flags)
7387 {
7388 if (flags & KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI) {
7389 kqr->kqr_sync_suppress_count++;
7390 }
7391
7392 if (flags & KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI) {
7393 assert(kqr->kqr_sync_suppress_count > 0);
7394 kqr->kqr_sync_suppress_count--;
7395 }
7396 }
7397
7398 /*
7399 * kqworkloop_unbind_thread - Unbind the servicer thread of a workloop kqueue
7400 *
7401 * It will end the processing phase in case it was still processing:
7402 *
7403 * We may have to request a new thread for workloops that are not
7404 * KQ_NO_WQ_THREAD. This can happen if:
7405 * - there were active events at or above our QoS we never got to (count > 0)
7406 * - we pended waitq hook callouts during processing
7407 * - we pended wakeups while processing (or unsuppressing)
7408 *
7409 * Called with kqueue lock held.
7410 */
7411
7412 static void
7413 kqworkloop_unbind_thread(
7414 struct kqworkloop *kqwl,
7415 thread_t thread,
7416 __unused unsigned int flags)
7417 {
7418 struct kqueue *kq = &kqwl->kqwl_kqueue;
7419 struct kqrequest *kqr = &kqwl->kqwl_request;
7420
7421 kqlock_held(kq);
7422
7423 assert((kq->kq_state & KQ_PROCESSING) == 0);
7424 if (kq->kq_state & KQ_PROCESSING) {
7425 return;
7426 }
7427
7428 /*
7429 * Forcing the KQ_PROCESSING flag ensures that the QoS updates caused by
7430 * unsuppressing knotes are not applied until the eventual call to
7431 * kqworkloop_update_threads_qos() below.
7432 */
7433 kq->kq_state |= KQ_PROCESSING;
7434 kqworkloop_acknowledge_events(kqwl, TRUE);
7435 kq->kq_state &= ~KQ_PROCESSING;
7436
7437 kqwl_req_lock(kqwl);
7438
7439 /* deal with extraneous unbinds in release kernels */
7440 assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == KQR_BOUND);
7441 if ((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) != KQR_BOUND) {
7442 kqwl_req_unlock(kqwl);
7443 return;
7444 }
7445
7446 assert(thread == current_thread());
7447 assert(kqr->kqr_thread == thread);
7448 if (kqr->kqr_thread != thread) {
7449 kqwl_req_unlock(kqwl);
7450 return;
7451 }
7452
7453 struct uthread *ut = get_bsdthread_info(thread);
7454 kq_index_t old_qos_index = ut->uu_kqueue_qos_index;
7455 boolean_t ipc_override_is_sync = ut->uu_kqueue_override_is_sync;
7456 ut->uu_kqueue_bound = NULL;
7457 ut->uu_kqueue_qos_index = 0;
7458 ut->uu_kqueue_override_is_sync = 0;
7459 ut->uu_kqueue_flags = 0;
7460
7461 /* unbind the servicer thread, drop overrides */
7462 kqr->kqr_thread = NULL;
7463 kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
7464 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
7465
7466 kqwl_req_unlock(kqwl);
7467
7468 /*
7469 * Drop the override on the current thread last, after the call to
7470 * kqworkloop_update_threads_qos above.
7471 */
7472 if (old_qos_index) {
7473 thread_drop_ipc_override(thread);
7474 }
7475 if (ipc_override_is_sync) {
7476 thread_drop_sync_ipc_override(thread);
7477 }
7478 }
7479
7480 /* called with the kqworkq lock held */
7481 static void
7482 kqworkq_unbind_thread(
7483 struct kqworkq *kqwq,
7484 kq_index_t qos_index,
7485 thread_t thread,
7486 __unused unsigned int flags)
7487 {
7488 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
7489 kq_index_t override_index = 0;
7490
7491 /* request lock must be held */
7492 kqwq_req_held(kqwq);
7493
7494 assert(thread == current_thread());
7495
7496 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7497 assert(kqr->kqr_state & KQR_BOUND);
7498 return;
7499 }
7500
7501 assert(kqr->kqr_thread == thread);
7502 assert(TAILQ_EMPTY(&kqr->kqr_suppressed));
7503
7504 /*
7505 * If there is an override, drop it from the current thread
7506 * and then we are free to recompute (a potentially lower)
7507 * minimum override to apply to the next thread request.
7508 */
7509 if (kqr->kqr_override_index) {
7510 struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index);
7511 struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index);
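/*
 * In the recompute loop below, the offset of the highest non-empty queue
 * from base_queue, added to qos_index, gives the override that must carry
 * over to the next thread request for this bucket.
 */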
7512
7513 /* if not bound to a manager thread, drop the current ipc override */
7514 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
7515 thread_drop_ipc_override(thread);
7516 }
7517
7518 /* recompute the new override */
7519 do {
7520 if (!TAILQ_EMPTY(queue)) {
7521 override_index = queue - base_queue + qos_index;
7522 break;
7523 }
7524 } while (queue-- > base_queue);
7525 }
7526
7527 /* Mark it unbound */
7528 kqr->kqr_thread = NULL;
7529 kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQWQ_THMANAGER);
7530
7531 /* apply the new override */
7532 if (override_index > kqr->kqr_qos_index) {
7533 kqr->kqr_override_index = override_index;
7534 } else {
7535 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
7536 }
7537 }
7538
7539 struct kqrequest *
7540 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
7541 {
7542 assert(qos_index < KQWQ_NQOS);
7543 return &kqwq->kqwq_request[qos_index];
7544 }
7545
7546 void
7547 knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override, kq_index_t sync_override_index)
7548 {
7549 struct kqueue *kq = knote_get_kq(kn);
7550 boolean_t override_is_sync = FALSE;
7551
7552 if (kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) {
7553 kq_index_t new_qos_index;
7554 kq_index_t new_override_index;
7555 kq_index_t servicer_qos_index;
7556
7557 new_qos_index = qos_index_from_qos(kn, new_qos, FALSE);
7558 new_override_index = qos_index_from_qos(kn, new_override, TRUE);
7559
7560 /* make sure the servicer qos acts as a floor */
7561 servicer_qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
7562 if (servicer_qos_index > new_qos_index)
7563 new_qos_index = servicer_qos_index;
7564 if (servicer_qos_index > new_override_index)
7565 new_override_index = servicer_qos_index;
7566 if (sync_override_index >= new_override_index) {
7567 new_override_index = sync_override_index;
7568 override_is_sync = TRUE;
7569 }
7570
7571 kqlock(kq);
7572 if (new_qos_index != knote_get_req_index(kn) ||
7573 new_override_index != knote_get_qos_override_index(kn) ||
7574 override_is_sync != kn->kn_qos_override_is_sync) {
7575 if (kn->kn_status & KN_QUEUED) {
7576 knote_dequeue(kn);
7577 knote_set_qos_index(kn, new_qos_index);
7578 knote_set_qos_override_index(kn, new_override_index, override_is_sync);
7579 knote_enqueue(kn);
7580 knote_wakeup(kn);
7581 } else {
7582 knote_set_qos_index(kn, new_qos_index);
7583 knote_set_qos_override_index(kn, new_override_index, override_is_sync);
7584 }
7585 }
7586 kqunlock(kq);
7587 }
7588 }
7589
7590 void
7591 knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq)
7592 {
7593 struct kqueue *kq = knote_get_kq(kn);
7594 kq_index_t old_sync_override;
7595 kq_index_t qos_index = knote_get_qos_index(kn);
7596 uint32_t flags = 0;
7597
7598 /* Tracking only happens for UI qos */
7599 if (sync_qos != THREAD_QOS_USER_INTERACTIVE &&
7600 sync_qos != THREAD_QOS_UNSPECIFIED) {
7601 return;
7602 }
7603
7604 if (lock_kq)
7605 kqlock(kq);
7606
7607 if (kq->kq_state & KQ_WORKLOOP) {
7608 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7609
7610 old_sync_override = knote_get_sync_qos_override_index(kn);
7611 if (old_sync_override != sync_qos) {
7612 kn->kn_qos_sync_override = sync_qos;
7613
7614 /* update sync ipc counters for suppressed knotes */
7615 if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
7616 flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
7617
7618 /* Do not recalculate the kqwl override; it will be done later */
7619 flags = flags | KQWL_UO_UPDATE_OVERRIDE_LAZY;
7620
7621 if (sync_qos == THREAD_QOS_USER_INTERACTIVE) {
7622 flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
7623 }
7624
7625 if (old_sync_override == THREAD_QOS_USER_INTERACTIVE) {
7626 flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
7627 }
7628
7629 kqworkloop_update_override(kqwl, qos_index, sync_qos,
7630 flags);
7631 }
7632
7633 }
7634 }
7635 if (lock_kq)
7636 kqunlock(kq);
7637 }
7638
7639 static void
7640 knote_wakeup(struct knote *kn)
7641 {
7642 struct kqueue *kq = knote_get_kq(kn);
7643 kq_index_t qos_index = knote_get_qos_index(kn);
7644
7645 kqlock_held(kq);
7646
7647 if (kq->kq_state & KQ_WORKQ) {
7648 /* request a servicing thread */
7649 struct kqworkq *kqwq = (struct kqworkq *)kq;
7650
7651 kqworkq_request_help(kqwq, qos_index);
7652
7653 } else if (kq->kq_state & KQ_WORKLOOP) {
7654 /* request a servicing thread */
7655 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7656
7657 if (kqworkloop_is_processing_on_current_thread(kqwl)) {
7658 /*
7659 * kqworkloop_end_processing() will perform the required QoS
7660 * computations when it unsets the processing mode.
7661 */
7662 return;
7663 }
7664 kqworkloop_request_help(kqwl, qos_index);
7665 } else {
7666 struct kqfile *kqf = (struct kqfile *)kq;
7667
7668 /* flag wakeups during processing */
7669 if (kq->kq_state & KQ_PROCESSING)
7670 kq->kq_state |= KQ_WAKEUP;
7671
7672 /* wakeup a thread waiting on this queue */
7673 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
7674 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
7675 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7676 KQ_EVENT,
7677 THREAD_AWAKENED,
7678 WAITQ_ALL_PRIORITIES);
7679 }
7680
7681 /* wakeup other kqueues/select sets we're inside */
7682 KNOTE(&kqf->kqf_sel.si_note, 0);
7683 }
7684 }
7685
7686 /*
7687 * Called with the kqueue locked
7688 */
7689 static void
7690 kqueue_interrupt(struct kqueue *kq)
7691 {
7692 assert((kq->kq_state & KQ_WORKQ) == 0);
7693
7694 /* wakeup sleeping threads */
7695 if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
7696 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
7697 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7698 KQ_EVENT,
7699 THREAD_RESTART,
7700 WAITQ_ALL_PRIORITIES);
7701 }
7702
7703 /* wakeup threads waiting their turn to process */
7704 if (kq->kq_state & KQ_PROCWAIT) {
7705 struct kqtailq *suppressq;
7706
7707 assert(kq->kq_state & KQ_PROCESSING);
7708
7709 kq->kq_state &= ~KQ_PROCWAIT;
7710 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
7711 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7712 CAST_EVENT64_T(suppressq),
7713 THREAD_RESTART,
7714 WAITQ_ALL_PRIORITIES);
7715 }
7716 }
7717
7718 /*
7719 * Called back from waitq code when no threads are waiting and the hook was set.
7720 *
7721 * Interrupts are likely disabled and spin locks are held - minimal work
7722 * can be done in this context!!!
7723 *
7724 * JMM - in the future, this will try to determine which knotes match the
7725 * wait queue wakeup and apply these wakeups against those knotes themselves.
7726 * For now, all the events dispatched this way are dispatch-manager handled,
7727 * so hard-code that for now.
7728 */
7729 void
7730 waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
7731 {
7732 #pragma unused(knote_hook, qos)
7733
7734 struct kqueue *kq = (struct kqueue *)kq_hook;
7735
7736 if (kq->kq_state & KQ_WORKQ) {
7737 struct kqworkq *kqwq = (struct kqworkq *)kq;
7738
7739 kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
7740
7741 } else if (kq->kq_state & KQ_WORKLOOP) {
7742 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7743
7744 kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE);
7745 }
7746 }
7747
7748 void
7749 klist_init(struct klist *list)
7750 {
7751 SLIST_INIT(list);
7752 }
7753
7754
7755 /*
7756 * Query/Post each knote in the object's list
7757 *
7758 * The object lock protects the list. It is assumed
7759 * that the filter/event routine for the object can
7760 * determine that the object is already locked (via
7761 * the hint) and not deadlock itself.
7762 *
7763 * The object lock should also hold off pending
7764 * detach/drop operations. But we'll prevent it here
7765 * too (by taking a use reference) - just in case.
7766 */
7767 void
7768 knote(struct klist *list, long hint)
7769 {
7770 struct knote *kn;
7771
7772 SLIST_FOREACH(kn, list, kn_selnext) {
7773 struct kqueue *kq = knote_get_kq(kn);
7774
7775 kqlock(kq);
7776
7777 assert(!knoteuse_needs_boost(kn, NULL));
7778
7779 /* If we can get a use reference - deliver event */
7780 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
7781 int result;
7782
7783 /* call the event with only a use count */
7784 result = knote_fops(kn)->f_event(kn, hint);
7785
7786 /* if it's not going away and it triggered */
7787 if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
7788 knote_activate(kn);
7789 /* kq lock held */
7790 }
7791 kqunlock(kq);
7792 }
7793 }
7794
7795 /*
7796 * attach a knote to the specified list. Return true if this is the first entry.
7797 * The list is protected by whatever lock the object it is associated with uses.
7798 */
7799 int
7800 knote_attach(struct klist *list, struct knote *kn)
7801 {
7802 int ret = SLIST_EMPTY(list);
7803 SLIST_INSERT_HEAD(list, kn, kn_selnext);
7804 return (ret);
7805 }
7806
7807 /*
7808 * detach a knote from the specified list. Return true if that was the last entry.
7809 * The list is protected by whatever lock the object it is associated with uses.
7810 */
7811 int
7812 knote_detach(struct klist *list, struct knote *kn)
7813 {
7814 SLIST_REMOVE(list, kn, knote, kn_selnext);
7815 return (SLIST_EMPTY(list));
7816 }
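
/*
 * Hedged usage sketch (not part of the original file): an event source
 * typically embeds a struct klist, protects it with its own lock, links
 * knotes from its filter attach routine with knote_attach(), unlinks them
 * on detach with knote_detach(), and posts events through the KNOTE()
 * macro, which expands to knote().  The "mydev" object, its lock, and the
 * filter routine below are hypothetical.
 *
 *	klist_init(&mydev.klist);			(once, at init time)
 *
 *	static int
 *	filt_mydevattach(struct knote *kn)
 *	{
 *		lck_mtx_lock(&mydev.lock);
 *		(void)knote_attach(&mydev.klist, kn);	(returns 1 if first entry)
 *		lck_mtx_unlock(&mydev.lock);
 *		return (0);				(attach return conventions
 *							 belong to the filter framework)
 *	}
 *
 *	On each state change of the source:
 *
 *	lck_mtx_lock(&mydev.lock);
 *	KNOTE(&mydev.klist, 0);				(calls knote(list, hint))
 *	lck_mtx_unlock(&mydev.lock);
 */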
7817
7818 /*
7819 * knote_vanish - Indicate that the source has vanished
7820 *
7821 * If the knote has requested EV_VANISHED delivery,
7822 * arrange for that. Otherwise, deliver a NOTE_REVOKE
7823 * event for backward compatibility.
7824 *
7825 * The knote is marked as having vanished, but is not
7826 * actually detached from the source in this instance.
7827 * The actual detach is deferred until the knote drop.
7828 *
7829 * Our caller already has the object lock held. Calling
7830 * the detach routine would try to take that lock
7831 * recursively - which likely is not supported.
7832 */
7833 void
7834 knote_vanish(struct klist *list)
7835 {
7836 struct knote *kn;
7837 struct knote *kn_next;
7838
7839 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
7840 struct kqueue *kq = knote_get_kq(kn);
7841 int result;
7842
7843 kqlock(kq);
7844
7845 assert(!knoteuse_needs_boost(kn, NULL));
7846
7847 if ((kn->kn_status & KN_DROPPING) == 0) {
7848 /* If EV_VANISHED delivery was requested - prepare to deliver one */
7849 if (kn->kn_status & KN_REQVANISH) {
7850 kn->kn_status |= KN_VANISHED;
7851 knote_activate(kn);
7852
7853 } else if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
7854 /* call the event with only a use count */
7855 result = knote_fops(kn)->f_event(kn, NOTE_REVOKE);
7856
7857 /* if it's not going away and it triggered */
7858 if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
7859 knote_activate(kn);
7860 /* lock held again */
7861 }
7862 }
7863 kqunlock(kq);
7864 }
7865 }
7866
7867 /*
7868 * For a given knote, link a provided wait queue directly with the kqueue.
7869 * Wakeups will happen via recursive wait queue support. But nothing will move
7870 * the knote to the active list at wakeup (nothing calls knote()). Instead,
7871 * we permanently enqueue them here.
7872 *
7873 * kqueue and knote references are held by caller.
7874 * waitq locked by caller.
7875 *
7876 * caller provides the wait queue link structure.
7877 */
7878 int
7879 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
7880 {
7881 struct kqueue *kq = knote_get_kq(kn);
7882 kern_return_t kr;
7883
7884 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
7885 if (kr == KERN_SUCCESS) {
7886 knote_markstayactive(kn);
7887 return (0);
7888 } else {
7889 return (EINVAL);
7890 }
7891 }
7892
7893 /*
7894 * Unlink the provided wait queue from the kqueue associated with a knote.
7895 * Also remove it from the magic list of directly attached knotes.
7896 *
7897 * Note that the unlink may have already happened from the other side, so
7898 * ignore any failures to unlink and just remove it from the kqueue list.
7899 *
7900 * On success, caller is responsible for the link structure
7901 */
7902 int
7903 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
7904 {
7905 struct kqueue *kq = knote_get_kq(kn);
7906 kern_return_t kr;
7907
7908 kr = waitq_unlink(wq, &kq->kq_wqs);
7909 knote_clearstayactive(kn);
7910 return ((kr != KERN_SUCCESS) ? EINVAL : 0);
7911 }
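
/*
 * Hedged usage sketch (not part of the original file): a filter whose
 * source is already backed by a struct waitq can wire that waitq into the
 * kqueue's wait queue set instead of calling knote() on every wakeup.
 * Only knote_link_waitq() and knote_unlink_waitq() are defined here; the
 * waitq_link_reserve()/waitq_link_release() and waitq_lock()/waitq_unlock()
 * calls below are assumptions about the waitq KPI.
 *
 *	uint64_t link = waitq_link_reserve(wq);		(pre-allocate the link)
 *	waitq_lock(wq);
 *	error = knote_link_waitq(kn, wq, &link);	(marks the knote stay-active)
 *	waitq_unlock(wq);
 *	if (link != 0)
 *		waitq_link_release(link);		(if the reservation was not consumed)
 *
 *	and on detach:
 *
 *	error = knote_unlink_waitq(kn, wq);		(clears stay-active)
 */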
7912
7913 /*
7914 * remove all knotes referencing a specified fd
7915 *
7916 * Essentially an inlined knote_remove & knote_drop
7917 * when we know for sure that the thing is a file
7918 *
7919 * Entered with the proc_fd lock already held.
7920 * It returns the same way, but may drop it temporarily.
7921 */
7922 void
7923 knote_fdclose(struct proc *p, int fd, int force)
7924 {
7925 struct klist *list;
7926 struct knote *kn;
7927
7928 restart:
7929 list = &p->p_fd->fd_knlist[fd];
7930 SLIST_FOREACH(kn, list, kn_link) {
7931 struct kqueue *kq = knote_get_kq(kn);
7932
7933 kqlock(kq);
7934
7935 if (kq->kq_p != p)
7936 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
7937 __func__, kq->kq_p, p);
7938
7939 /*
7940 * If the knote supports EV_VANISHED delivery,
7941 * transition it to vanished mode (or skip over
7942 * it if already vanished).
7943 */
7944 if (!force && (kn->kn_status & KN_REQVANISH)) {
7945
7946 if ((kn->kn_status & KN_VANISHED) == 0) {
7947 proc_fdunlock(p);
7948
7949 assert(!knoteuse_needs_boost(kn, NULL));
7950
7951 /* get detach reference (also marks vanished) */
7952 if (kqlock2knotedetach(kq, kn, KNUSE_NONE)) {
7953 /* detach knote and drop fp use reference */
7954 knote_fops(kn)->f_detach(kn);
7955 if (knote_fops(kn)->f_isfd)
7956 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
7957
7958 /* activate it if it's still in existence */
7959 if (knoteuse2kqlock(kq, kn, KNUSE_NONE)) {
7960 knote_activate(kn);
7961 }
7962 kqunlock(kq);
7963 }
7964 proc_fdlock(p);
7965 goto restart;
7966 } else {
7967 kqunlock(kq);
7968 continue;
7969 }
7970 }
7971
7972 proc_fdunlock(p);
7973
7974 /*
7975 * Convert the kq lock to a drop ref.
7976 * If we get it, go ahead and drop it.
7977 * Otherwise, we waited for the blocking
7978 * condition to complete. Either way,
7979 * we dropped the fdlock so start over.
7980 */
7981 if (kqlock2knotedrop(kq, kn)) {
7982 knote_drop(kn, p);
7983 }
7984
7985 proc_fdlock(p);
7986 goto restart;
7987 }
7988 }
7989
7990 /*
7991 * knote_fdfind - lookup a knote in the fd table for process
7992 *
7993 * If the filter is file-based, lookup based on fd index.
7994 * Otherwise use a hash based on the ident.
7995 *
7996 * Matching is based on kq, filter, and ident. Optionally,
7997 * it may also be based on the udata field in the kevent -
7998 * allowing multiple event registration for the file object
7999 * per kqueue.
8000 *
8001 * fd_knhashlock or fdlock held on entry (and exit)
8002 */
8003 static struct knote *
8004 knote_fdfind(struct kqueue *kq,
8005 struct kevent_internal_s *kev,
8006 bool is_fd,
8007 struct proc *p)
8008 {
8009 struct filedesc *fdp = p->p_fd;
8010 struct klist *list = NULL;
8011 struct knote *kn = NULL;
8012
8013 /*
8014 * determine where to look for the knote
8015 */
8016 if (is_fd) {
8017 /* fd-based knotes are linked off the fd table */
8018 if (kev->ident < (u_int)fdp->fd_knlistsize) {
8019 list = &fdp->fd_knlist[kev->ident];
8020 }
8021 } else if (fdp->fd_knhashmask != 0) {
8022 /* hash non-fd knotes here too */
8023 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
8024 }
8025
8026 /*
8027 * scan the selected list looking for a match
8028 */
8029 if (list != NULL) {
8030 SLIST_FOREACH(kn, list, kn_link) {
8031 if (kq == knote_get_kq(kn) &&
8032 kev->ident == kn->kn_id &&
8033 kev->filter == kn->kn_filter) {
8034 if (kev->flags & EV_UDATA_SPECIFIC) {
8035 if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
8036 kev->udata == kn->kn_udata) {
8037 break; /* matching udata-specific knote */
8038 }
8039 } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
8040 break; /* matching non-udata-specific knote */
8041 }
8042 }
8043 }
8044 }
8045 return kn;
8046 }
8047
8048 /*
8049 * kq_add_knote - Add a knote to the fd table for the process
8050 * while checking for duplicates.
8051 *
8052 * All file-based filters associate a list of knotes by file
8053 * descriptor index. All other filters hash the knote by ident.
8054 *
8055 * May have to grow the table of knote lists to cover the
8056 * file descriptor index presented.
8057 *
8058 * fd_knhashlock and fdlock unheld on entry (and exit).
8059 *
8060 * Takes a rwlock boost if inserting the knote is successful.
8061 */
8062 static int
8063 kq_add_knote(struct kqueue *kq, struct knote *kn,
8064 struct kevent_internal_s *kev,
8065 struct proc *p, int *knoteuse_flags)
8066 {
8067 struct filedesc *fdp = p->p_fd;
8068 struct klist *list = NULL;
8069 int ret = 0;
8070 bool is_fd = knote_fops(kn)->f_isfd;
8071
8072 if (is_fd)
8073 proc_fdlock(p);
8074 else
8075 knhash_lock(p);
8076
8077 if (knote_fdfind(kq, kev, is_fd, p) != NULL) {
8078 /* found an existing knote: we can't add this one */
8079 ret = ERESTART;
8080 goto out_locked;
8081 }
8082
8083 /* knote was not found: add it now */
8084 if (!is_fd) {
8085 if (fdp->fd_knhashmask == 0) {
8086 u_long size = 0;
8087
8088 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
8089 &size);
8090 if (list == NULL) {
8091 ret = ENOMEM;
8092 goto out_locked;
8093 }
8094
8095 fdp->fd_knhash = list;
8096 fdp->fd_knhashmask = size;
8097 }
8098
8099 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
8100 SLIST_INSERT_HEAD(list, kn, kn_link);
8101 ret = 0;
8102 goto out_locked;
8103
8104 } else {
8105 /* knote is fd based */
8106
8107 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
8108 u_int size = 0;
8109
8110 if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
8111 || kn->kn_id >= (uint64_t)maxfiles) {
8112 ret = EINVAL;
8113 goto out_locked;
8114 }
8115 /* have to grow the fd_knlist */
8116 size = fdp->fd_knlistsize;
8117 while (size <= kn->kn_id)
8118 size += KQEXTENT;
8119
8120 if (size >= (UINT_MAX/sizeof(struct klist *))) {
8121 ret = EINVAL;
8122 goto out_locked;
8123 }
8124
8125 MALLOC(list, struct klist *,
8126 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
8127 if (list == NULL) {
8128 ret = ENOMEM;
8129 goto out_locked;
8130 }
8131
8132 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
8133 fdp->fd_knlistsize * sizeof(struct klist *));
8134 bzero((caddr_t)list +
8135 fdp->fd_knlistsize * sizeof(struct klist *),
8136 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
8137 FREE(fdp->fd_knlist, M_KQUEUE);
8138 fdp->fd_knlist = list;
8139 fdp->fd_knlistsize = size;
8140 }
8141
8142 list = &fdp->fd_knlist[kn->kn_id];
8143 SLIST_INSERT_HEAD(list, kn, kn_link);
8144 ret = 0;
8145 goto out_locked;
8146
8147 }
8148
8149 out_locked:
8150 if (ret == 0 && knoteuse_needs_boost(kn, kev)) {
8151 set_thread_rwlock_boost();
8152 *knoteuse_flags = KNUSE_BOOST;
8153 } else {
8154 *knoteuse_flags = KNUSE_NONE;
8155 }
8156 if (is_fd)
8157 proc_fdunlock(p);
8158 else
8159 knhash_unlock(p);
8160
8161 return ret;
8162 }
8163
8164 /*
8165 * kq_remove_knote - remove a knote from the fd table for the process
8166 * and copy kn_status and kq_state while holding the kqlock and
8167 * fd table locks.
8168 *
8169 * If the filter is file-based, remove based on fd index.
8170 * Otherwise remove from the hash based on the ident.
8171 *
8172 * fd_knhashlock and fdlock unheld on entry (and exit).
8173 */
8174 static void
8175 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
8176 kn_status_t *kn_status, uint16_t *kq_state)
8177 {
8178 struct filedesc *fdp = p->p_fd;
8179 struct klist *list = NULL;
8180 bool is_fd;
8181
8182 is_fd = knote_fops(kn)->f_isfd;
8183
8184 if (is_fd)
8185 proc_fdlock(p);
8186 else
8187 knhash_lock(p);
8188
8189 if (is_fd) {
8190 assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
8191 list = &fdp->fd_knlist[kn->kn_id];
8192 } else {
8193 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
8194 }
8195 SLIST_REMOVE(list, kn, knote, kn_link);
8196
8197 kqlock(kq);
8198 *kn_status = kn->kn_status;
8199 *kq_state = kq->kq_state;
8200 kqunlock(kq);
8201
8202 if (is_fd)
8203 proc_fdunlock(p);
8204 else
8205 knhash_unlock(p);
8206 }
8207
8208 /*
8209 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
8210 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
8211 *
8212 * fd_knhashlock or fdlock unheld on entry (and exit)
8213 */
8214
8215 static struct knote *
8216 kq_find_knote_and_kq_lock(struct kqueue *kq,
8217 struct kevent_internal_s *kev,
8218 bool is_fd,
8219 struct proc *p)
8220 {
8221 struct knote * ret;
8222
8223 if (is_fd)
8224 proc_fdlock(p);
8225 else
8226 knhash_lock(p);
8227
8228 ret = knote_fdfind(kq, kev, is_fd, p);
8229
8230 if (ret) {
8231 kqlock(kq);
8232 }
8233
8234 if (is_fd)
8235 proc_fdunlock(p);
8236 else
8237 knhash_unlock(p);
8238
8239 return ret;
8240 }
8241 /*
8242 * knote_drop - disconnect and drop the knote
8243 *
8244 * Called with the kqueue unlocked and holding a
8245 * "drop reference" on the knote in question.
8246 * This reference is most often acquired through a call
8247 * to kqlock2knotedrop(). But it can also be acquired
8248 * through stealing a drop reference via a call to
8249 * knoteuse2knotedrop() or during the initial attach
8250 * of the knote.
8251 *
8252 * The knote may have already been detached from
8253 * (or not yet attached to) its source object.
8254 */
8255 static void
8256 knote_drop(struct knote *kn, __unused struct proc *ctxp)
8257 {
8258 struct kqueue *kq = knote_get_kq(kn);
8259 struct proc *p = kq->kq_p;
8260 kn_status_t kn_status;
8261 uint16_t kq_state;
8262
8263 /* If we are attached, disconnect from the source first */
8264 if (kn->kn_status & KN_ATTACHED) {
8265 knote_fops(kn)->f_detach(kn);
8266 }
8267
8268 /* Remove the source from the appropriate hash */
8269 kq_remove_knote(kq, kn, p, &kn_status, &kq_state);
8270
8271 /*
8272 * If a kqueue_dealloc is happening in parallel for the kq
8273 * pointed to by the knote, the kq could already be deallocated
8274 * at this point.
8275 * Do not access the kq after the kq_remove_knote if it is
8276 * not a KQ_DYNAMIC.
8277 */
8278
8279 /* determine if anyone needs to know about the drop */
8280 assert((kn_status & (KN_DROPPING | KN_SUPPRESSED | KN_QUEUED)) == KN_DROPPING);
8281
8282 /*
8283 * If KN_USEWAIT is set, some other thread was trying to drop the kn.
8284 * Either that thread was in kqueue_dealloc, in which case the dealloc
8285 * has not happened yet because it is waiting on this wakeup, or the
8286 * drop came from a kevent_register that holds a reference on the kq,
8287 * so the kq cannot be deallocated in parallel.
8288 *
8289 * Either way, it is safe to access kq->kq_wqs when KN_USEWAIT was set.
8290 */
8291 if (kn_status & KN_USEWAIT)
8292 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
8293 CAST_EVENT64_T(&kn->kn_status),
8294 THREAD_RESTART,
8295 WAITQ_ALL_PRIORITIES);
8296
8297 if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
8298 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
8299
8300 knote_free(kn);
8301
8302 /*
8303 * release reference on dynamic kq (and free if last).
8304 * Will only be last if this is from fdfree, etc...
8305 * because otherwise processing thread has reference.
8306 */
8307 if (kq_state & KQ_DYNAMIC)
8308 kqueue_release_last(p, kq);
8309 }
8310
8311 /* called with kqueue lock held */
8312 static void
8313 knote_activate(struct knote *kn)
8314 {
8315 if (kn->kn_status & KN_ACTIVE)
8316 return;
8317
8318 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
8319 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
8320 kn->kn_filtid);
8321
8322 kn->kn_status |= KN_ACTIVE;
8323 if (knote_enqueue(kn))
8324 knote_wakeup(kn);
8325 }
8326
8327 /* called with kqueue lock held */
8328 static void
8329 knote_deactivate(struct knote *kn)
8330 {
8331 kn->kn_status &= ~KN_ACTIVE;
8332 if ((kn->kn_status & KN_STAYACTIVE) == 0)
8333 knote_dequeue(kn);
8334 }
8335
8336 /* called with kqueue lock held */
8337 static void
8338 knote_enable(struct knote *kn)
8339 {
8340 if ((kn->kn_status & KN_DISABLED) == 0)
8341 return;
8342
8343 kn->kn_status &= ~KN_DISABLED;
8344
8345 if (kn->kn_status & KN_SUPPRESSED) {
8346 /* Clear the sync qos on the knote */
8347 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
8348
8349 /*
8350 * it is possible for userland to have knotes registered for a given
8351 * workloop `wl_orig` but really handled on another workloop `wl_new`.
8352 *
8353 * In that case, rearming will happen from the servicer thread of
8354 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
8355 * this knote to stay suppressed forever if we only relied on
8356 * kqworkloop_acknowledge_events to be called by `wl_orig`.
8357 *
8358 * However, if we see the KQ_PROCESSING bit set on `wl_orig`, we can't
8359 * unsuppress because that would mess with the processing phase of
8360 * `wl_orig`; it also means kqworkloop_acknowledge_events() will be
8361 * called there, which takes care of unsuppressing this knote.
8362 */
8363 struct kqueue *kq = knote_get_kq(kn);
8364 if ((kq->kq_state & KQ_PROCESSING) == 0) {
8365 knote_unsuppress(kn);
8366 }
8367 } else if (knote_enqueue(kn)) {
8368 knote_wakeup(kn);
8369 }
8370 }
8371
8372 /* called with kqueue lock held */
8373 static void
8374 knote_disable(struct knote *kn)
8375 {
8376 if (kn->kn_status & KN_DISABLED)
8377 return;
8378
8379 kn->kn_status |= KN_DISABLED;
8380 knote_dequeue(kn);
8381 }
8382
8383 /* called with kqueue lock held */
8384 static void
8385 knote_suppress(struct knote *kn)
8386 {
8387 struct kqtailq *suppressq;
8388 struct kqueue *kq = knote_get_kq(kn);
8389
8390 kqlock_held(kq);
8391
8392 if (kn->kn_status & KN_SUPPRESSED)
8393 return;
8394
8395 knote_dequeue(kn);
8396 kn->kn_status |= KN_SUPPRESSED;
8397 suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
8398 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
8399
8400 if ((kq->kq_state & KQ_WORKLOOP) &&
8401 knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
8402 kn->kn_qos_override_is_sync) {
8403 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8404 /* update the sync qos override counter for suppressed knotes */
8405 kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
8406 knote_get_qos_override_index(kn),
8407 (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI));
8408 }
8409 }
8410
8411 /* called with kqueue lock held */
8412 static void
8413 knote_unsuppress(struct knote *kn)
8414 {
8415 struct kqtailq *suppressq;
8416 struct kqueue *kq = knote_get_kq(kn);
8417
8418 kqlock_held(kq);
8419
8420 if ((kn->kn_status & KN_SUPPRESSED) == 0)
8421 return;
8422
8423 /* Clear the sync qos on the knote */
8424 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
8425
8426 kn->kn_status &= ~KN_SUPPRESSED;
8427 suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
8428 TAILQ_REMOVE(suppressq, kn, kn_tqe);
8429
8430 /* update the in-use qos to equal the requested qos */
8431 kn->kn_qos_index = kn->kn_req_index;
8432
8433 /* don't wakeup if unsuppressing just a stay-active knote */
8434 if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
8435 knote_wakeup(kn);
8436 }
8437
8438 if ((kq->kq_state & KQ_WORKLOOP) && !(kq->kq_state & KQ_NO_WQ_THREAD) &&
8439 knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
8440 kn->kn_qos_override_is_sync) {
8441 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8442
8443 /* update the sync qos override counter for suppressed knotes */
8444 kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
8445 knote_get_qos_override_index(kn),
8446 (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI));
8447 }
8448
8449 if (TAILQ_EMPTY(suppressq) && (kq->kq_state & KQ_WORKLOOP) &&
8450 !(kq->kq_state & KQ_NO_WQ_THREAD)) {
8451 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8452 if (kqworkloop_is_processing_on_current_thread(kqwl)) {
8453 /*
8454 * kqworkloop_end_processing() will perform the required QoS
8455 * computations when it unsets the processing mode.
8456 */
8457 } else {
8458 kqwl_req_lock(kqwl);
8459 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0);
8460 kqwl_req_unlock(kqwl);
8461 }
8462 }
8463 }
8464
8465 /* called with kqueue lock held */
8466 static void
8467 knote_update_sync_override_state(struct knote *kn)
8468 {
8469 struct kqtailq *queue = knote_get_queue(kn);
8470 struct kqueue *kq = knote_get_kq(kn);
8471
8472 if (!(kq->kq_state & KQ_WORKLOOP) ||
8473 knote_get_queue_index(kn) != THREAD_QOS_USER_INTERACTIVE)
8474 return;
8475
8476 /* Update the sync ipc state on workloop */
8477 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8478 boolean_t sync_ipc_override = FALSE;
8479 if (!TAILQ_EMPTY(queue)) {
8480 struct knote *kn_head = TAILQ_FIRST(queue);
8481 if (kn_head->kn_qos_override_is_sync)
8482 sync_ipc_override = TRUE;
8483 }
8484 kqworkloop_update_sync_override_state(kqwl, sync_ipc_override);
8485 }
8486
8487 /* called with kqueue lock held */
8488 static int
8489 knote_enqueue(struct knote *kn)
8490 {
8491 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
8492 (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)))
8493 return 0;
8494
8495 if ((kn->kn_status & KN_QUEUED) == 0) {
8496 struct kqtailq *queue = knote_get_queue(kn);
8497 struct kqueue *kq = knote_get_kq(kn);
8498
8499 kqlock_held(kq);
8500 /* insert at head for sync ipc waiters */
8501 if (kn->kn_qos_override_is_sync) {
8502 TAILQ_INSERT_HEAD(queue, kn, kn_tqe);
8503 } else {
8504 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
8505 }
8506 kn->kn_status |= KN_QUEUED;
8507 kq->kq_count++;
8508 knote_update_sync_override_state(kn);
8509 return 1;
8510 }
8511 return ((kn->kn_status & KN_STAYACTIVE) != 0);
8512 }
8513
8514
8515 /* called with kqueue lock held */
8516 static void
8517 knote_dequeue(struct knote *kn)
8518 {
8519 struct kqueue *kq = knote_get_kq(kn);
8520 struct kqtailq *queue;
8521
8522 kqlock_held(kq);
8523
8524 if ((kn->kn_status & KN_QUEUED) == 0)
8525 return;
8526
8527 queue = knote_get_queue(kn);
8528 TAILQ_REMOVE(queue, kn, kn_tqe);
8529 kn->kn_status &= ~KN_QUEUED;
8530 kq->kq_count--;
8531 knote_update_sync_override_state(kn);
8532 }
8533
8534 void
8535 knote_init(void)
8536 {
8537 knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
8538 8192, "knote zone");
8539
8540 kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile),
8541 8192, "kqueue file zone");
8542
8543 kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq),
8544 8192, "kqueue workq zone");
8545
8546 kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192*sizeof(struct kqworkloop),
8547 8192, "kqueue workloop zone");
8548
8549 /* allocate kq lock group attribute and group */
8550 kq_lck_grp_attr = lck_grp_attr_alloc_init();
8551
8552 kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
8553
8554 /* Allocate kq lock attribute */
8555 kq_lck_attr = lck_attr_alloc_init();
8556
8557 /* Initialize the timer filter lock */
8558 lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
8559
8560 /* Initialize the user filter lock */
8561 lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr);
8562
8563 #if CONFIG_MEMORYSTATUS
8564 /* Initialize the memorystatus list lock */
8565 memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
8566 #endif
8567 }
8568 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
8569
8570 const struct filterops *
8571 knote_fops(struct knote *kn)
8572 {
8573 return sysfilt_ops[kn->kn_filtid];
8574 }
8575
8576 static struct knote *
8577 knote_alloc(void)
8578 {
8579 struct knote *kn;
8580 kn = ((struct knote *)zalloc(knote_zone));
8581 *kn = (struct knote) { .kn_qos_override = 0, .kn_qos_sync_override = 0, .kn_qos_override_is_sync = 0 };
8582 return kn;
8583 }
8584
8585 static void
8586 knote_free(struct knote *kn)
8587 {
8588 zfree(knote_zone, kn);
8589 }
8590
8591 #if SOCKETS
8592 #include <sys/param.h>
8593 #include <sys/socket.h>
8594 #include <sys/protosw.h>
8595 #include <sys/domain.h>
8596 #include <sys/mbuf.h>
8597 #include <sys/kern_event.h>
8598 #include <sys/malloc.h>
8599 #include <sys/sys_domain.h>
8600 #include <sys/syslog.h>
8601
8602 #ifndef ROUNDUP64
8603 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8604 #endif
8605
8606 #ifndef ADVANCE64
8607 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8608 #endif
8609
8610 static lck_grp_attr_t *kev_lck_grp_attr;
8611 static lck_attr_t *kev_lck_attr;
8612 static lck_grp_t *kev_lck_grp;
8613 static decl_lck_rw_data(,kev_lck_data);
8614 static lck_rw_t *kev_rwlock = &kev_lck_data;
8615
8616 static int kev_attach(struct socket *so, int proto, struct proc *p);
8617 static int kev_detach(struct socket *so);
8618 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8619 struct ifnet *ifp, struct proc *p);
8620 static lck_mtx_t * event_getlock(struct socket *, int);
8621 static int event_lock(struct socket *, int, void *);
8622 static int event_unlock(struct socket *, int, void *);
8623
8624 static int event_sofreelastref(struct socket *);
8625 static void kev_delete(struct kern_event_pcb *);
8626
8627 static struct pr_usrreqs event_usrreqs = {
8628 .pru_attach = kev_attach,
8629 .pru_control = kev_control,
8630 .pru_detach = kev_detach,
8631 .pru_soreceive = soreceive,
8632 };
8633
8634 static struct protosw eventsw[] = {
8635 {
8636 .pr_type = SOCK_RAW,
8637 .pr_protocol = SYSPROTO_EVENT,
8638 .pr_flags = PR_ATOMIC,
8639 .pr_usrreqs = &event_usrreqs,
8640 .pr_lock = event_lock,
8641 .pr_unlock = event_unlock,
8642 .pr_getlock = event_getlock,
8643 }
8644 };
8645
8646 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8647 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8648
8649 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8650 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
8651
8652 struct kevtstat kevtstat;
8653 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8654 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8655 kevt_getstat, "S,kevtstat", "");
8656
8657 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8658 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8659 kevt_pcblist, "S,xkevtpcb", "");
8660
8661 static lck_mtx_t *
8662 event_getlock(struct socket *so, int flags)
8663 {
8664 #pragma unused(flags)
8665 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8666
8667 if (so->so_pcb != NULL) {
8668 if (so->so_usecount < 0)
8669 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8670 so, so->so_usecount, solockhistory_nr(so));
8671 /* NOTREACHED */
8672 } else {
8673 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
8674 so, solockhistory_nr(so));
8675 /* NOTREACHED */
8676 }
8677 return (&ev_pcb->evp_mtx);
8678 }
8679
8680 static int
8681 event_lock(struct socket *so, int refcount, void *lr)
8682 {
8683 void *lr_saved;
8684
8685 if (lr == NULL)
8686 lr_saved = __builtin_return_address(0);
8687 else
8688 lr_saved = lr;
8689
8690 if (so->so_pcb != NULL) {
8691 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8692 } else {
8693 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
8694 so, lr_saved, solockhistory_nr(so));
8695 /* NOTREACHED */
8696 }
8697
8698 if (so->so_usecount < 0) {
8699 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
8700 so, so->so_pcb, lr_saved, so->so_usecount,
8701 solockhistory_nr(so));
8702 /* NOTREACHED */
8703 }
8704
8705 if (refcount)
8706 so->so_usecount++;
8707
8708 so->lock_lr[so->next_lock_lr] = lr_saved;
8709 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
8710 return (0);
8711 }
8712
8713 static int
8714 event_unlock(struct socket *so, int refcount, void *lr)
8715 {
8716 void *lr_saved;
8717 lck_mtx_t *mutex_held;
8718
8719 if (lr == NULL)
8720 lr_saved = __builtin_return_address(0);
8721 else
8722 lr_saved = lr;
8723
8724 if (refcount) {
8725 so->so_usecount--;
8726 }
8727 if (so->so_usecount < 0) {
8728 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8729 so, so->so_usecount, solockhistory_nr(so));
8730 /* NOTREACHED */
8731 }
8732 if (so->so_pcb == NULL) {
8733 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
8734 so, so->so_usecount, (void *)lr_saved,
8735 solockhistory_nr(so));
8736 /* NOTREACHED */
8737 }
8738 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8739
8740 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8741 so->unlock_lr[so->next_unlock_lr] = lr_saved;
8742 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
8743
8744 if (so->so_usecount == 0) {
8745 VERIFY(so->so_flags & SOF_PCBCLEARING);
8746 event_sofreelastref(so);
8747 } else {
8748 lck_mtx_unlock(mutex_held);
8749 }
8750
8751 return (0);
8752 }
8753
8754 static int
8755 event_sofreelastref(struct socket *so)
8756 {
8757 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8758
8759 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8760
8761 so->so_pcb = NULL;
8762
8763 /*
8764 * Disable upcall in the event another thread is in kev_post_msg()
8765 * appending a record to the receive socket buffer, since sbwakeup()
8766 * may release the socket lock otherwise.
8767 */
8768 so->so_rcv.sb_flags &= ~SB_UPCALL;
8769 so->so_snd.sb_flags &= ~SB_UPCALL;
8770 so->so_event = sonullevent;
8771 lck_mtx_unlock(&(ev_pcb->evp_mtx));
8772
8773 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8774 lck_rw_lock_exclusive(kev_rwlock);
8775 LIST_REMOVE(ev_pcb, evp_link);
8776 kevtstat.kes_pcbcount--;
8777 kevtstat.kes_gencnt++;
8778 lck_rw_done(kev_rwlock);
8779 kev_delete(ev_pcb);
8780
8781 sofreelastref(so, 1);
8782 return (0);
8783 }
8784
8785 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
8786
8787 static
8788 struct kern_event_head kern_event_head;
8789
8790 static u_int32_t static_event_id = 0;
8791
8792 #define EVPCB_ZONE_MAX 65536
8793 #define EVPCB_ZONE_NAME "kerneventpcb"
8794 static struct zone *ev_pcb_zone;
8795
8796 /*
8797 * Install the protosw's for the NKE manager. Invoked at extension load time
8798 */
8799 void
8800 kern_event_init(struct domain *dp)
8801 {
8802 struct protosw *pr;
8803 int i;
8804
8805 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8806 VERIFY(dp == systemdomain);
8807
8808 kev_lck_grp_attr = lck_grp_attr_alloc_init();
8809 if (kev_lck_grp_attr == NULL) {
8810 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
8811 /* NOTREACHED */
8812 }
8813
8814 kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
8815 kev_lck_grp_attr);
8816 if (kev_lck_grp == NULL) {
8817 panic("%s: lck_grp_alloc_init failed\n", __func__);
8818 /* NOTREACHED */
8819 }
8820
8821 kev_lck_attr = lck_attr_alloc_init();
8822 if (kev_lck_attr == NULL) {
8823 panic("%s: lck_attr_alloc_init failed\n", __func__);
8824 /* NOTREACHED */
8825 }
8826
8827 lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
8828 if (kev_rwlock == NULL) {
8829 panic("%s: lck_rw_init failed\n", __func__);
8830 /* NOTREACHED */
8831 }
8832
8833 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
8834 net_add_proto(pr, dp, 1);
8835
8836 ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
8837 EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
8838 if (ev_pcb_zone == NULL) {
8839 panic("%s: failed allocating ev_pcb_zone", __func__);
8840 /* NOTREACHED */
8841 }
8842 zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
8843 zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
8844 }
8845
8846 static int
8847 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8848 {
8849 int error = 0;
8850 struct kern_event_pcb *ev_pcb;
8851
8852 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8853 if (error != 0)
8854 return (error);
8855
8856 if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
8857 return (ENOBUFS);
8858 }
8859 bzero(ev_pcb, sizeof(struct kern_event_pcb));
8860 lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
8861
8862 ev_pcb->evp_socket = so;
8863 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8864
8865 so->so_pcb = (caddr_t) ev_pcb;
8866 lck_rw_lock_exclusive(kev_rwlock);
8867 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8868 kevtstat.kes_pcbcount++;
8869 kevtstat.kes_gencnt++;
8870 lck_rw_done(kev_rwlock);
8871
8872 return (error);
8873 }
8874
8875 static void
8876 kev_delete(struct kern_event_pcb *ev_pcb)
8877 {
8878 VERIFY(ev_pcb != NULL);
8879 lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
8880 zfree(ev_pcb_zone, ev_pcb);
8881 }
8882
8883 static int
8884 kev_detach(struct socket *so)
8885 {
8886 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8887
8888 if (ev_pcb != NULL) {
8889 soisdisconnected(so);
8890 so->so_flags |= SOF_PCBCLEARING;
8891 }
8892
8893 return (0);
8894 }
8895
8896 /*
8897 * For now, kev_vendor_code and mbuf_tags use the same
8898 * mechanism.
8899 */
8900 errno_t kev_vendor_code_find(
8901 const char *string,
8902 u_int32_t *out_vendor_code)
8903 {
8904 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8905 return (EINVAL);
8906 }
8907 return (net_str_id_find_internal(string, out_vendor_code,
8908 NSI_VENDOR_CODE, 1));
8909 }
8910
8911 errno_t
8912 kev_msg_post(struct kev_msg *event_msg)
8913 {
8914 mbuf_tag_id_t min_vendor, max_vendor;
8915
8916 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8917
8918 if (event_msg == NULL)
8919 return (EINVAL);
8920
8921 /*
8922 * Limit third parties to posting events for registered vendor codes
8923 * only
8924 */
8925 if (event_msg->vendor_code < min_vendor ||
8926 event_msg->vendor_code > max_vendor) {
8927 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
8928 return (EINVAL);
8929 }
8930 return (kev_post_msg(event_msg));
8931 }
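
/*
 * Hedged usage sketch (not part of the original file): an in-kernel client
 * typically resolves its vendor code once with kev_vendor_code_find() and
 * then fills in a struct kev_msg before calling kev_msg_post().  The vendor
 * string, class/subclass/event values and payload below are hypothetical;
 * note that the header plus all data vectors must fit in a single mbuf
 * (see the MLEN check in kev_post_msg() below).
 *
 *	struct kev_msg ev_msg;
 *	u_int32_t vendor_code;
 *	struct my_payload payload;			(hypothetical payload)
 *
 *	if (kev_vendor_code_find("com.example.driver", &vendor_code) != 0)
 *		return;
 *
 *	bzero(&ev_msg, sizeof(ev_msg));
 *	ev_msg.vendor_code = vendor_code;
 *	ev_msg.kev_class = 1;				(vendor-defined class)
 *	ev_msg.kev_subclass = 1;			(vendor-defined subclass)
 *	ev_msg.event_code = 1;				(vendor-defined event)
 *	ev_msg.dv[0].data_length = sizeof(payload);	(up to 5 vectors,
 *	ev_msg.dv[0].data_ptr = &payload;		 terminated by length 0)
 *	(void)kev_msg_post(&ev_msg);
 */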
8932
8933 int
8934 kev_post_msg(struct kev_msg *event_msg)
8935 {
8936 struct mbuf *m, *m2;
8937 struct kern_event_pcb *ev_pcb;
8938 struct kern_event_msg *ev;
8939 char *tmp;
8940 u_int32_t total_size;
8941 int i;
8942
8943 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8944 total_size = KEV_MSG_HEADER_SIZE;
8945
8946 for (i = 0; i < 5; i++) {
8947 if (event_msg->dv[i].data_length == 0)
8948 break;
8949 total_size += event_msg->dv[i].data_length;
8950 }
8951
8952 if (total_size > MLEN) {
8953 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
8954 return (EMSGSIZE);
8955 }
8956
8957 m = m_get(M_WAIT, MT_DATA);
8958 if (m == 0) {
8959 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
8960 return (ENOMEM);
8961 }
8962 ev = mtod(m, struct kern_event_msg *);
8963 total_size = KEV_MSG_HEADER_SIZE;
8964
8965 tmp = (char *) &ev->event_data[0];
8966 for (i = 0; i < 5; i++) {
8967 if (event_msg->dv[i].data_length == 0)
8968 break;
8969
8970 total_size += event_msg->dv[i].data_length;
8971 bcopy(event_msg->dv[i].data_ptr, tmp,
8972 event_msg->dv[i].data_length);
8973 tmp += event_msg->dv[i].data_length;
8974 }
8975
8976 ev->id = ++static_event_id;
8977 ev->total_size = total_size;
8978 ev->vendor_code = event_msg->vendor_code;
8979 ev->kev_class = event_msg->kev_class;
8980 ev->kev_subclass = event_msg->kev_subclass;
8981 ev->event_code = event_msg->event_code;
8982
8983 m->m_len = total_size;
8984 lck_rw_lock_shared(kev_rwlock);
8985 for (ev_pcb = LIST_FIRST(&kern_event_head);
8986 ev_pcb;
8987 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8988 lck_mtx_lock(&ev_pcb->evp_mtx);
8989 if (ev_pcb->evp_socket->so_pcb == NULL) {
8990 lck_mtx_unlock(&ev_pcb->evp_mtx);
8991 continue;
8992 }
8993 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8994 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8995 lck_mtx_unlock(&ev_pcb->evp_mtx);
8996 continue;
8997 }
8998
8999 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
9000 if (ev_pcb->evp_class_filter != ev->kev_class) {
9001 lck_mtx_unlock(&ev_pcb->evp_mtx);
9002 continue;
9003 }
9004
9005 if ((ev_pcb->evp_subclass_filter !=
9006 KEV_ANY_SUBCLASS) &&
9007 (ev_pcb->evp_subclass_filter !=
9008 ev->kev_subclass)) {
9009 lck_mtx_unlock(&ev_pcb->evp_mtx);
9010 continue;
9011 }
9012 }
9013 }
9014
9015 m2 = m_copym(m, 0, m->m_len, M_WAIT);
9016 if (m2 == 0) {
9017 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
9018 m_free(m);
9019 lck_mtx_unlock(&ev_pcb->evp_mtx);
9020 lck_rw_done(kev_rwlock);
9021 return (ENOMEM);
9022 }
9023 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
9024 /*
9025 * We use "m" for the socket stats as it would be
9026 * unsafe to use "m2"
9027 */
9028 so_inc_recv_data_stat(ev_pcb->evp_socket,
9029 1, m->m_len, MBUF_TC_BE);
9030
9031 sorwakeup(ev_pcb->evp_socket);
9032 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
9033 } else {
9034 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
9035 }
9036 lck_mtx_unlock(&ev_pcb->evp_mtx);
9037 }
9038 m_free(m);
9039 lck_rw_done(kev_rwlock);
9040
9041 return (0);
9042 }
9043
9044 static int
9045 kev_control(struct socket *so,
9046 u_long cmd,
9047 caddr_t data,
9048 __unused struct ifnet *ifp,
9049 __unused struct proc *p)
9050 {
9051 struct kev_request *kev_req = (struct kev_request *) data;
9052 struct kern_event_pcb *ev_pcb;
9053 struct kev_vendor_code *kev_vendor;
9054 u_int32_t *id_value = (u_int32_t *) data;
9055
9056 switch (cmd) {
9057 case SIOCGKEVID:
9058 *id_value = static_event_id;
9059 break;
9060 case SIOCSKEVFILT:
9061 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9062 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
9063 ev_pcb->evp_class_filter = kev_req->kev_class;
9064 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
9065 break;
9066 case SIOCGKEVFILT:
9067 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9068 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
9069 kev_req->kev_class = ev_pcb->evp_class_filter;
9070 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
9071 break;
9072 case SIOCGKEVVENDOR:
9073 kev_vendor = (struct kev_vendor_code *)data;
9074 /* Make sure string is NULL terminated */
9075 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
9076 return (net_str_id_find_internal(kev_vendor->vendor_string,
9077 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
9078 default:
9079 return (ENOTSUP);
9080 }
9081
9082 return (0);
9083 }
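
/*
 * Hedged usage sketch (not part of the original file): a userspace client
 * receives these events over a PF_SYSTEM/SYSPROTO_EVENT socket and narrows
 * what it sees with SIOCSKEVFILT (handled by kev_control() above).  The
 * PF_SYSTEM constant and header locations are assumptions about the
 * userspace-visible definitions in <sys/kern_event.h>.
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_ANY_VENDOR,
 *		.kev_class    = KEV_ANY_CLASS,
 *		.kev_subclass = KEV_ANY_SUBCLASS,
 *	};
 *	ioctl(fd, SIOCSKEVFILT, &req);
 *
 *	char buf[1024];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	struct kern_event_msg *msg = (struct kern_event_msg *)(void *)buf;
 *	(msg->vendor_code, msg->kev_class, msg->event_code and the payload
 *	 at msg->event_data are now usable if n > 0)
 */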
9084
9085 int
9086 kevt_getstat SYSCTL_HANDLER_ARGS
9087 {
9088 #pragma unused(oidp, arg1, arg2)
9089 int error = 0;
9090
9091 lck_rw_lock_shared(kev_rwlock);
9092
9093 if (req->newptr != USER_ADDR_NULL) {
9094 error = EPERM;
9095 goto done;
9096 }
9097 if (req->oldptr == USER_ADDR_NULL) {
9098 req->oldidx = sizeof(struct kevtstat);
9099 goto done;
9100 }
9101
9102 error = SYSCTL_OUT(req, &kevtstat,
9103 MIN(sizeof(struct kevtstat), req->oldlen));
9104 done:
9105 lck_rw_done(kev_rwlock);
9106
9107 return (error);
9108 }
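
/*
 * Hedged usage sketch (not part of the original file): the counters exported
 * by kevt_getstat() can be read from userspace with sysctlbyname().  The MIB
 * name "net.systm.kevt.stats" is inferred from the SYSCTL_NODE/SYSCTL_PROC
 * declarations above; the userspace visibility of struct kevtstat is an
 * assumption.
 *
 *	struct kevtstat st;
 *	size_t len = sizeof(st);
 *	if (sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0) == 0)
 *		printf("posted %llu, dropped on full socket %llu\n",
 *		    (unsigned long long)st.kes_posted,
 *		    (unsigned long long)st.kes_fullsock);
 */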
9109
9110 __private_extern__ int
9111 kevt_pcblist SYSCTL_HANDLER_ARGS
9112 {
9113 #pragma unused(oidp, arg1, arg2)
9114 int error = 0;
9115 int n, i;
9116 struct xsystmgen xsg;
9117 void *buf = NULL;
9118 size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
9119 ROUNDUP64(sizeof (struct xsocket_n)) +
9120 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
9121 ROUNDUP64(sizeof (struct xsockstat_n));
9122 struct kern_event_pcb *ev_pcb;
9123
9124 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
9125 if (buf == NULL)
9126 return (ENOMEM);
9127
9128 lck_rw_lock_shared(kev_rwlock);
9129
9130 n = kevtstat.kes_pcbcount;
9131
9132 if (req->oldptr == USER_ADDR_NULL) {
9133 req->oldidx = (n + n/8) * item_size;
9134 goto done;
9135 }
9136 if (req->newptr != USER_ADDR_NULL) {
9137 error = EPERM;
9138 goto done;
9139 }
9140 bzero(&xsg, sizeof (xsg));
9141 xsg.xg_len = sizeof (xsg);
9142 xsg.xg_count = n;
9143 xsg.xg_gen = kevtstat.kes_gencnt;
9144 xsg.xg_sogen = so_gencnt;
9145 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
9146 if (error) {
9147 goto done;
9148 }
9149 /*
9150 * We are done if there is no pcb
9151 */
9152 if (n == 0) {
9153 goto done;
9154 }
9155
9156 i = 0;
9157 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
9158 i < n && ev_pcb != NULL;
9159 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
9160 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
9161 struct xsocket_n *xso = (struct xsocket_n *)
9162 ADVANCE64(xk, sizeof (*xk));
9163 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
9164 ADVANCE64(xso, sizeof (*xso));
9165 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
9166 ADVANCE64(xsbrcv, sizeof (*xsbrcv));
9167 struct xsockstat_n *xsostats = (struct xsockstat_n *)
9168 ADVANCE64(xsbsnd, sizeof (*xsbsnd));
9169
9170 bzero(buf, item_size);
9171
9172 lck_mtx_lock(&ev_pcb->evp_mtx);
9173
9174 xk->kep_len = sizeof(struct xkevtpcb);
9175 xk->kep_kind = XSO_EVT;
9176 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
9177 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
9178 xk->kep_class_filter = ev_pcb->evp_class_filter;
9179 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
9180
9181 sotoxsocket_n(ev_pcb->evp_socket, xso);
9182 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9183 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
9184 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9185 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
9186 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
9187
9188 lck_mtx_unlock(&ev_pcb->evp_mtx);
9189
9190 error = SYSCTL_OUT(req, buf, item_size);
9191 }
9192
9193 if (error == 0) {
9194 /*
9195 * Give the user an updated idea of our state.
9196 * If the generation differs from what we told
9197 * her before, she knows that something happened
9198 * while we were processing this request, and it
9199 * might be necessary to retry.
9200 */
9201 bzero(&xsg, sizeof (xsg));
9202 xsg.xg_len = sizeof (xsg);
9203 xsg.xg_count = n;
9204 xsg.xg_gen = kevtstat.kes_gencnt;
9205 xsg.xg_sogen = so_gencnt;
9206 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
9207 if (error) {
9208 goto done;
9209 }
9210 }
9211
9212 done:
9213 lck_rw_done(kev_rwlock);
9214
9215 return (error);
9216 }
9217
9218 #endif /* SOCKETS */
9219
9220
9221 int
9222 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
9223 {
9224 struct vinfo_stat * st;
9225
9226 st = &kinfo->kq_stat;
9227
9228 st->vst_size = kq->kq_count;
9229 if (kq->kq_state & KQ_KEV_QOS)
9230 st->vst_blksize = sizeof(struct kevent_qos_s);
9231 else if (kq->kq_state & KQ_KEV64)
9232 st->vst_blksize = sizeof(struct kevent64_s);
9233 else
9234 st->vst_blksize = sizeof(struct kevent);
9235 st->vst_mode = S_IFIFO;
9236 st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
9237 ((struct kqworkloop *)kq)->kqwl_dynamicid : 0;
9238
9239 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
9240 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
9241 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
9242
9243 return (0);
9244 }
9245
9246 static int
9247 fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
9248 {
9249 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
9250 struct kqrequest *kqr = &kqwl->kqwl_request;
9251 int err;
9252
9253 if ((kq->kq_state & KQ_WORKLOOP) == 0) {
9254 return EINVAL;
9255 }
9256
9257 if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) {
9258 return err;
9259 }
9260
9261 kqwl_req_lock(kqwl);
9262
9263 if (kqr->kqr_thread) {
9264 kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
9265 }
9266
9267 if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
9268 kqdi->kqdi_owner = ~0ull;
9269 } else {
9270 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
9271 }
9272
9273 kqdi->kqdi_request_state = kqr->kqr_state;
9274 kqdi->kqdi_async_qos = kqr->kqr_qos_index;
9275 kqdi->kqdi_events_qos = kqr->kqr_override_index;
9276 kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
9277 kqdi->kqdi_sync_waiter_qos = kqr->kqr_dsync_waiters_qos;
9278
9279 kqwl_req_unlock(kqwl);
9280
9281 return 0;
9282 }
9283
9284
9285 void
9286 knote_markstayactive(struct knote *kn)
9287 {
9288 struct kqueue *kq = knote_get_kq(kn);
9289
9290 kqlock(kq);
9291 kn->kn_status |= KN_STAYACTIVE;
9292
9293 /*
9294 * Making a knote stay active is a property of the knote that must be
9295 * established before it is fully attached.
9296 */
9297 assert(kn->kn_status & KN_ATTACHING);
9298
9299 /* handle all stayactive knotes on the (appropriate) manager */
9300 if (kq->kq_state & KQ_WORKQ) {
9301 knote_set_qos_index(kn, KQWQ_QOS_MANAGER);
9302 } else if (kq->kq_state & KQ_WORKLOOP) {
9303 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
9304 kqwl_req_lock(kqwl);
9305 assert(kn->kn_req_index && kn->kn_req_index < THREAD_QOS_LAST);
9306 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
9307 kn->kn_req_index);
9308 kqwl_req_unlock(kqwl);
9309 knote_set_qos_index(kn, KQWL_BUCKET_STAYACTIVE);
9310 }
9311
9312 knote_activate(kn);
9313 kqunlock(kq);
9314 }
9315
9316 void
9317 knote_clearstayactive(struct knote *kn)
9318 {
9319 kqlock(knote_get_kq(kn));
9320 kn->kn_status &= ~KN_STAYACTIVE;
9321 knote_deactivate(kn);
9322 kqunlock(knote_get_kq(kn));
9323 }
9324
9325 static unsigned long
9326 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
9327 unsigned long buflen, unsigned long nknotes)
9328 {
9329 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
9330 if (kq == knote_get_kq(kn)) {
9331 if (nknotes < buflen) {
9332 struct kevent_extinfo *info = &buf[nknotes];
9333 struct kevent_internal_s *kevp = &kn->kn_kevent;
9334
9335 kqlock(kq);
9336
9337 info->kqext_kev = (struct kevent_qos_s){
9338 .ident = kevp->ident,
9339 .filter = kevp->filter,
9340 .flags = kevp->flags,
9341 .fflags = kevp->fflags,
9342 .data = (int64_t)kevp->data,
9343 .udata = kevp->udata,
9344 .ext[0] = kevp->ext[0],
9345 .ext[1] = kevp->ext[1],
9346 .ext[2] = kevp->ext[2],
9347 .ext[3] = kevp->ext[3],
9348 .qos = kn->kn_req_index,
9349 };
9350 info->kqext_sdata = kn->kn_sdata;
9351 info->kqext_status = kn->kn_status;
9352 info->kqext_sfflags = kn->kn_sfflags;
9353
9354 kqunlock(kq);
9355 }
9356
9357 /* we return total number of knotes, which may be more than requested */
9358 nknotes++;
9359 }
9360 }
9361
9362 return nknotes;
9363 }
9364
9365 int
9366 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
9367 int32_t *nkqueues_out)
9368 {
9369 proc_t p = (proc_t)proc;
9370 struct filedesc *fdp = p->p_fd;
9371 unsigned int nkqueues = 0;
9372 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
9373 size_t buflen, bufsize;
9374 kqueue_id_t *kq_ids = NULL;
9375 int err = 0;
9376
9377 assert(p != NULL);
9378
9379 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
9380 err = EINVAL;
9381 goto out;
9382 }
9383
9384 buflen = min(ubuflen, PROC_PIDDYNKQUEUES_MAX);
9385
9386 if (ubuflen != 0) {
9387 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
9388 err = ERANGE;
9389 goto out;
9390 }
9391 kq_ids = kalloc(bufsize);
9392 assert(kq_ids != NULL);
9393 }
9394
9395 kqhash_lock(p);
9396
9397 if (fdp->fd_kqhashmask > 0) {
9398 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
9399 struct kqworkloop *kqwl;
9400
9401 SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9402 /* report the number of kqueues, even if they don't all fit */
9403 if (nkqueues < buflen) {
9404 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
9405 }
9406 nkqueues++;
9407 }
9408 }
9409 }
9410
9411 kqhash_unlock(p);
9412
9413 if (kq_ids) {
9414 size_t copysize;
9415 if (os_mul_overflow(sizeof(kqueue_id_t), min(ubuflen, nkqueues), &copysize)) {
9416 err = ERANGE;
9417 goto out;
9418 }
9419
9420 assert(ubufsize >= copysize);
9421 err = copyout(kq_ids, ubuf, copysize);
9422 }
9423
9424 out:
9425 if (kq_ids) {
9426 kfree(kq_ids, bufsize);
9427 }
9428
9429 if (!err) {
9430 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9431 }
9432 return err;
9433 }
9434
9435 int
9436 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9437 uint32_t ubufsize, int32_t *size_out)
9438 {
9439 proc_t p = (proc_t)proc;
9440 struct kqueue *kq;
9441 int err = 0;
9442 struct kqueue_dyninfo kqdi = { };
9443
9444 assert(p != NULL);
9445
9446 if (ubufsize < sizeof(struct kqueue_info)) {
9447 return ENOBUFS;
9448 }
9449
9450 kqhash_lock(p);
9451 kq = kqueue_hash_lookup(p, kq_id);
9452 if (!kq) {
9453 kqhash_unlock(p);
9454 return ESRCH;
9455 }
9456 kqueue_retain(kq);
9457 kqhash_unlock(p);
9458
9459 /*
9460 * backward compatibility: allow the argument to this call to only be
9461 * a struct kqueue_info
9462 */
9463 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9464 ubufsize = sizeof(struct kqueue_dyninfo);
9465 err = fill_kqueue_dyninfo(kq, &kqdi);
9466 } else {
9467 ubufsize = sizeof(struct kqueue_info);
9468 err = fill_kqueueinfo(kq, &kqdi.kqdi_info);
9469 }
9470 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9471 *size_out = ubufsize;
9472 }
9473 kqueue_release_last(p, kq);
9474 return err;
9475 }
9476
9477 int
9478 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9479 uint32_t ubufsize, int32_t *nknotes_out)
9480 {
9481 proc_t p = (proc_t)proc;
9482 struct kqueue *kq;
9483 int err;
9484
9485 assert(p != NULL);
9486
9487 kqhash_lock(p);
9488 kq = kqueue_hash_lookup(p, kq_id);
9489 if (!kq) {
9490 kqhash_unlock(p);
9491 return ESRCH;
9492 }
9493 kqueue_retain(kq);
9494 kqhash_unlock(p);
9495
9496 err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out);
9497 kqueue_release_last(p, kq);
9498 return err;
9499 }
9500
9501 int
9502 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9503 uint32_t bufsize, int32_t *retval)
9504 {
9505 struct knote *kn;
9506 int i;
9507 int err = 0;
9508 struct filedesc *fdp = p->p_fd;
9509 unsigned long nknotes = 0;
9510 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9511 struct kevent_extinfo *kqext = NULL;
9512
9513 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
9514 buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9515
9516 kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
9517 if (kqext == NULL) {
9518 err = ENOMEM;
9519 goto out;
9520 }
9521 bzero(kqext, buflen * sizeof(struct kevent_extinfo));
9522
9523 proc_fdlock(p);
9524 for (i = 0; i < fdp->fd_knlistsize; i++) {
9525 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
9526 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9527 }
9528 proc_fdunlock(p);
9529
9530 if (fdp->fd_knhashmask != 0) {
9531 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
9532 kqhash_lock(p);
9533 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9534 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9535 kqhash_unlock(p);
9536 }
9537 }
9538
9539 assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
9540 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
9541
9542 out:
9543 if (kqext) {
9544 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
9545 kqext = NULL;
9546 }
9547
9548 if (!err) {
9549 *retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9550 }
9551 return err;
9552 }
9553
9554 static unsigned int
9555 klist_copy_udata(struct klist *list, uint64_t *buf,
9556 unsigned int buflen, unsigned int nknotes)
9557 {
9558 struct kevent_internal_s *kev;
9559 struct knote *kn;
9560 SLIST_FOREACH(kn, list, kn_link) {
9561 if (nknotes < buflen) {
9562 struct kqueue *kq = knote_get_kq(kn);
9563 kqlock(kq);
9564 kev = &(kn->kn_kevent);
9565 buf[nknotes] = kev->udata;
9566 kqunlock(kq);
9567 }
9568 /* we return total number of knotes, which may be more than requested */
9569 nknotes++;
9570 }
9571
9572 return nknotes;
9573 }
9574
9575 static unsigned int
9576 kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list,
9577 uint64_t *buf, unsigned int buflen, unsigned int nids)
9578 {
9579 kqhash_lock_held(p);
9580 struct kqworkloop *kqwl;
9581 SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
9582 if (nids < buflen) {
9583 buf[nids] = kqwl->kqwl_dynamicid;
9584 }
9585 nids++;
9586 }
9587 return nids;
9588 }
9589
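/*
 * Gather the user-visible pointers associated with the process's kevent
 * state: the udata of every knote (fd-indexed and hashed) followed by the
 * dynamic IDs of its workloop kqueues.  Returns the total number of values
 * found, which may exceed the number that fit in `buf`.
 */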
9590 int
9591 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
9592 {
9593 proc_t p = (proc_t)proc;
9594 struct filedesc *fdp = p->p_fd;
9595 unsigned int nuptrs = 0;
9596 unsigned long buflen = bufsize / sizeof(uint64_t);
9597
9598 if (buflen > 0) {
9599 assert(buf != NULL);
9600 }
9601
9602 proc_fdlock(p);
9603 for (int i = 0; i < fdp->fd_knlistsize; i++) {
9604 nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
9605 }
9606 knhash_lock(p);
9607 proc_fdunlock(p);
9608 if (fdp->fd_knhashmask != 0) {
9609 for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
9610 nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9611 }
9612 }
9613 knhash_unlock(p);
9614
9615 kqhash_lock(p);
9616 if (fdp->fd_kqhashmask != 0) {
9617 for (int i = 0; i < (int)fdp->fd_kqhashmask + 1; i++) {
9618 nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen,
9619 nuptrs);
9620 }
9621 }
9622 kqhash_unlock(p);
9623
9624 return (int)nuptrs;
9625 }
9626
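/*
 * Ask the pthread shims (workq_threadreq) to re-drive the process's
 * workqueue thread request; 0 and ECANCELED are the only outcomes the
 * assert below treats as expected.
 */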
9627 static void
9628 kevent_redrive_proc_thread_request(proc_t p)
9629 {
9630 __assert_only int ret;
9631 ret = (*pthread_functions->workq_threadreq)(p, NULL, WORKQ_THREADREQ_REDRIVE, 0, 0);
9632 assert(ret == 0 || ret == ECANCELED);
9633 }
9634
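/*
 * Publish "return to kernel" flags into the thread's return-to-kernel TSD
 * slot (thread_rettokern_addr()) so userspace can observe that events are
 * pending on the workq or workloop kqueue the thread is bound to.  The
 * value is written with the word width of the target process (4 or 8
 * bytes).
 */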
9635 static void
9636 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9637 {
9638 uint64_t ast_addr;
9639 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9640 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9641 uint32_t ast_flags32 = 0;
9642 uint64_t ast_flags64 = 0;
9643 struct uthread *ut = get_bsdthread_info(thread);
9644
9645 if (ut->uu_kqueue_bound != NULL) {
9646 if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKLOOP) {
9647 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9648 } else if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ) {
9649 ast_flags64 |= R2K_WORKQ_PENDING_EVENTS;
9650 }
9651 }
9652
9653 if (ast_flags64 == 0) {
9654 return;
9655 }
9656
9657 if (!(p->p_flag & P_LP64)) {
9658 ast_flags32 = (uint32_t)ast_flags64;
9659 assert(ast_flags64 < 0x100000000ull);
9660 }
9661
9662 ast_addr = thread_rettokern_addr(thread);
9663 if (ast_addr == 0) {
9664 return;
9665 }
9666
9667 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9668 (user_addr_t)ast_addr,
9669 user_addr_size) != 0) {
9670 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9671 "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
9672 }
9673 }
9674
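/*
 * AST handler for kevent work deferred until the thread returns to user
 * space: redrive the workqueue thread request and/or publish the
 * pending-events flags, depending on which bits are set.
 */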
9675 void
9676 kevent_ast(thread_t thread, uint16_t bits)
9677 {
9678 proc_t p = current_proc();
9679
9680 if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9681 kevent_redrive_proc_thread_request(p);
9682 }
9683 if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9684 kevent_set_return_to_kernel_user_tsd(p, thread);
9685 }
9686 }
9687
9688 #if DEVELOPMENT || DEBUG
9689
9690 #define KEVENT_SYSCTL_BOUND_ID 1
9691
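/*
 * Debug-only sysctl handler: report the dynamic ID of the kqueue the
 * current thread is bound to, (uint64_t)-1 if it is bound to the workq
 * kqueue, or 0 if it is not bound at all.  Read-only; writes are rejected.
 */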
9692 static int
9693 kevent_sysctl SYSCTL_HANDLER_ARGS
9694 {
9695 #pragma unused(oidp, arg2)
9696 uintptr_t type = (uintptr_t)arg1;
9697 uint64_t bound_id = 0;
9698 struct uthread *ut;
9699 struct kqueue *kq;
9700
9701 if (type != KEVENT_SYSCTL_BOUND_ID) {
9702 return EINVAL;
9703 }
9704
9705 if (req->newptr) {
9706 return EINVAL;
9707 }
9708
9709 ut = get_bsdthread_info(current_thread());
9710 if (!ut) {
9711 return EFAULT;
9712 }
9713
9714 kq = ut->uu_kqueue_bound;
9715 if (kq) {
9716 if (kq->kq_state & KQ_WORKLOOP) {
9717 bound_id = ((struct kqworkloop *)kq)->kqwl_dynamicid;
9718 } else if (kq->kq_state & KQ_WORKQ) {
9719 bound_id = -1;
9720 }
9721 }
9722
9723 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9724 }
9725
9726 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9727 "kevent information");
9728
9729 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9730 CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9731 (void *)KEVENT_SYSCTL_BOUND_ID,
9732 sizeof(kqueue_id_t), kevent_sysctl, "Q",
9733 "get the ID of the bound kqueue");
9734
9735 #endif /* DEVELOPMENT || DEBUG */