apple/xnu (xnu-3789.1.32) - bsd/kern/kern_event.c
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/filedesc.h>
62 #include <sys/kernel.h>
63 #include <sys/proc_internal.h>
64 #include <sys/kauth.h>
65 #include <sys/malloc.h>
66 #include <sys/unistd.h>
67 #include <sys/file_internal.h>
68 #include <sys/fcntl.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/uio.h>
79 #include <sys/sysproto.h>
80 #include <sys/user.h>
81 #include <sys/vnode_internal.h>
82 #include <string.h>
83 #include <sys/proc_info.h>
84 #include <sys/codesign.h>
85 #include <sys/pthread_shims.h>
86
87 #include <kern/locks.h>
88 #include <kern/clock.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_call.h>
91 #include <kern/sched_prim.h>
92 #include <kern/waitq.h>
93 #include <kern/zalloc.h>
94 #include <kern/kalloc.h>
95 #include <kern/assert.h>
96
97 #include <machine/spl.h>
98
99 #include <libkern/libkern.h>
100 #include "net/net_str_id.h"
101
102 #include <mach/task.h>
103
104 #if CONFIG_MEMORYSTATUS
105 #include <sys/kern_memorystatus.h>
106 #endif
107
108 /*
109 * JMM - this typedef needs to be unified with pthread_priority_t
110 * and mach_msg_priority_t. It also needs to be the same type
111 * everywhere.
112 */
113 typedef int32_t qos_t;
114
115 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
116
117 #define KQ_EVENT NO_EVENT64
118
119 static inline void kqlock(struct kqueue *kq);
120 static inline void kqunlock(struct kqueue *kq);
121
122 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
123 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
124 static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn);
125 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int defer_drop);
126
127 static int kqueue_read(struct fileproc *fp, struct uio *uio,
128 int flags, vfs_context_t ctx);
129 static int kqueue_write(struct fileproc *fp, struct uio *uio,
130 int flags, vfs_context_t ctx);
131 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
132 vfs_context_t ctx);
133 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
134 vfs_context_t ctx);
135 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
136 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
137 vfs_context_t ctx);
138 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
139
140 static const struct fileops kqueueops = {
141 .fo_type = DTYPE_KQUEUE,
142 .fo_read = kqueue_read,
143 .fo_write = kqueue_write,
144 .fo_ioctl = kqueue_ioctl,
145 .fo_select = kqueue_select,
146 .fo_close = kqueue_close,
147 .fo_kqfilter = kqueue_kqfilter,
148 .fo_drain = kqueue_drain,
149 };
150
151 static int kevent_internal(struct proc *p, int fd,
152 user_addr_t changelist, int nchanges,
153 user_addr_t eventlist, int nevents,
154 user_addr_t data_out, uint64_t data_available,
155 unsigned int flags, user_addr_t utimeout,
156 kqueue_continue_t continuation,
157 int32_t *retval);
158 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
159 struct proc *p, unsigned int flags);
160 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
161 struct proc *p, unsigned int flags);
162 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
163
164 static void kqueue_interrupt(struct kqueue *kq);
165 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
166 void *data);
167 static void kevent_continue(struct kqueue *kq, void *data, int error);
168 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
169 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
170 struct filt_process_s *process_data, kq_index_t servicer_qos_index,
171 int *countp, struct proc *p);
172 static int kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags);
173 static void kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags);
174 static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index);
175 static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index);
176 static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
177
178 static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index);
179
180 static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index);
181 static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index, uint32_t type);
182 static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index);
183 static void kqworkq_bind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
184 static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
185 static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186
187
188 static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
189 struct filt_process_s *process_data, struct proc *p);
190 #if 0
191 static void knote_put(struct knote *kn);
192 #endif
193
194 static int knote_fdadd(struct knote *kn, struct proc *p);
195 static void knote_fdremove(struct knote *kn, struct proc *p);
196 static struct knote *knote_fdfind(struct kqueue *kq, struct kevent_internal_s *kev, struct proc *p);
197
198 static void knote_drop(struct knote *kn, struct proc *p);
199 static struct knote *knote_alloc(void);
200 static void knote_free(struct knote *kn);
201
202 static void knote_activate(struct knote *kn);
203 static void knote_deactivate(struct knote *kn);
204
205 static void knote_enable(struct knote *kn);
206 static void knote_disable(struct knote *kn);
207
208 static int knote_enqueue(struct knote *kn);
209 static void knote_dequeue(struct knote *kn);
210
211 static void knote_suppress(struct knote *kn);
212 static void knote_unsuppress(struct knote *kn);
213 static void knote_wakeup(struct knote *kn);
214
215 static kq_index_t knote_get_queue_index(struct knote *kn);
216 static struct kqtailq *knote_get_queue(struct knote *kn);
217 static struct kqtailq *knote_get_suppressed_queue(struct knote *kn);
218 static kq_index_t knote_get_req_index(struct knote *kn);
219 static kq_index_t knote_get_qos_index(struct knote *kn);
220 static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index);
221 static kq_index_t knote_get_qos_override_index(struct knote *kn);
222 static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index);
223
224 static int filt_fileattach(struct knote *kn);
225 static struct filterops file_filtops = {
226 .f_isfd = 1,
227 .f_attach = filt_fileattach,
228 };
229
230 static void filt_kqdetach(struct knote *kn);
231 static int filt_kqueue(struct knote *kn, long hint);
232 static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev);
233 static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
234 static struct filterops kqread_filtops = {
235 .f_isfd = 1,
236 .f_detach = filt_kqdetach,
237 .f_event = filt_kqueue,
238 .f_touch = filt_kqtouch,
239 .f_process = filt_kqprocess,
240 };
241
242 /* placeholder for not-yet-implemented filters */
243 static int filt_badattach(struct knote *kn);
244 static struct filterops bad_filtops = {
245 .f_attach = filt_badattach,
246 };
247
248 static int filt_procattach(struct knote *kn);
249 static void filt_procdetach(struct knote *kn);
250 static int filt_proc(struct knote *kn, long hint);
251 static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev);
252 static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
253 static struct filterops proc_filtops = {
254 .f_attach = filt_procattach,
255 .f_detach = filt_procdetach,
256 .f_event = filt_proc,
257 .f_touch = filt_proctouch,
258 .f_process = filt_procprocess,
259 };
260
261 #if CONFIG_MEMORYSTATUS
262 extern struct filterops memorystatus_filtops;
263 #endif /* CONFIG_MEMORYSTATUS */
264
265 extern struct filterops fs_filtops;
266
267 extern struct filterops sig_filtops;
268
269 /* Timer filter */
270 static int filt_timerattach(struct knote *kn);
271 static void filt_timerdetach(struct knote *kn);
272 static int filt_timer(struct knote *kn, long hint);
273 static int filt_timertouch(struct knote *kn, struct kevent_internal_s *kev);
274 static int filt_timerprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
275 static struct filterops timer_filtops = {
276 .f_attach = filt_timerattach,
277 .f_detach = filt_timerdetach,
278 .f_event = filt_timer,
279 .f_touch = filt_timertouch,
280 .f_process = filt_timerprocess,
281 };
282
283 /* Helpers */
284 static void filt_timerexpire(void *knx, void *param1);
285 static int filt_timervalidate(struct knote *kn);
286 static void filt_timerupdate(struct knote *kn, int num_fired);
287 static void filt_timercancel(struct knote *kn);
288
289 #define TIMER_RUNNING 0x1
290 #define TIMER_CANCELWAIT 0x2
291
292 static lck_mtx_t _filt_timerlock;
293 static void filt_timerlock(void);
294 static void filt_timerunlock(void);
295
296 static zone_t knote_zone;
297 static zone_t kqfile_zone;
298 static zone_t kqworkq_zone;
299
300 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
301
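/*
 * For example (arithmetic only): KN_HASH(0x1234, 0xff) folds the upper byte
 * into the lower one: (0x1234 ^ (0x1234 >> 8)) & 0xff = (0x1234 ^ 0x12) & 0xff = 0x26.
 */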
302 #if 0
303 extern struct filterops aio_filtops;
304 #endif
305
306 /* Mach portset filter */
307 extern struct filterops machport_filtops;
308
309 /* User filter */
310 static int filt_userattach(struct knote *kn);
311 static void filt_userdetach(struct knote *kn);
312 static int filt_user(struct knote *kn, long hint);
313 static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev);
314 static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
315 static struct filterops user_filtops = {
316 .f_attach = filt_userattach,
317 .f_detach = filt_userdetach,
318 .f_event = filt_user,
319 .f_touch = filt_usertouch,
320 .f_process = filt_userprocess,
321 };
322
323 static lck_spin_t _filt_userlock;
324 static void filt_userlock(void);
325 static void filt_userunlock(void);
326
327 extern struct filterops pipe_rfiltops;
328 extern struct filterops pipe_wfiltops;
329 extern struct filterops ptsd_kqops;
330 extern struct filterops soread_filtops;
331 extern struct filterops sowrite_filtops;
332 extern struct filterops sock_filtops;
333 extern struct filterops soexcept_filtops;
334 extern struct filterops spec_filtops;
335 extern struct filterops bpfread_filtops;
336 extern struct filterops necp_fd_rfiltops;
337 extern struct filterops skywalk_channel_rfiltops;
338 extern struct filterops skywalk_channel_wfiltops;
339 extern struct filterops fsevent_filtops;
340 extern struct filterops vnode_filtops;
341
342 /*
343 *
344 * Rules for adding new filters to the system:
345 * Public filters:
346 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
347 * in the exported section of the header
348 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
349 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
350 * of the Public Filters section in the array.
351 * Private filters:
352 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
353 * in the XNU_KERNEL_PRIVATE section of the header
354 * - Update the EVFILTID_MAX value to reflect the new addition
355 * - Add a filterops to the sysfilt_ops array. Private filters should be added at the end of
356 * the Private filters section of the array (an illustrative entry is sketched just after the array below).
357 */
358 static struct filterops *sysfilt_ops[EVFILTID_MAX] = {
359 /* Public Filters */
360 [~EVFILT_READ] = &file_filtops,
361 [~EVFILT_WRITE] = &file_filtops,
362 [~EVFILT_AIO] = &bad_filtops,
363 [~EVFILT_VNODE] = &file_filtops,
364 [~EVFILT_PROC] = &proc_filtops,
365 [~EVFILT_SIGNAL] = &sig_filtops,
366 [~EVFILT_TIMER] = &timer_filtops,
367 [~EVFILT_MACHPORT] = &machport_filtops,
368 [~EVFILT_FS] = &fs_filtops,
369 [~EVFILT_USER] = &user_filtops,
370 &bad_filtops,
371 &bad_filtops,
372 [~EVFILT_SOCK] = &file_filtops,
373 #if CONFIG_MEMORYSTATUS
374 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
375 #else
376 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
377 #endif
378 [~EVFILT_EXCEPT] = &file_filtops,
379
380 /* Private filters */
381 [EVFILTID_KQREAD] = &kqread_filtops,
382 [EVFILTID_PIPE_R] = &pipe_rfiltops,
383 [EVFILTID_PIPE_W] = &pipe_wfiltops,
384 [EVFILTID_PTSD] = &ptsd_kqops,
385 [EVFILTID_SOREAD] = &soread_filtops,
386 [EVFILTID_SOWRITE] = &sowrite_filtops,
387 [EVFILTID_SCK] = &sock_filtops,
388 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
389 [EVFILTID_SPEC] = &spec_filtops,
390 [EVFILTID_BPFREAD] = &bpfread_filtops,
391 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
392 [EVFILTID_FSEVENT] = &fsevent_filtops,
393 [EVFILTID_VN] = &vnode_filtops
394 };
395
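/*
 * Illustrative sketch of the private-filter rules above; the filter id and
 * filterops names below are hypothetical, not part of xnu:
 *
 *	// In bsd/sys/event.h (XNU_KERNEL_PRIVATE section): define the new id
 *	// after the existing EVFILTID_* values and bump EVFILTID_MAX.
 *	#define EVFILTID_EXAMPLE	(EVFILTID_VN + 1)
 *
 *	// Then append an entry at the end of the Private filters section
 *	// of sysfilt_ops above:
 *	[EVFILTID_EXAMPLE] = &example_filtops,
 */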
396 /* waitq prepost callback */
397 void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
398
399 #ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
400 #define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
401 #endif
402 #ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
403 #define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */
404 #endif
405 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
406 #define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */
407 #endif
408 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32
409 #define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8
410 #endif
411
412 static inline
413 qos_t canonicalize_kevent_qos(qos_t qos)
414 {
415 unsigned long canonical;
416
417 /* preserve manager and overcommit flags in this case */
418 canonical = pthread_priority_canonicalize(qos, FALSE);
419 return (qos_t)canonical;
420 }
421
422 static inline
423 kq_index_t qos_index_from_qos(qos_t qos, boolean_t propagation)
424 {
425 kq_index_t qos_index;
426 unsigned long flags = 0;
427
428 qos_index = (kq_index_t)thread_qos_from_pthread_priority(
429 (unsigned long)qos, &flags);
430
431 if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG))
432 return KQWQ_QOS_MANAGER;
433
434 return qos_index;
435 }
436
437 static inline
438 qos_t qos_from_qos_index(kq_index_t qos_index)
439 {
440 if (qos_index == KQWQ_QOS_MANAGER)
441 return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
442
443 if (qos_index == 0)
444 return 0; /* Unspecified */
445
446 /* Should have support from the pthread kext */
447 return (1 << (qos_index - 1 +
448 _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32));
449 }
450
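/*
 * For example (arithmetic only): qos_index 3 encodes to
 * 1 << (3 - 1 + _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32) = 1 << 10 = 0x0400,
 * a single bit within _PTHREAD_PRIORITY_QOS_CLASS_MASK (0x003fff00).
 */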
451 static inline
452 kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
453 {
454 kq_index_t qos_index;
455
456 if (flags & KEVENT_FLAG_WORKQ_MANAGER)
457 return KQWQ_QOS_MANAGER;
458
459 /*
460 * If the caller didn't pass in a class (legacy pthread kext)
461 * then we use the thread policy QoS of the current thread.
462 */
463 assert(qos_class != -1);
464 if (qos_class == -1)
465 qos_index = proc_get_thread_policy(thread,
466 TASK_POLICY_ATTRIBUTE,
467 TASK_POLICY_QOS);
468 else
469 qos_index = (kq_index_t)qos_class;
470
471 assert(qos_index > 0 && qos_index < KQWQ_NQOS);
472
473 return qos_index;
474 }
475
476 /*
477 * kqueue/note lock implementations
478 *
479 * The kqueue lock guards the kq state, the state of its queues,
480 * and the kqueue-aware status and use counts of individual knotes.
481 *
482 * The kqueue workq lock is used to protect state guarding the
483 * interaction of the kqueue with the workq. This state cannot
484 * be guarded by the kq lock - as it needs to be taken when we
485 * already have the waitq set lock held (during the waitq hook
486 * callback). It might be better to use the waitq lock itself
487 * for this, but the IRQ requirements make that difficult.
488 *
489 * Knote flags, filter flags, and associated data are protected
490 * by the underlying object lock - and are only ever looked at
491 * by calling the filter to get a [consistent] snapshot of that
492 * data.
493 */
494 lck_grp_attr_t * kq_lck_grp_attr;
495 lck_grp_t * kq_lck_grp;
496 lck_attr_t * kq_lck_attr;
497
498 static inline void
499 kqlock(struct kqueue *kq)
500 {
501 lck_spin_lock(&kq->kq_lock);
502 }
503
504 static inline void
505 kqunlock(struct kqueue *kq)
506 {
507 lck_spin_unlock(&kq->kq_lock);
508 }
509
510
511 /*
512 * Convert a kq lock to a knote use reference.
513 *
514 * If the knote is being dropped, or has
515 * vanished, we can't get a use reference.
516 * Just return with it still locked.
517 *
518 * - kq locked at entry
519 * - unlock on exit if we get the use reference
520 */
521 static int
522 kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
523 {
524 if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
525 return (0);
526
527 assert(kn->kn_status & KN_ATTACHED);
528 kn->kn_inuse++;
529 kqunlock(kq);
530 return (1);
531 }
532
533
534 /*
535 * Convert from a knote use reference back to kq lock.
536 *
537 * Drop a use reference and wake any waiters if
538 * this is the last one.
539 *
540 * If someone is trying to drop the knote, but the
541 * caller has events they must deliver, take
542 * responsibility for the drop later - and wake the
543 * other attempted dropper in a manner that informs
544 * it of the transfer of responsibility.
545 *
546 * The exit return indicates if the knote is still alive
547 * (or if not, the other dropper has been given the green
548 * light to drop it).
549 *
550 * The kqueue lock is re-taken unconditionally.
551 */
552 static int
553 knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int steal_drop)
554 {
555 int dropped = 0;
556
557 kqlock(kq);
558 if (--kn->kn_inuse == 0) {
559
560 if ((kn->kn_status & KN_ATTACHING) != 0) {
561 kn->kn_status &= ~KN_ATTACHING;
562 }
563
564 if ((kn->kn_status & KN_USEWAIT) != 0) {
565 wait_result_t result;
566
567 /* If we need to, try and steal the drop */
568 if (kn->kn_status & KN_DROPPING) {
569 if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) {
570 kn->kn_status |= KN_STOLENDROP;
571 } else {
572 dropped = 1;
573 }
574 }
575
576 /* wakeup indicating if ANY USE stole the drop */
577 result = (kn->kn_status & KN_STOLENDROP) ?
578 THREAD_RESTART : THREAD_AWAKENED;
579
580 kn->kn_status &= ~KN_USEWAIT;
581 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
582 CAST_EVENT64_T(&kn->kn_status),
583 result,
584 WAITQ_ALL_PRIORITIES);
585 } else {
586 /* should have seen use-wait if dropping with use refs */
587 assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0);
588 }
589
590 } else if (kn->kn_status & KN_DROPPING) {
591 /* not the last ref but want to steal a drop if present */
592 if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) {
593 kn->kn_status |= KN_STOLENDROP;
594
595 /* but we now have to wait to be the last ref */
596 kn->kn_status |= KN_USEWAIT;
597 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
598 CAST_EVENT64_T(&kn->kn_status),
599 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
600 kqunlock(kq);
601 thread_block(THREAD_CONTINUE_NULL);
602 kqlock(kq);
603 } else {
604 dropped = 1;
605 }
606 }
607
608 return (!dropped);
609 }
610
611 /*
612 * Convert a kq lock to a knote use reference
613 * (for the purpose of detaching AND vanishing it).
614 *
615 * If the knote is being dropped, we can't get
616 * a detach reference, so wait for the knote to
617 * finish dropping before returning.
618 *
619 * If the knote is being used for other purposes,
620 * we cannot detach it until those uses are done
621 * as well. Again, just wait for them to finish
622 * (caller will start over at lookup).
623 *
624 * - kq locked at entry
625 * - unlocked on exit
626 */
627 static int
628 kqlock2knotedetach(struct kqueue *kq, struct knote *kn)
629 {
630 if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) {
631 /* have to wait for dropper or current uses to go away */
632 kn->kn_status |= KN_USEWAIT;
633 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
634 CAST_EVENT64_T(&kn->kn_status),
635 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
636 kqunlock(kq);
637 thread_block(THREAD_CONTINUE_NULL);
638 return (0);
639 }
640 assert((kn->kn_status & KN_VANISHED) == 0);
641 assert(kn->kn_status & KN_ATTACHED);
642 kn->kn_status &= ~KN_ATTACHED;
643 kn->kn_status |= KN_VANISHED;
644 kn->kn_inuse++;
645 kqunlock(kq);
646 return (1);
647 }
648
649 /*
650 * Convert a kq lock to a knote drop reference.
651 *
652 * If the knote is in use, wait for the use count
653 * to subside. We first mark our intention to drop
654 * it - keeping other users from "piling on."
655 * If we are too late, we have to wait for the
656 * other drop to complete.
657 *
658 * - kq locked at entry
659 * - always unlocked on exit.
660 * - caller can't hold any locks that would prevent
661 * the other dropper from completing.
662 */
663 static int
664 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
665 {
666 int oktodrop;
667 wait_result_t result;
668
669 oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
670 /* if another thread is attaching, they will become the dropping thread */
671 kn->kn_status |= KN_DROPPING;
672 knote_unsuppress(kn);
673 knote_dequeue(kn);
674 if (oktodrop) {
675 if (kn->kn_inuse == 0) {
676 kqunlock(kq);
677 return (oktodrop);
678 }
679 }
680 kn->kn_status |= KN_USEWAIT;
681 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
682 CAST_EVENT64_T(&kn->kn_status),
683 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
684 kqunlock(kq);
685 result = thread_block(THREAD_CONTINUE_NULL);
686 /* THREAD_RESTART == another thread stole the knote drop */
687 return (result == THREAD_AWAKENED);
688 }
689
690 #if 0
691 /*
692 * Release a knote use count reference.
693 */
694 static void
695 knote_put(struct knote *kn)
696 {
697 struct kqueue *kq = knote_get_kq(kn);
698
699 kqlock(kq);
700 if (--kn->kn_inuse == 0) {
701 if ((kn->kn_status & KN_USEWAIT) != 0) {
702 kn->kn_status &= ~KN_USEWAIT;
703 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
704 CAST_EVENT64_T(&kn->kn_status),
705 THREAD_AWAKENED,
706 WAITQ_ALL_PRIORITIES);
707 }
708 }
709 kqunlock(kq);
710 }
711 #endif
712
713 static int
714 filt_fileattach(struct knote *kn)
715 {
716 return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
717 }
718
719 #define f_flag f_fglob->fg_flag
720 #define f_msgcount f_fglob->fg_msgcount
721 #define f_cred f_fglob->fg_cred
722 #define f_ops f_fglob->fg_ops
723 #define f_offset f_fglob->fg_offset
724 #define f_data f_fglob->fg_data
725
726 static void
727 filt_kqdetach(struct knote *kn)
728 {
729 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
730 struct kqueue *kq = &kqf->kqf_kqueue;
731
732 kqlock(kq);
733 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
734 kqunlock(kq);
735 }
736
737 /*ARGSUSED*/
738 static int
739 filt_kqueue(struct knote *kn, __unused long hint)
740 {
741 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
742 int count;
743
744 count = kq->kq_count;
745 return (count > 0);
746 }
747
748 static int
749 filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
750 {
751 #pragma unused(kev)
752 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
753 int res;
754
755 kqlock(kq);
756 kn->kn_data = kq->kq_count;
757 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
758 kn->kn_udata = kev->udata;
759 res = (kn->kn_data > 0);
760
761 kqunlock(kq);
762
763 return res;
764 }
765
766 static int
767 filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
768 {
769 #pragma unused(data)
770 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
771 int res;
772
773 kqlock(kq);
774 kn->kn_data = kq->kq_count;
775 res = (kn->kn_data > 0);
776 if (res) {
777 *kev = kn->kn_kevent;
778 if (kn->kn_flags & EV_CLEAR)
779 kn->kn_data = 0;
780 }
781 kqunlock(kq);
782
783 return res;
784 }
785
786 static int
787 filt_procattach(struct knote *kn)
788 {
789 struct proc *p;
790
791 assert(PID_MAX < NOTE_PDATAMASK);
792
793 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
794 kn->kn_flags = EV_ERROR;
795 kn->kn_data = ENOTSUP;
796 return 0;
797 }
798
799 p = proc_find(kn->kn_id);
800 if (p == NULL) {
801 kn->kn_flags = EV_ERROR;
802 kn->kn_data = ESRCH;
803 return 0;
804 }
805
806 const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
807
808 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
809 do {
810 pid_t selfpid = proc_selfpid();
811
812 if (p->p_ppid == selfpid)
813 break; /* parent => ok */
814
815 if ((p->p_lflag & P_LTRACED) != 0 &&
816 (p->p_oppid == selfpid))
817 break; /* parent-in-waiting => ok */
818
819 proc_rele(p);
820 kn->kn_flags = EV_ERROR;
821 kn->kn_data = EACCES;
822 return 0;
823 } while (0);
824
825 proc_klist_lock();
826
827 kn->kn_ptr.p_proc = p; /* store the proc handle */
828
829 KNOTE_ATTACH(&p->p_klist, kn);
830
831 proc_klist_unlock();
832
833 proc_rele(p);
834
835 /*
836 * The knote only captures edge-triggered events after this point,
837 * so it can't already have fired.
838 */
839 return (0);
840 }
841
842
843 /*
844 * The knote may be attached to a different process, which may exit,
845 * leaving nothing for the knote to be attached to. In that case,
846 * the pointer to the process will have already been nulled out.
847 */
848 static void
849 filt_procdetach(struct knote *kn)
850 {
851 struct proc *p;
852
853 proc_klist_lock();
854
855 p = kn->kn_ptr.p_proc;
856 if (p != PROC_NULL) {
857 kn->kn_ptr.p_proc = PROC_NULL;
858 KNOTE_DETACH(&p->p_klist, kn);
859 }
860
861 proc_klist_unlock();
862 }
863
864 static int
865 filt_proc(struct knote *kn, long hint)
866 {
867 u_int event;
868
869 /* ALWAYS CALLED WITH proc_klist_lock */
870
871 /*
872 * Note: a lot of bits in hint may be obtained from the knote.
873 * To free some of those bits, see <rdar://problem/12592988> Freeing up
874 * bits in hint for filt_proc
875 *
876 * mask off extra data
877 */
878 event = (u_int)hint & NOTE_PCTRLMASK;
879
880 /*
881 * termination lifecycle events can happen while a debugger
882 * has reparented a process, in which case notifications
883 * should be quashed except to the tracing parent. When
884 * the debugger reaps the child (either via wait4(2) or
885 * process exit), the child will be reparented to the original
886 * parent and these knotes re-fired.
887 */
888 if (event & NOTE_EXIT) {
889 if ((kn->kn_ptr.p_proc->p_oppid != 0)
890 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
891 /*
892 * This knote is not for the current ptrace(2) parent, ignore.
893 */
894 return 0;
895 }
896 }
897
898 /*
899 * if the user is interested in this event, record it.
900 */
901 if (kn->kn_sfflags & event)
902 kn->kn_fflags |= event;
903
904 #pragma clang diagnostic push
905 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
906 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
907 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
908 }
909 #pragma clang diagnostic pop
910
911
912 /*
913 * The kernel has a wrapper in place that returns the same data
914 * as is collected here, in kn_data. Any changes to how
915 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
916 * should also be reflected in the proc_pidnoteexit() wrapper.
917 */
918 if (event == NOTE_EXIT) {
919 kn->kn_data = 0;
920 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
921 kn->kn_fflags |= NOTE_EXITSTATUS;
922 kn->kn_data |= (hint & NOTE_PDATAMASK);
923 }
924 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
925 kn->kn_fflags |= NOTE_EXIT_DETAIL;
926 if ((kn->kn_ptr.p_proc->p_lflag &
927 P_LTERM_DECRYPTFAIL) != 0) {
928 kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
929 }
930 if ((kn->kn_ptr.p_proc->p_lflag &
931 P_LTERM_JETSAM) != 0) {
932 kn->kn_data |= NOTE_EXIT_MEMORY;
933 switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
934 case P_JETSAM_VMPAGESHORTAGE:
935 kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
936 break;
937 case P_JETSAM_VMTHRASHING:
938 kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
939 break;
940 case P_JETSAM_FCTHRASHING:
941 kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
942 break;
943 case P_JETSAM_VNODE:
944 kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
945 break;
946 case P_JETSAM_HIWAT:
947 kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
948 break;
949 case P_JETSAM_PID:
950 kn->kn_data |= NOTE_EXIT_MEMORY_PID;
951 break;
952 case P_JETSAM_IDLEEXIT:
953 kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
954 break;
955 }
956 }
957 if ((kn->kn_ptr.p_proc->p_csflags &
958 CS_KILLED) != 0) {
959 kn->kn_data |= NOTE_EXIT_CSERROR;
960 }
961 }
962 }
963
964 /* if we have any matching state, activate the knote */
965 return (kn->kn_fflags != 0);
966 }
967
968 static int
969 filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
970 {
971 int res;
972
973 proc_klist_lock();
974
975 /* accept new filter flags and mask off output events no longer interesting */
976 kn->kn_sfflags = kev->fflags;
977 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
978 kn->kn_udata = kev->udata;
979
980 /* restrict the current results to the (smaller?) set of new interest */
981 /*
982 * For compatibility with previous implementations, we leave kn_fflags
983 * as they were before.
984 */
985 //kn->kn_fflags &= kn->kn_sfflags;
986
987 res = (kn->kn_fflags != 0);
988
989 proc_klist_unlock();
990
991 return res;
992 }
993
994 static int
995 filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
996 {
997 #pragma unused(data)
998 int res;
999
1000 proc_klist_lock();
1001 res = (kn->kn_fflags != 0);
1002 if (res) {
1003 *kev = kn->kn_kevent;
1004 kn->kn_flags |= EV_CLEAR; /* automatically set */
1005 kn->kn_fflags = 0;
1006 kn->kn_data = 0;
1007 }
1008 proc_klist_unlock();
1009 return res;
1010 }
1011
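/*
 * Userspace sketch (illustrative only; the helper name and error handling are
 * not part of any API): registering an EVFILT_PROC knote for a child process.
 * With NOTE_EXITSTATUS requested, the wait(2)-style status is delivered in
 * kev.data, as collected by filt_proc() above; the permission check in
 * filt_procattach() expects the caller to be the (possibly tracing) parent.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>

static int
watch_child_exit(pid_t child)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return -1;

	/* one-shot registration: fire when the child exits */
	EV_SET(&kev, child, EVFILT_PROC, EV_ADD | EV_ONESHOT,
	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return -1;

	/* block until the exit event is delivered */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("pid %d exited, status 0x%llx\n", (int)child,
		    (unsigned long long)kev.data);
	return 0;
}
#endif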
1012 /*
1013 * filt_timervalidate - process data from user
1014 *
1015 * Converts to either interval or deadline format.
1016 *
1017 * The saved-data field in the knote contains the
1018 * time value. The saved filter-flags indicates
1019 * the unit of measurement.
1020 *
1021 * After validation, either the saved-data field
1022 * contains the interval in absolute time, or ext[0]
1023 * contains the expected deadline. If that deadline
1024 * is in the past, ext[0] is 0.
1025 *
1026 * Returns EINVAL for unrecognized units of time.
1027 *
1028 * Timer filter lock is held.
1029 *
1030 */
1031 static int
1032 filt_timervalidate(struct knote *kn)
1033 {
1034 uint64_t multiplier;
1035 uint64_t raw = 0;
1036
1037 switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
1038 case NOTE_SECONDS:
1039 multiplier = NSEC_PER_SEC;
1040 break;
1041 case NOTE_USECONDS:
1042 multiplier = NSEC_PER_USEC;
1043 break;
1044 case NOTE_NSECONDS:
1045 multiplier = 1;
1046 break;
1047 case 0: /* milliseconds (default) */
1048 multiplier = NSEC_PER_SEC / 1000;
1049 break;
1050 default:
1051 return (EINVAL);
1052 }
1053
1054 /* if a slop delta (leeway) was passed in kn_ext[1], transform it to the same time scale */
1055 if (kn->kn_sfflags & NOTE_LEEWAY) {
1056 nanoseconds_to_absolutetime((uint64_t)kn->kn_ext[1] * multiplier, &raw);
1057 kn->kn_ext[1] = raw;
1058 }
1059
1060 nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
1061
1062 kn->kn_ext[0] = 0;
1063 kn->kn_sdata = 0;
1064
1065 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1066 clock_sec_t seconds;
1067 clock_nsec_t nanoseconds;
1068 uint64_t now;
1069
1070 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1071 nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
1072 nanoseconds, &now);
1073
1074 /* if time is in the future */
1075 if (now < raw) {
1076 raw -= now;
1077
1078 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1079 clock_continuoustime_interval_to_deadline(raw,
1080 &kn->kn_ext[0]);
1081 } else {
1082 clock_absolutetime_interval_to_deadline(raw,
1083 &kn->kn_ext[0]);
1084 }
1085 }
1086 } else {
1087 kn->kn_sdata = raw;
1088 }
1089
1090 return (0);
1091 }
1092
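/*
 * Userspace sketch (illustrative only; the helper name is hypothetical):
 * arming a repeating EVFILT_TIMER knote. The fflags select one of the units
 * validated above; with no unit flag, data is interpreted as milliseconds.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
arm_repeating_timer(int kq)
{
	struct kevent kev;

	/* fire every 5 seconds (NOTE_SECONDS selects the unit for data) */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif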
1093 /*
1094 * filt_timerupdate - compute the next deadline
1095 *
1096 * Repeating timers store their interval in kn_sdata. Absolute
1097 * timers have already calculated the deadline, stored in ext[0].
1098 *
1099 * On return, the next deadline (or zero if no deadline is needed)
1100 * is stored in kn_ext[0].
1101 *
1102 * Timer filter lock is held.
1103 */
1104 static void
1105 filt_timerupdate(struct knote *kn, int num_fired)
1106 {
1107 assert(num_fired > 0);
1108
1109 /* if there's no interval, deadline is just in kn_ext[0] */
1110 if (kn->kn_sdata == 0)
1111 return;
1112
1113 /* if timer hasn't fired before, fire in interval nsecs */
1114 if (kn->kn_ext[0] == 0) {
1115 assert(num_fired == 1);
1116 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1117 clock_continuoustime_interval_to_deadline(kn->kn_sdata,
1118 &kn->kn_ext[0]);
1119 } else {
1120 clock_absolutetime_interval_to_deadline(kn->kn_sdata,
1121 &kn->kn_ext[0]);
1122 }
1123 } else {
1124 /*
1125 * If timer has fired before, schedule the next pop
1126 * relative to the last intended deadline.
1127 *
1128 * We could check for whether the deadline has expired,
1129 * but the thread call layer can handle that.
1130 *
1131 * Go forward an additional number of periods, in the case the
1132 * timer fired multiple times while the system was asleep.
1133 */
1134 kn->kn_ext[0] += (kn->kn_sdata * num_fired);
1135 }
1136 }
1137
1138 /*
1139 * filt_timerexpire - the timer callout routine
1140 *
1141 * Just propagate the timer event into the knote
1142 * filter routine (by going through the knote
1143 * synchronization point). Pass a hint to
1144 * indicate this is a real event, not just a
1145 * query from above.
1146 */
1147 static void
1148 filt_timerexpire(void *knx, __unused void *spare)
1149 {
1150 struct klist timer_list;
1151 struct knote *kn = knx;
1152
1153 filt_timerlock();
1154
1155 kn->kn_hookid &= ~TIMER_RUNNING;
1156
1157 /* no "object" for timers, so fake a list */
1158 SLIST_INIT(&timer_list);
1159 SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
1160 KNOTE(&timer_list, 1);
1161
1162 /* if someone is waiting for timer to pop */
1163 if (kn->kn_hookid & TIMER_CANCELWAIT) {
1164 struct kqueue *kq = knote_get_kq(kn);
1165 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
1166 CAST_EVENT64_T(&kn->kn_hook),
1167 THREAD_AWAKENED,
1168 WAITQ_ALL_PRIORITIES);
1169 }
1170
1171 filt_timerunlock();
1172 }
1173
1174 /*
1175 * Cancel a running timer (or wait for the pop).
1176 * Timer filter lock is held.
1177 */
1178 static void
1179 filt_timercancel(struct knote *kn)
1180 {
1181 struct kqueue *kq = knote_get_kq(kn);
1182 thread_call_t callout = kn->kn_hook;
1183 boolean_t cancelled;
1184
1185 if (kn->kn_hookid & TIMER_RUNNING) {
1186 /* cancel the callout if we can */
1187 cancelled = thread_call_cancel(callout);
1188 if (cancelled) {
1189 kn->kn_hookid &= ~TIMER_RUNNING;
1190 } else {
1191 /* we have to wait for the expire routine. */
1192 kn->kn_hookid |= TIMER_CANCELWAIT;
1193 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
1194 CAST_EVENT64_T(&kn->kn_hook),
1195 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1196 filt_timerunlock();
1197 thread_block(THREAD_CONTINUE_NULL);
1198 filt_timerlock();
1199 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1200 }
1201 }
1202 }
1203
1204 /*
1205 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1206 */
1207 static int
1208 filt_timerattach(struct knote *kn)
1209 {
1210 thread_call_t callout;
1211 int error;
1212 int res;
1213
1214 callout = thread_call_allocate(filt_timerexpire, kn);
1215 if (NULL == callout) {
1216 kn->kn_flags = EV_ERROR;
1217 kn->kn_data = ENOMEM;
1218 return 0;
1219 }
1220
1221 filt_timerlock();
1222 error = filt_timervalidate(kn);
1223 if (error != 0) {
1224 filt_timerunlock();
1225 thread_call_free(callout);
1226 kn->kn_flags = EV_ERROR;
1227 kn->kn_data = error;
1228 return 0;
1229 }
1230
1231 kn->kn_hook = (void*)callout;
1232 kn->kn_hookid = 0;
1233
1234 /* absolute=EV_ONESHOT */
1235 if (kn->kn_sfflags & NOTE_ABSOLUTE)
1236 kn->kn_flags |= EV_ONESHOT;
1237
1238 filt_timerupdate(kn, 1);
1239 if (kn->kn_ext[0]) {
1240 kn->kn_flags |= EV_CLEAR;
1241 unsigned int timer_flags = 0;
1242 if (kn->kn_sfflags & NOTE_CRITICAL)
1243 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1244 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1245 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1246 else
1247 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1248
1249 if (kn->kn_sfflags & NOTE_LEEWAY)
1250 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1251 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1252 timer_flags |= THREAD_CALL_CONTINUOUS;
1253
1254 thread_call_enter_delayed_with_leeway(callout, NULL,
1255 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1256
1257 kn->kn_hookid |= TIMER_RUNNING;
1258 } else {
1259 /* fake immediate */
1260 kn->kn_data = 1;
1261 }
1262
1263 res = (kn->kn_data > 0);
1264
1265 filt_timerunlock();
1266
1267 return res;
1268 }
1269
1270 /*
1271 * Shut down the timer if it's running, and free the callout.
1272 */
1273 static void
1274 filt_timerdetach(struct knote *kn)
1275 {
1276 thread_call_t callout;
1277
1278 filt_timerlock();
1279
1280 callout = (thread_call_t)kn->kn_hook;
1281 filt_timercancel(kn);
1282
1283 filt_timerunlock();
1284
1285 thread_call_free(callout);
1286 }
1287
1288
1289 static int filt_timer_num_fired(struct knote *kn)
1290 {
1291 /* by default we fire a timer once */
1292 int num_fired = 1;
1293
1294 /*
1295 * When the time base is mach_continuous_time, we have to calculate
1296 * the number of times the timer fired while we were asleep.
1297 */
1298 if ((kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) &&
1299 (kn->kn_sdata != 0) &&
1300 (kn->kn_ext[0] != 0))
1301 {
1302 const uint64_t now = mach_continuous_time();
1303 // time for timer to fire (right now) is kn_ext[0]
1304 // kn_sdata is period for timer to fire
1305 assert(now >= kn->kn_ext[0]);
1306 assert(kn->kn_sdata > 0);
1307
1308 const uint64_t overrun_ticks = now - kn->kn_ext[0];
1309 const uint64_t kn_sdata = kn->kn_sdata;
1310
1311 if (overrun_ticks < kn_sdata) {
1312 num_fired = 1;
1313 } else if (overrun_ticks < (kn_sdata << 1)) {
1314 num_fired = 2;
1315 } else {
1316 num_fired = (overrun_ticks / kn_sdata) + 1;
1317 }
1318 }
1319
1320 return num_fired;
1321 }
1322
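/*
 * For example (arithmetic only): with a period (kn_sdata) of 10 ticks and a
 * previous deadline (kn_ext[0]) that is now 35 ticks in the past,
 * overrun_ticks = 35, so num_fired = 35 / 10 + 1 = 4.
 */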
1323 /*
1324 * filt_timer - post events to a timer knote
1325 *
1326 * Count the timer fire and re-arm as requested.
1327 * This always crosses the threshold of interest,
1328 * so always return an indication that the knote
1329 * should be activated (if not already).
1330 */
1331 static int
1332 filt_timer(
1333 struct knote *kn,
1334 long hint)
1335 {
1336 #pragma unused(hint)
1337
1338 /* real timer pop -- timer lock held by filt_timerexpire */
1339 int num_fired = filt_timer_num_fired(kn);
1340 kn->kn_data += num_fired;
1341
1342 if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
1343 ((kn->kn_flags & EV_ONESHOT) == 0)) {
1344 /* evaluate next time to fire */
1345 filt_timerupdate(kn, num_fired);
1346
1347 if (kn->kn_ext[0]) {
1348 unsigned int timer_flags = 0;
1349
1350 /* keep the callout and re-arm */
1351 if (kn->kn_sfflags & NOTE_CRITICAL)
1352 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1353 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1354 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1355 else
1356 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1357
1358 if (kn->kn_sfflags & NOTE_LEEWAY)
1359 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1360
1361 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1362 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1363
1364 kn->kn_hookid |= TIMER_RUNNING;
1365 }
1366 }
1367 return (1);
1368 }
1369
1370
1371
1372 /*
1373 * filt_timertouch - update timer knote with new user input
1374 *
1375 * Cancel and restart the timer based on new user data. When
1376 * the user picks up a knote, clear the count of how many timer
1377 * pops have gone off (in kn_data).
1378 */
1379 static int
1380 filt_timertouch(
1381 struct knote *kn,
1382 struct kevent_internal_s *kev)
1383 {
1384 int error;
1385 int res;
1386
1387 filt_timerlock();
1388
1389 /* cancel current call */
1390 filt_timercancel(kn);
1391
1392 /* capture the new values used to compute deadline */
1393 kn->kn_sdata = kev->data;
1394 kn->kn_sfflags = kev->fflags;
1395 kn->kn_ext[0] = kev->ext[0];
1396 kn->kn_ext[1] = kev->ext[1];
1397
1398 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1399 kn->kn_udata = kev->udata;
1400
1401 /* recalculate deadline */
1402 error = filt_timervalidate(kn);
1403 if (error) {
1404 /* no way to report error, so mark it in the knote */
1405 filt_timerunlock();
1406 kn->kn_flags |= EV_ERROR;
1407 kn->kn_data = error;
1408 return 1;
1409 }
1410
1411 /* start timer if necessary */
1412 filt_timerupdate(kn, 1);
1413
1414 if (kn->kn_ext[0]) {
1415 unsigned int timer_flags = 0;
1416 if (kn->kn_sfflags & NOTE_CRITICAL)
1417 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1418 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1419 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1420 else
1421 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1422
1423 if (kn->kn_sfflags & NOTE_LEEWAY)
1424 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1425
1426 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1427 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1428
1429 kn->kn_hookid |= TIMER_RUNNING;
1430 } else {
1431 /* pretend the timer has fired */
1432 kn->kn_data = 1;
1433 }
1434
1435 /* capture if already fired */
1436 res = (kn->kn_data > 0);
1437
1438 filt_timerunlock();
1439
1440 return res;
1441 }
1442
1443 /*
1444 * filt_timerprocess - query state of knote and snapshot event data
1445 *
1446 * Determine if the timer has fired in the past, snapshot the state
1447 * of the kevent for returning to user-space, and clear pending event
1448 * counters for the next time.
1449 */
1450 static int
1451 filt_timerprocess(
1452 struct knote *kn,
1453 __unused struct filt_process_s *data,
1454 struct kevent_internal_s *kev)
1455 {
1456 filt_timerlock();
1457
1458 /* user-query */
1459 if (kn->kn_data == 0) {
1460 filt_timerunlock();
1461 return 0;
1462 }
1463
1464 /*
1465 * Copy out the interesting kevent state,
1466 * but don't leak out the raw time calculations.
1467 */
1468 *kev = kn->kn_kevent;
1469 kev->ext[0] = 0;
1470 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1471
1472 /*
1473 * reset the timer pop count in kn_data
1474 * and (optionally) clear the fflags.
1475 */
1476 kn->kn_data = 0;
1477 if (kn->kn_flags & EV_CLEAR)
1478 kn->kn_fflags = 0;
1479
1480 filt_timerunlock();
1481 return 1;
1482 }
1483
1484 static void
1485 filt_timerlock(void)
1486 {
1487 lck_mtx_lock(&_filt_timerlock);
1488 }
1489
1490 static void
1491 filt_timerunlock(void)
1492 {
1493 lck_mtx_unlock(&_filt_timerlock);
1494 }
1495
1496 static void
1497 filt_userlock(void)
1498 {
1499 lck_spin_lock(&_filt_userlock);
1500 }
1501
1502 static void
1503 filt_userunlock(void)
1504 {
1505 lck_spin_unlock(&_filt_userlock);
1506 }
1507
1508 static int
1509 filt_userattach(struct knote *kn)
1510 {
1511 /* EVFILT_USER knotes are not attached to anything in the kernel */
1512 /* Can't discover this knote until after attach - so no lock needed */
1513 kn->kn_hook = NULL;
1514 if (kn->kn_fflags & NOTE_TRIGGER) {
1515 kn->kn_hookid = 1;
1516 } else {
1517 kn->kn_hookid = 0;
1518 }
1519 return (kn->kn_hookid);
1520 }
1521
1522 static void
1523 filt_userdetach(__unused struct knote *kn)
1524 {
1525 /* EVFILT_USER knotes are not attached to anything in the kernel */
1526 }
1527
1528 static int
1529 filt_user(
1530 __unused struct knote *kn,
1531 __unused long hint)
1532 {
1533 panic("filt_user");
1534 return 0;
1535 }
1536
1537 static int
1538 filt_usertouch(
1539 struct knote *kn,
1540 struct kevent_internal_s *kev)
1541 {
1542 uint32_t ffctrl;
1543 int fflags;
1544 int active;
1545
1546 filt_userlock();
1547
1548 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1549 fflags = kev->fflags & NOTE_FFLAGSMASK;
1550 switch (ffctrl) {
1551 case NOTE_FFNOP:
1552 break;
1553 case NOTE_FFAND:
1554 kn->kn_sfflags &= fflags;
1555 break;
1556 case NOTE_FFOR:
1557 kn->kn_sfflags |= fflags;
1558 break;
1559 case NOTE_FFCOPY:
1560 kn->kn_sfflags = fflags;
1561 break;
1562 }
1563 kn->kn_sdata = kev->data;
1564
1565 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1566 kn->kn_udata = kev->udata;
1567
1568 if (kev->fflags & NOTE_TRIGGER) {
1569 kn->kn_hookid = 1;
1570 }
1571 active = kn->kn_hookid;
1572
1573 filt_userunlock();
1574
1575 return (active);
1576 }
1577
1578 static int
1579 filt_userprocess(
1580 struct knote *kn,
1581 __unused struct filt_process_s *data,
1582 struct kevent_internal_s *kev)
1583 {
1584 filt_userlock();
1585
1586 if (kn->kn_hookid == 0) {
1587 filt_userunlock();
1588 return 0;
1589 }
1590
1591 *kev = kn->kn_kevent;
1592 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1593 kev->data = kn->kn_sdata;
1594 if (kn->kn_flags & EV_CLEAR) {
1595 kn->kn_hookid = 0;
1596 kn->kn_data = 0;
1597 kn->kn_fflags = 0;
1598 }
1599 filt_userunlock();
1600
1601 return 1;
1602 }
1603
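/*
 * Userspace sketch (illustrative only; WAKEUP_IDENT and the helper names are
 * hypothetical): an EVFILT_USER knote is registered once and later fired from
 * any thread with NOTE_TRIGGER, which filt_usertouch() above turns into an
 * activation. EV_CLEAR lets filt_userprocess() reset the trigger state after
 * each delivery.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#define WAKEUP_IDENT	1

static int
setup_user_event(int kq)
{
	struct kevent kev;

	/* register the user event; no fflags or data needed yet */
	EV_SET(&kev, WAKEUP_IDENT, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}

static int
fire_user_event(int kq)
{
	struct kevent kev;

	/* trigger it: a thread blocked in kevent() on kq will wake up */
	EV_SET(&kev, WAKEUP_IDENT, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif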
1604 /*
1605 * JMM - placeholder for not-yet-implemented filters
1606 */
1607 static int
1608 filt_badattach(__unused struct knote *kn)
1609 {
1610 kn->kn_flags |= EV_ERROR;
1611 kn->kn_data = ENOTSUP;
1612 return 0;
1613 }
1614
1615 struct kqueue *
1616 kqueue_alloc(struct proc *p, unsigned int flags)
1617 {
1618 struct filedesc *fdp = p->p_fd;
1619 struct kqueue *kq = NULL;
1620 int policy;
1621 void *hook;
1622 uint64_t kq_addr_offset;
1623
1624 if (flags & KEVENT_FLAG_WORKQ) {
1625 struct kqworkq *kqwq;
1626 int i;
1627
1628 kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
1629 if (kqwq == NULL)
1630 return NULL;
1631
1632 kq = &kqwq->kqwq_kqueue;
1633 bzero(kqwq, sizeof (struct kqworkq));
1634
1635 kqwq->kqwq_state = KQ_WORKQ;
1636
1637 for (i = 0; i < KQWQ_NBUCKETS; i++) {
1638 TAILQ_INIT(&kq->kq_queue[i]);
1639 }
1640 for (i = 0; i < KQWQ_NQOS; i++) {
1641 TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed);
1642 }
1643
1644 lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr);
1645 policy = SYNC_POLICY_FIFO;
1646 hook = (void *)kqwq;
1647
1648 } else {
1649 struct kqfile *kqf;
1650
1651 kqf = (struct kqfile *)zalloc(kqfile_zone);
1652 if (kqf == NULL)
1653 return NULL;
1654
1655 kq = &kqf->kqf_kqueue;
1656 bzero(kqf, sizeof (struct kqfile));
1657 TAILQ_INIT(&kq->kq_queue[0]);
1658 TAILQ_INIT(&kqf->kqf_suppressed);
1659
1660 policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
1661 hook = NULL;
1662
1663 }
1664
1665 waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
1666 lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1667 kq->kq_p = p;
1668
1669 if (fdp->fd_knlistsize < 0) {
1670 proc_fdlock(p);
1671 if (fdp->fd_knlistsize < 0)
1672 fdp->fd_knlistsize = 0; /* this process has had a kq */
1673 proc_fdunlock(p);
1674 }
1675
1676 kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
1677 /* Assert that the address can be pointer compacted for use with knote */
1678 assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE));
1679 return (kq);
1680 }
1681
1682 /*
1683 * kqueue_dealloc - detach all knotes from a kqueue and free it
1684 *
1685 * We walk each list looking for knotes referencing this
1686 * kqueue. If we find one, we try to drop it. But
1687 * if we fail to get a drop reference, that will wait
1688 * until it is dropped. So, we can just restart again
1689 * safe in the assumption that the list will eventually
1690 * not contain any more references to this kqueue (either
1691 * we dropped them all, or someone else did).
1692 *
1693 * Assumes no new events are being added to the kqueue.
1694 * Nothing locked on entry or exit.
1695 */
1696 void
1697 kqueue_dealloc(struct kqueue *kq)
1698 {
1699 struct proc *p;
1700 struct filedesc *fdp;
1701 struct knote *kn;
1702 int i;
1703
1704 if (kq == NULL)
1705 return;
1706
1707 p = kq->kq_p;
1708 fdp = p->p_fd;
1709
1710 proc_fdlock(p);
1711 for (i = 0; i < fdp->fd_knlistsize; i++) {
1712 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1713 while (kn != NULL) {
1714 if (kq == knote_get_kq(kn)) {
1715 kqlock(kq);
1716 proc_fdunlock(p);
1717 /* drop it ourselves or wait */
1718 if (kqlock2knotedrop(kq, kn)) {
1719 knote_drop(kn, p);
1720 }
1721 proc_fdlock(p);
1722 /* start over at beginning of list */
1723 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1724 continue;
1725 }
1726 kn = SLIST_NEXT(kn, kn_link);
1727 }
1728 }
1729 if (fdp->fd_knhashmask != 0) {
1730 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1731 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1732 while (kn != NULL) {
1733 if (kq == knote_get_kq(kn)) {
1734 kqlock(kq);
1735 proc_fdunlock(p);
1736 /* drop it ourselves or wait */
1737 if (kqlock2knotedrop(kq, kn)) {
1738 knote_drop(kn, p);
1739 }
1740 proc_fdlock(p);
1741 /* start over at beginning of list */
1742 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1743 continue;
1744 }
1745 kn = SLIST_NEXT(kn, kn_link);
1746 }
1747 }
1748 }
1749 proc_fdunlock(p);
1750
1751 /*
1752 * waitq_set_deinit() remove the KQ's waitq set from
1753 * any select sets to which it may belong.
1754 */
1755 waitq_set_deinit(&kq->kq_wqs);
1756 lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1757
1758 if (kq->kq_state & KQ_WORKQ) {
1759 struct kqworkq *kqwq = (struct kqworkq *)kq;
1760
1761 lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp);
1762 zfree(kqworkq_zone, kqwq);
1763 } else {
1764 struct kqfile *kqf = (struct kqfile *)kq;
1765
1766 zfree(kqfile_zone, kqf);
1767 }
1768 }
1769
1770 int
1771 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
1772 {
1773 struct kqueue *kq;
1774 struct fileproc *fp;
1775 int fd, error;
1776
1777 error = falloc_withalloc(p,
1778 &fp, &fd, vfs_context_current(), fp_zalloc, cra);
1779 if (error) {
1780 return (error);
1781 }
1782
1783 kq = kqueue_alloc(p, 0);
1784 if (kq == NULL) {
1785 fp_free(p, fd, fp);
1786 return (ENOMEM);
1787 }
1788
1789 fp->f_flag = FREAD | FWRITE;
1790 fp->f_ops = &kqueueops;
1791 fp->f_data = kq;
1792
1793 proc_fdlock(p);
1794 *fdflags(p, fd) |= UF_EXCLOSE;
1795 procfdtbl_releasefd(p, fd, NULL);
1796 fp_drop(p, fd, fp, 1);
1797 proc_fdunlock(p);
1798
1799 *retval = fd;
1800 return (error);
1801 }
1802
1803 int
1804 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1805 {
1806 return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
1807 }
1808
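/*
 * Userspace sketch (illustrative only; the helper name is hypothetical): the
 * minimal round trip through the kqueue file descriptor created by the
 * kqueue() path above - register one EVFILT_READ knote, then block for it.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
wait_for_stdin(void)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return -1;

	/* watch stdin for readability */
	EV_SET(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return -1;

	/* block until at least one event is returned */
	return kevent(kq, NULL, 0, &kev, 1, NULL);
}
#endif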
1809 static int
1810 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
1811 unsigned int flags)
1812 {
1813 int advance;
1814 int error;
1815
1816 if (flags & KEVENT_FLAG_LEGACY32) {
1817 bzero(kevp, sizeof (*kevp));
1818
1819 if (IS_64BIT_PROCESS(p)) {
1820 struct user64_kevent kev64;
1821
1822 advance = sizeof (kev64);
1823 error = copyin(*addrp, (caddr_t)&kev64, advance);
1824 if (error)
1825 return (error);
1826 kevp->ident = kev64.ident;
1827 kevp->filter = kev64.filter;
1828 kevp->flags = kev64.flags;
1829 kevp->udata = kev64.udata;
1830 kevp->fflags = kev64.fflags;
1831 kevp->data = kev64.data;
1832 } else {
1833 struct user32_kevent kev32;
1834
1835 advance = sizeof (kev32);
1836 error = copyin(*addrp, (caddr_t)&kev32, advance);
1837 if (error)
1838 return (error);
1839 kevp->ident = (uintptr_t)kev32.ident;
1840 kevp->filter = kev32.filter;
1841 kevp->flags = kev32.flags;
1842 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1843 kevp->fflags = kev32.fflags;
1844 kevp->data = (intptr_t)kev32.data;
1845 }
1846 } else if (flags & KEVENT_FLAG_LEGACY64) {
1847 struct kevent64_s kev64;
1848
1849 bzero(kevp, sizeof (*kevp));
1850
1851 advance = sizeof (struct kevent64_s);
1852 error = copyin(*addrp, (caddr_t)&kev64, advance);
1853 if (error)
1854 return(error);
1855 kevp->ident = kev64.ident;
1856 kevp->filter = kev64.filter;
1857 kevp->flags = kev64.flags;
1858 kevp->udata = kev64.udata;
1859 kevp->fflags = kev64.fflags;
1860 kevp->data = kev64.data;
1861 kevp->ext[0] = kev64.ext[0];
1862 kevp->ext[1] = kev64.ext[1];
1863
1864 } else {
1865 struct kevent_qos_s kevqos;
1866
1867 bzero(kevp, sizeof (*kevp));
1868
1869 advance = sizeof (struct kevent_qos_s);
1870 error = copyin(*addrp, (caddr_t)&kevqos, advance);
1871 if (error)
1872 return error;
1873 kevp->ident = kevqos.ident;
1874 kevp->filter = kevqos.filter;
1875 kevp->flags = kevqos.flags;
1876 kevp->qos = kevqos.qos;
1877 // kevp->xflags = kevqos.xflags;
1878 kevp->udata = kevqos.udata;
1879 kevp->fflags = kevqos.fflags;
1880 kevp->data = kevqos.data;
1881 kevp->ext[0] = kevqos.ext[0];
1882 kevp->ext[1] = kevqos.ext[1];
1883 kevp->ext[2] = kevqos.ext[2];
1884 kevp->ext[3] = kevqos.ext[3];
1885 }
1886 if (!error)
1887 *addrp += advance;
1888 return (error);
1889 }
1890
1891 static int
1892 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
1893 unsigned int flags)
1894 {
1895 user_addr_t addr = *addrp;
1896 int advance;
1897 int error;
1898
1899 /*
1900 * fully initialize the different output event structure
1901 * types from the internal kevent (and some universal
1902 * defaults for fields not represented in the internal
1903 * form).
1904 */
1905 if (flags & KEVENT_FLAG_LEGACY32) {
1906 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
1907
1908 if (IS_64BIT_PROCESS(p)) {
1909 struct user64_kevent kev64;
1910
1911 advance = sizeof (kev64);
1912 bzero(&kev64, advance);
1913
1914 /*
1915 * deal with the special case of a user-supplied
1916 * value of (uintptr_t)-1.
1917 */
1918 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1919 (uint64_t)-1LL : (uint64_t)kevp->ident;
1920
1921 kev64.filter = kevp->filter;
1922 kev64.flags = kevp->flags;
1923 kev64.fflags = kevp->fflags;
1924 kev64.data = (int64_t) kevp->data;
1925 kev64.udata = kevp->udata;
1926 error = copyout((caddr_t)&kev64, addr, advance);
1927 } else {
1928 struct user32_kevent kev32;
1929
1930 advance = sizeof (kev32);
1931 bzero(&kev32, advance);
1932 kev32.ident = (uint32_t)kevp->ident;
1933 kev32.filter = kevp->filter;
1934 kev32.flags = kevp->flags;
1935 kev32.fflags = kevp->fflags;
1936 kev32.data = (int32_t)kevp->data;
1937 kev32.udata = kevp->udata;
1938 error = copyout((caddr_t)&kev32, addr, advance);
1939 }
1940 } else if (flags & KEVENT_FLAG_LEGACY64) {
1941 struct kevent64_s kev64;
1942
1943 advance = sizeof (struct kevent64_s);
1944 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1945 addr -= advance;
1946 }
1947 bzero(&kev64, advance);
1948 kev64.ident = kevp->ident;
1949 kev64.filter = kevp->filter;
1950 kev64.flags = kevp->flags;
1951 kev64.fflags = kevp->fflags;
1952 kev64.data = (int64_t) kevp->data;
1953 kev64.udata = kevp->udata;
1954 kev64.ext[0] = kevp->ext[0];
1955 kev64.ext[1] = kevp->ext[1];
1956 error = copyout((caddr_t)&kev64, addr, advance);
1957 } else {
1958 struct kevent_qos_s kevqos;
1959
1960 advance = sizeof (struct kevent_qos_s);
1961 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1962 addr -= advance;
1963 }
1964 bzero(&kevqos, advance);
1965 kevqos.ident = kevp->ident;
1966 kevqos.filter = kevp->filter;
1967 kevqos.flags = kevp->flags;
1968 kevqos.qos = kevp->qos;
1969 kevqos.udata = kevp->udata;
1970 kevqos.fflags = kevp->fflags;
1971 kevqos.xflags = 0;
1972 kevqos.data = (int64_t) kevp->data;
1973 kevqos.ext[0] = kevp->ext[0];
1974 kevqos.ext[1] = kevp->ext[1];
1975 kevqos.ext[2] = kevp->ext[2];
1976 kevqos.ext[3] = kevp->ext[3];
1977 error = copyout((caddr_t)&kevqos, addr, advance);
1978 }
1979 if (!error) {
1980 if (flags & KEVENT_FLAG_STACK_EVENTS)
1981 *addrp = addr;
1982 else
1983 *addrp = addr + advance;
1984 }
1985 return (error);
1986 }
1987
1988 static int
1989 kevent_get_data_size(struct proc *p,
1990 uint64_t data_available,
1991 unsigned int flags,
1992 user_size_t *residp)
1993 {
1994 user_size_t resid;
1995 int error = 0;
1996
1997 if (data_available != USER_ADDR_NULL) {
1998 if (flags & KEVENT_FLAG_KERNEL) {
1999 resid = *(user_size_t *)(uintptr_t)data_available;
2000 } else if (IS_64BIT_PROCESS(p)) {
2001 user64_size_t usize;
2002 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
2003 resid = (user_size_t)usize;
2004 } else {
2005 user32_size_t usize;
2006 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
2007 resid = (user_size_t)usize;
2008 }
2009 if (error)
2010 return(error);
2011 } else {
2012 resid = 0;
2013 }
2014 *residp = resid;
2015 return 0;
2016 }
2017
2018 static int
2019 kevent_put_data_size(struct proc *p,
2020 uint64_t data_available,
2021 unsigned int flags,
2022 user_size_t resid)
2023 {
2024 int error = 0;
2025
2026 if (data_available) {
2027 if (flags & KEVENT_FLAG_KERNEL) {
2028 *(user_size_t *)(uintptr_t)data_available = resid;
2029 } else if (IS_64BIT_PROCESS(p)) {
2030 user64_size_t usize = (user64_size_t)resid;
2031 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
2032 } else {
2033 user32_size_t usize = (user32_size_t)resid;
2034 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
2035 }
2036 }
2037 return error;
2038 }
2039
2040 /*
2041 * kevent_continue - continue a kevent syscall after blocking
2042 *
2043 * assume we inherit a use count on the kq fileglob.
2044 */
2045
2046 __attribute__((noreturn))
2047 static void
2048 kevent_continue(__unused struct kqueue *kq, void *data, int error)
2049 {
2050 struct _kevent *cont_args;
2051 struct fileproc *fp;
2052 uint64_t data_available;
2053 user_size_t data_size;
2054 user_size_t data_resid;
2055 unsigned int flags;
2056 int32_t *retval;
2057 int noutputs;
2058 int fd;
2059 struct proc *p = current_proc();
2060
2061 cont_args = (struct _kevent *)data;
2062 data_available = cont_args->data_available;
2063 flags = cont_args->process_data.fp_flags;
2064 data_size = cont_args->process_data.fp_data_size;
2065 data_resid = cont_args->process_data.fp_data_resid;
2066 noutputs = cont_args->eventout;
2067 retval = cont_args->retval;
2068 fd = cont_args->fd;
2069 fp = cont_args->fp;
2070
2071 if (fp != NULL)
2072 fp_drop(p, fd, fp, 0);
2073
2074 /* don't abandon other output just because of residual copyout failures */
2075 if (error == 0 && data_available && data_resid != data_size) {
2076 (void)kevent_put_data_size(p, data_available, flags, data_resid);
2077 }
2078
2079 /* don't restart after signals... */
2080 if (error == ERESTART)
2081 error = EINTR;
2082 else if (error == EWOULDBLOCK)
2083 error = 0;
2084 if (error == 0)
2085 *retval = noutputs;
2086 unix_syscall_return(error);
2087 }
2088
2089 /*
2090 * kevent - [syscall] register and wait for kernel events
2091 *
2092 */
2093 int
2094 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
2095 {
2096 unsigned int flags = KEVENT_FLAG_LEGACY32;
2097
2098 return kevent_internal(p,
2099 uap->fd,
2100 uap->changelist, uap->nchanges,
2101 uap->eventlist, uap->nevents,
2102 0ULL, 0ULL,
2103 flags,
2104 uap->timeout,
2105 kevent_continue,
2106 retval);
2107 }
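/*
 * Illustrative sketch (comment only, not compiled here) of the classic
 * user-space shape of the legacy kevent(2) path above. Names such as
 * sock_fd and handle_readable are hypothetical; the declarations come
 * from <sys/event.h>.
 *
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *
 *	int kq = kqueue();
 *	struct kevent chg, ev;
 *	struct timespec ts = { 5, 0 };		// wait up to 5 seconds
 *
 *	// register interest and wait for one event in a single call
 *	EV_SET(&chg, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &chg, 1, &ev, 1, &ts);
 *	if (n > 0)
 *		handle_readable((int)ev.ident, ev.data);	// hypothetical handler
 */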
2108
2109 int
2110 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
2111 {
2112 unsigned int flags;
2113
2114 /* restrict to user flags and set legacy64 */
2115 flags = uap->flags & KEVENT_FLAG_USER;
2116 flags |= KEVENT_FLAG_LEGACY64;
2117
2118 return kevent_internal(p,
2119 uap->fd,
2120 uap->changelist, uap->nchanges,
2121 uap->eventlist, uap->nevents,
2122 0ULL, 0ULL,
2123 flags,
2124 uap->timeout,
2125 kevent_continue,
2126 retval);
2127 }
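/*
 * Illustrative sketch (comment only): the same registration expressed
 * through the LEGACY64 path above, using struct kevent64_s and EV_SET64
 * from <sys/event.h>. EV_SET64's trailing two arguments populate
 * ext[0]/ext[1]; kq and fd are hypothetical.
 *
 *	struct kevent64_s chg, ev;
 *
 *	// register fd, then block until it becomes readable
 *	EV_SET64(&chg, fd, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
 *	int n = kevent64(kq, &chg, 1, &ev, 1, 0, NULL);	// flags 0, no timeout
 */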
2128
2129 int
2130 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
2131 {
2132 /* restrict to user flags */
2133 uap->flags &= KEVENT_FLAG_USER;
2134
2135 return kevent_internal(p,
2136 uap->fd,
2137 uap->changelist, uap->nchanges,
2138 uap->eventlist, uap->nevents,
2139 uap->data_out, (uint64_t)uap->data_available,
2140 uap->flags,
2141 0ULL,
2142 kevent_continue,
2143 retval);
2144 }
2145
2146 int
2147 kevent_qos_internal(struct proc *p, int fd,
2148 user_addr_t changelist, int nchanges,
2149 user_addr_t eventlist, int nevents,
2150 user_addr_t data_out, user_size_t *data_available,
2151 unsigned int flags,
2152 int32_t *retval)
2153 {
2154 return kevent_internal(p,
2155 fd,
2156 changelist, nchanges,
2157 eventlist, nevents,
2158 data_out, (uint64_t)data_available,
2159 (flags | KEVENT_FLAG_KERNEL),
2160 0ULL,
2161 NULL,
2162 retval);
2163 }
2164
2165 static int
2166 kevent_get_timeout(struct proc *p,
2167 user_addr_t utimeout,
2168 unsigned int flags,
2169 struct timeval *atvp)
2170 {
2171 struct timeval atv;
2172 int error = 0;
2173
2174 if (flags & KEVENT_FLAG_IMMEDIATE) {
2175 getmicrouptime(&atv);
2176 } else if (utimeout != USER_ADDR_NULL) {
2177 struct timeval rtv;
2178 if (flags & KEVENT_FLAG_KERNEL) {
2179 struct timespec *tsp = (struct timespec *)utimeout;
2180 TIMESPEC_TO_TIMEVAL(&rtv, tsp);
2181 } else if (IS_64BIT_PROCESS(p)) {
2182 struct user64_timespec ts;
2183 error = copyin(utimeout, &ts, sizeof(ts));
2184 if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
2185 error = EINVAL;
2186 else
2187 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
2188 } else {
2189 struct user32_timespec ts;
2190 error = copyin(utimeout, &ts, sizeof(ts));
2191 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
2192 }
2193 if (error)
2194 return (error);
2195 if (itimerfix(&rtv))
2196 return (EINVAL);
2197 getmicrouptime(&atv);
2198 timevaladd(&atv, &rtv);
2199 } else {
2200 /* wait forever value */
2201 atv.tv_sec = 0;
2202 atv.tv_usec = 0;
2203 }
2204 *atvp = atv;
2205 return 0;
2206 }
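/*
 * Illustrative sketch (comment only) of the timeout cases handled above,
 * as seen from user space: a NULL timeout blocks indefinitely, while a
 * zeroed timespec polls (the computed deadline is "now", matching
 * KEVENT_FLAG_IMMEDIATE). On the 64-bit copyin path a tv_sec that does
 * not fit in 32 bits is rejected with EINVAL. evs/nevs name a
 * hypothetical output buffer.
 *
 *	struct timespec poll_now = { 0, 0 };
 *
 *	// poll: returns immediately with whatever is already pending
 *	int n = kevent(kq, NULL, 0, evs, nevs, &poll_now);
 *
 *	// block: sleeps until an event fires or a signal interrupts
 *	n = kevent(kq, NULL, 0, evs, nevs, NULL);
 */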
2207
2208 static int
2209 kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
2210 {
2211 /* each kq should only be used for events of one type */
2212 kqlock(kq);
2213 if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
2214 if (flags & KEVENT_FLAG_LEGACY32) {
2215 if ((kq->kq_state & KQ_KEV32) == 0) {
2216 kqunlock(kq);
2217 return EINVAL;
2218 }
2219 } else if (kq->kq_state & KQ_KEV32) {
2220 kqunlock(kq);
2221 return EINVAL;
2222 }
2223 } else if (flags & KEVENT_FLAG_LEGACY32) {
2224 kq->kq_state |= KQ_KEV32;
2225 } else {
2226 /* JMM - set KQ_KEVQOS when we are ready for exclusive */
2227 kq->kq_state |= KQ_KEV64;
2228 }
2229 kqunlock(kq);
2230 return 0;
2231 }
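/*
 * Illustrative sketch (comment only) of the single-mode rule enforced
 * above: once a kqueue has been driven through the legacy 32-bit path it
 * refuses the wider forms, and vice versa. The failure happens before
 * any change is registered. kq and fd are hypothetical.
 *
 *	struct kevent k32;
 *	struct kevent64_s k64;
 *
 *	EV_SET(&k32, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &k32, 1, NULL, 0, NULL);	// marks the kq KQ_KEV32
 *
 *	EV_SET64(&k64, fd, EVFILT_WRITE, EV_ADD, 0, 0, 0, 0, 0);
 *	int r = kevent64(kq, &k64, 1, NULL, 0, 0, NULL);
 *	// r == -1 and errno == EINVAL: mixing event widths on one kq
 */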
2232
2233 static int
2234 kevent_get_kq(struct proc *p, int fd, unsigned int flags, struct fileproc **fpp, struct kqueue **kqp)
2235 {
2236 struct fileproc *fp = NULL;
2237 struct kqueue *kq;
2238 int error;
2239
2240 if (flags & KEVENT_FLAG_WORKQ) {
2241 /*
2242 * use the private kq associated with the proc workq.
2243 * Just being a thread within the process (and not
2244 * being the exit/exec thread) is enough to hold a
2245 * reference on this special kq.
2246 */
2247 kq = p->p_wqkqueue;
2248 if (kq == NULL) {
2249 struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
2250 if (alloc_kq == NULL)
2251 return ENOMEM;
2252
2253 proc_fdlock(p);
2254 if (p->p_wqkqueue == NULL) {
2255 kq = p->p_wqkqueue = alloc_kq;
2256 proc_fdunlock(p);
2257 } else {
2258 proc_fdunlock(p);
2259 kq = p->p_wqkqueue;
2260 kqueue_dealloc(alloc_kq);
2261 }
2262 }
2263 } else {
2264 /* get a usecount for the kq itself */
2265 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
2266 return (error);
2267 }
2268 if ((error = kevent_set_kq_mode(kq, flags)) != 0) {
2269 /* drop the usecount */
2270 if (fp != NULL)
2271 fp_drop(p, fd, fp, 0);
2272 return error;
2273 }
2274
2275 *fpp = fp;
2276 *kqp = kq;
2277 return 0;
2278 }
2279
2280
2281 static int
2282 kevent_internal(struct proc *p,
2283 int fd,
2284 user_addr_t changelist, int nchanges,
2285 user_addr_t ueventlist, int nevents,
2286 user_addr_t data_out, uint64_t data_available,
2287 unsigned int flags,
2288 user_addr_t utimeout,
2289 kqueue_continue_t continuation,
2290 int32_t *retval)
2291 {
2292 struct _kevent *cont_args;
2293 uthread_t ut;
2294 struct kqueue *kq;
2295 struct fileproc *fp = NULL;
2296 struct kevent_internal_s kev;
2297 int error, noutputs;
2298 struct timeval atv;
2299 user_size_t data_size;
2300 user_size_t data_resid;
2301
2302 /* Don't allow user-space threads to process output events from the workq kq */
2303 if ((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ &&
2304 !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0)
2305 return EINVAL;
2306
2307 /* prepare to deal with stack-wise allocation of out events */
2308 if (flags & KEVENT_FLAG_STACK_EVENTS) {
2309 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
2310 (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
2311 sizeof(struct user32_kevent)) :
2312 ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
2313 sizeof(struct kevent_qos_s)));
2314 ueventlist += nevents * scale;
2315 }
2316
2317 /* convert timeout to absolute - if we have one (and not immediate) */
2318 error = kevent_get_timeout(p, utimeout, flags, &atv);
2319 if (error)
2320 return error;
2321
2322 /* copyin initial value of data residual from data_available */
2323 error = kevent_get_data_size(p, data_available, flags, &data_size);
2324 if (error)
2325 return error;
2326
2327 /* get the kq we are going to be working on */
2328 error = kevent_get_kq(p, fd, flags, &fp, &kq);
2329 if (error)
2330 return error;
2331
2332 /* register all the change requests the user provided... */
2333 noutputs = 0;
2334 while (nchanges > 0 && error == 0) {
2335 error = kevent_copyin(&changelist, &kev, p, flags);
2336 if (error)
2337 break;
2338
2339 /* Make sure user doesn't pass in any system flags */
2340 kev.flags &= ~EV_SYSFLAGS;
2341
2342 kevent_register(kq, &kev, p);
2343
2344 if (nevents > 0 &&
2345 ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) {
2346 if (kev.flags & EV_RECEIPT) {
2347 kev.flags |= EV_ERROR;
2348 kev.data = 0;
2349 }
2350 error = kevent_copyout(&kev, &ueventlist, p, flags);
2351 if (error == 0) {
2352 nevents--;
2353 noutputs++;
2354 }
2355 } else if (kev.flags & EV_ERROR) {
2356 error = kev.data;
2357 }
2358 nchanges--;
2359 }
2360
2361 /* short-circuit the scan if we only want error events */
2362 if (flags & KEVENT_FLAG_ERROR_EVENTS)
2363 nevents = 0;
2364
2365 /* process pending events */
2366 if (nevents > 0 && noutputs == 0 && error == 0) {
2367
2368 /* store the continuation/completion data in the uthread */
2369 ut = (uthread_t)get_bsdthread_info(current_thread());
2370 cont_args = &ut->uu_kevent.ss_kevent;
2371 cont_args->fp = fp;
2372 cont_args->fd = fd;
2373 cont_args->retval = retval;
2374 cont_args->eventlist = ueventlist;
2375 cont_args->eventcount = nevents;
2376 cont_args->eventout = noutputs;
2377 cont_args->data_available = data_available;
2378 cont_args->process_data.fp_fd = fd;
2379 cont_args->process_data.fp_flags = flags;
2380 cont_args->process_data.fp_data_out = data_out;
2381 cont_args->process_data.fp_data_size = data_size;
2382 cont_args->process_data.fp_data_resid = data_size;
2383
2384 error = kqueue_scan(kq, kevent_callback,
2385 continuation, cont_args,
2386 &cont_args->process_data,
2387 &atv, p);
2388
2389 /* process remaining outputs */
2390 noutputs = cont_args->eventout;
2391 data_resid = cont_args->process_data.fp_data_resid;
2392
2393 /* copyout residual data size value (if it needs to be copied out) */
2394 /* don't abandon other output just because of residual copyout failures */
2395 if (error == 0 && data_available && data_resid != data_size) {
2396 (void)kevent_put_data_size(p, data_available, flags, data_resid);
2397 }
2398 }
2399
2400 /* don't restart after signals... */
2401 if (error == ERESTART)
2402 error = EINTR;
2403 else if (error == EWOULDBLOCK)
2404 error = 0;
2405 if (error == 0)
2406 *retval = noutputs;
2407 if (fp != NULL)
2408 fp_drop(p, fd, fp, 0);
2409 return (error);
2410 }
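/*
 * Illustrative sketch (comment only) of the EV_RECEIPT handling in the
 * registration loop above: each change is reflected back as an EV_ERROR
 * record whose data field is 0 on success or an errno otherwise, and the
 * call returns with the receipts rather than scanning for normal events.
 * fd0, fd1 and report_failure are hypothetical.
 *
 *	struct kevent chgs[2], res[2];
 *
 *	EV_SET(&chgs[0], fd0, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	EV_SET(&chgs[1], fd1, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	int n = kevent(kq, chgs, 2, res, 2, NULL);	// n == 2 receipts
 *	for (int i = 0; i < n; i++)
 *		if ((res[i].flags & EV_ERROR) && res[i].data != 0)
 *			report_failure(i, (int)res[i].data);	// hypothetical
 */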
2411
2412
2413 /*
2414 * kevent_callback - callback for each individual event
2415 *
2416 * called with nothing locked
2417 * caller holds a reference on the kqueue
2418 */
2419 static int
2420 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
2421 void *data)
2422 {
2423 struct _kevent *cont_args;
2424 int error;
2425
2426 cont_args = (struct _kevent *)data;
2427 assert(cont_args->eventout < cont_args->eventcount);
2428
2429 /*
2430 * Copy out the appropriate amount of event data for this user.
2431 */
2432 error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
2433 cont_args->process_data.fp_flags);
2434
2435 /*
2436 * If there isn't space for additional events, return
2437 * a harmless error to stop the processing here
2438 */
2439 if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
2440 error = EWOULDBLOCK;
2441 return (error);
2442 }
2443
2444 /*
2445 * kevent_description - format a description of a kevent for diagnostic output
2446 *
2447 * called with a 256-byte string buffer
2448 */
2449
2450 char *
2451 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
2452 {
2453 snprintf(s, n,
2454 "kevent="
2455 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
2456 kevp->ident,
2457 kevp->filter,
2458 kevp->flags,
2459 kevp->udata,
2460 kevp->fflags,
2461 kevp->data,
2462 kevp->ext[0],
2463 kevp->ext[1] );
2464
2465 return (s);
2466 }
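/*
 * Minimal in-kernel usage sketch (comment only), assuming the caller
 * supplies the 256-byte buffer mentioned above:
 *
 *	char buf[256];
 *	printf("bad knote: %s\n", kevent_description(kevp, buf, sizeof(buf)));
 */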
2467
2468 /*
2469 * kevent_register - add a new event to a kqueue
2470 *
2471 * Creates a mapping between the event source and
2472 * the kqueue via a knote data structure.
2473 *
2474 * Because many/most of the event sources are file
2475 * descriptor related, the knote is linked off
2476 * the file descriptor table for quick access.
2477 *
2478 * called with nothing locked
2479 * caller holds a reference on the kqueue
2480 */
2481
2482 void
2483 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
2484 __unused struct proc *ctxp)
2485 {
2486 struct proc *p = kq->kq_p;
2487 struct filterops *fops;
2488 struct knote *kn = NULL;
2489 int result = 0;
2490 int error = 0;
2491
2492 if (kev->filter < 0) {
2493 if (kev->filter + EVFILT_SYSCOUNT < 0) {
2494 error = EINVAL;
2495 goto out;
2496 }
2497 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
2498 } else {
2499 error = EINVAL;
2500 goto out;
2501 }
2502
2503 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
2504 if ((kev->flags & EV_VANISHED) &&
2505 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
2506 error = EINVAL;
2507 goto out;
2508 }
2509
2510 /* Simplify the flags - delete and disable overrule */
2511 if (kev->flags & EV_DELETE)
2512 kev->flags &= ~EV_ADD;
2513 if (kev->flags & EV_DISABLE)
2514 kev->flags &= ~EV_ENABLE;
2515
2516 restart:
2517
2518 proc_fdlock(p);
2519
2520 /* find the matching knote from the fd tables/hashes */
2521 kn = knote_fdfind(kq, kev, p);
2522
2523 if (kn == NULL) {
2524 if (kev->flags & EV_ADD) {
2525 struct fileproc *fp = NULL;
2526
2527 /* grab a file reference for the new knote */
2528 if (fops->f_isfd) {
2529 if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
2530 proc_fdunlock(p);
2531 goto out;
2532 }
2533 }
2534
2535 kn = knote_alloc();
2536 if (kn == NULL) {
2537 proc_fdunlock(p);
2538 error = ENOMEM;
2539 if (fp != NULL)
2540 fp_drop(p, kev->ident, fp, 0);
2541 goto out;
2542 }
2543
2544 kn->kn_fp = fp;
2545 knote_set_kq(kn,kq);
2546 kn->kn_filtid = ~kev->filter;
2547 kn->kn_inuse = 1; /* for f_attach() */
2548 kn->kn_status = KN_ATTACHING | KN_ATTACHED;
2549
2550 /* was vanish support requested */
2551 if (kev->flags & EV_VANISHED) {
2552 kev->flags &= ~EV_VANISHED;
2553 kn->kn_status |= KN_REQVANISH;
2554 }
2555
2556 /* snapshot matching/dispatching protocol flags into knote */
2557 if (kev->flags & EV_DISPATCH)
2558 kn->kn_status |= KN_DISPATCH;
2559 if (kev->flags & EV_UDATA_SPECIFIC)
2560 kn->kn_status |= KN_UDATA_SPECIFIC;
2561
2562 /*
2563 * copy the kevent state into the knote.
2564 * The protocol is that fflags and data
2565 * are saved off, and cleared before
2566 * calling the attach routine.
2567 */
2568 kn->kn_kevent = *kev;
2569 kn->kn_sfflags = kev->fflags;
2570 kn->kn_sdata = kev->data;
2571 kn->kn_fflags = 0;
2572 kn->kn_data = 0;
2573
2574 /* invoke pthread kext to convert kevent qos to thread qos */
2575 if (kq->kq_state & KQ_WORKQ) {
2576 kn->kn_qos = canonicalize_kevent_qos(kn->kn_qos);
2577 knote_set_qos_index(kn, qos_index_from_qos(kn->kn_qos, FALSE));
2578 knote_set_qos_override_index(kn, QOS_INDEX_KQFILE);
2579 assert(knote_get_qos_index(kn) < KQWQ_NQOS);
2580 } else {
2581 knote_set_qos_index(kn, QOS_INDEX_KQFILE);
2582 knote_set_qos_override_index(kn, QOS_INDEX_KQFILE);
2583 }
2584
2585 /* before anyone can find it */
2586 if (kev->flags & EV_DISABLE)
2587 knote_disable(kn);
2588
2589 /* Add the knote for lookup thru the fd table */
2590 error = knote_fdadd(kn, p);
2591 proc_fdunlock(p);
2592
2593 if (error) {
2594 knote_free(kn);
2595 if (fp != NULL)
2596 fp_drop(p, kev->ident, fp, 0);
2597 goto out;
2598 }
2599
2600 /* fp reference count now applies to knote */
2601
2602 /* call filter attach routine */
2603 result = fops->f_attach(kn);
2604
2605 /*
2606 * Trade knote use count for kq lock.
2607 * Cannot be dropped because we held
2608 * KN_ATTACHING throughout.
2609 */
2610 knoteuse2kqlock(kq, kn, 1);
2611
2612 if (kn->kn_flags & EV_ERROR) {
2613 /*
2614 * Failed to attach correctly, so drop.
2615 * All other possible users/droppers
2616 * have deferred to us. Save the error
2617 * to return to our caller.
2618 */
2619 kn->kn_status &= ~KN_ATTACHED;
2620 kn->kn_status |= KN_DROPPING;
2621 error = kn->kn_data;
2622 kqunlock(kq);
2623 knote_drop(kn, p);
2624 goto out;
2625 }
2626
2627 /* end "attaching" phase - now just attached */
2628 kn->kn_status &= ~KN_ATTACHING;
2629
2630 if (kn->kn_status & KN_DROPPING) {
2631 /*
2632 * Attach succeeded, but someone else
2633 * deferred their drop - now we have
2634 * to do it for them.
2635 */
2636 kqunlock(kq);
2637 knote_drop(kn, p);
2638 goto out;
2639 }
2640
2641 /*
2642 * If the attach routine indicated that an
2643 * event is already fired, activate the knote.
2644 */
2645 if (result)
2646 knote_activate(kn);
2647
2648 } else {
2649 proc_fdunlock(p);
2650 error = ENOENT;
2651 goto out;
2652 }
2653
2654 } else {
2655 /* existing knote - get kqueue lock */
2656 kqlock(kq);
2657 proc_fdunlock(p);
2658
2659 if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
2660 /*
2661 * The knote is not in a stable state, wait for that
2662 * transition to complete and then redrive the lookup.
2663 */
2664 kn->kn_status |= KN_USEWAIT;
2665 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
2666 CAST_EVENT64_T(&kn->kn_status),
2667 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
2668 kqunlock(kq);
2669 thread_block(THREAD_CONTINUE_NULL);
2670 goto restart;
2671 }
2672
2673 if (kev->flags & EV_DELETE) {
2674
2675 /*
2676 * If attempting to delete a disabled dispatch2 knote,
2677 * we must wait for the knote to be re-enabled (unless
2678 * it is being re-enabled atomically here).
2679 */
2680 if ((kev->flags & EV_ENABLE) == 0 &&
2681 (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
2682 (KN_DISPATCH2 | KN_DISABLED)) {
2683 kn->kn_status |= KN_DEFERDELETE;
2684 kqunlock(kq);
2685 error = EINPROGRESS;
2686 } else if (kqlock2knotedrop(kq, kn)) {
2687 knote_drop(kn, p);
2688 } else {
2689 /*
2690 * The kqueue is unlocked, it's not being
2691 * dropped, and kqlock2knotedrop returned 0:
2692 * this means that someone stole the drop of
2693 * the knote from us.
2694 */
2695 error = EINPROGRESS;
2696 }
2697 goto out;
2698 }
2699
2700 /*
2701 * If we are re-enabling a deferred-delete knote,
2702 * just enable it now and avoid calling the
2703 * filter touch routine (it has delivered its
2704 * last event already).
2705 */
2706 if ((kev->flags & EV_ENABLE) &&
2707 (kn->kn_status & KN_DEFERDELETE)) {
2708 assert(kn->kn_status & KN_DISABLED);
2709 knote_activate(kn);
2710 knote_enable(kn);
2711 kqunlock(kq);
2712 goto out;
2713 }
2714
2715 /*
2716 * If we are disabling, do it before unlocking and
2717 * calling the touch routine (so no processing can
2718 * see the new kevent state before the disable is
2719 * applied).
2720 */
2721 if (kev->flags & EV_DISABLE)
2722 knote_disable(kn);
2723
2724 /*
2725 * Convert the kqlock to a use reference on the
2726 * knote so we can call the filter touch routine.
2727 */
2728 if (kqlock2knoteuse(kq, kn)) {
2729
2730 /*
2731 * Call touch routine to notify filter of changes
2732 * in filter values (and to re-determine if any
2733 * events are fired).
2734 */
2735 result = knote_fops(kn)->f_touch(kn, kev);
2736
2737 /* Get the kq lock back (don't defer droppers). */
2738 if (!knoteuse2kqlock(kq, kn, 0)) {
2739 kqunlock(kq);
2740 goto out;
2741 }
2742
2743 /* Activate it if the touch routine said to */
2744 if (result)
2745 knote_activate(kn);
2746 }
2747
2748 /* Enable the knote if called for */
2749 if (kev->flags & EV_ENABLE)
2750 knote_enable(kn);
2751
2752 }
2753
2754 /* still have kqlock held and knote is valid */
2755 kqunlock(kq);
2756
2757 out:
2758 /* output local errors through the kevent */
2759 if (error) {
2760 kev->flags |= EV_ERROR;
2761 kev->data = error;
2762 }
2763 }
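/*
 * Illustrative sketch (comment only) of the flag simplification done in
 * kevent_register: EV_DELETE overrules EV_ADD and EV_DISABLE overrules
 * EV_ENABLE, and deleting a knote that was never registered surfaces
 * ENOENT (as an EV_ERROR record when event slots were supplied, or as the
 * syscall error otherwise). kq, fd and zero_ts (a zeroed timespec) are
 * hypothetical.
 *
 *	struct kevent chg, res;
 *
 *	// behaves as a pure delete; the EV_ADD bit is stripped first
 *	EV_SET(&chg, fd, EVFILT_READ, EV_ADD | EV_DELETE, 0, 0, NULL);
 *	int n = kevent(kq, &chg, 1, &res, 1, &zero_ts);
 *	if (n == 1 && (res.flags & EV_ERROR) && res.data == ENOENT)
 *		;	// nothing was registered for fd/EVFILT_READ
 */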
2764
2765
2766 /*
2767 * knote_process - process a triggered event
2768 *
2769 * Validate that it is really still a triggered event
2770 * by calling the filter routines (if necessary). Hold
2771 * a use reference on the knote to avoid it being detached.
2772 *
2773 * If it is still considered triggered, we will have taken
2774 * a copy of the state under the filter lock. We use that
2775 * snapshot to dispatch the knote for future processing (or
2776 * not, if this was a lost event).
2777 *
2778 * Our caller assures us that nobody else can be processing
2779 * events from this knote during the whole operation. But
2780 * others can be touching or posting events to the knote
2781 * interspersed with our processing it.
2782 *
2783 * caller holds a reference on the kqueue.
2784 * kqueue locked on entry and exit - but may be dropped
2785 */
2786 static int
2787 knote_process(struct knote *kn,
2788 kevent_callback_t callback,
2789 void *callback_data,
2790 struct filt_process_s *process_data,
2791 struct proc *p)
2792 {
2793 struct kevent_internal_s kev;
2794 struct kqueue *kq = knote_get_kq(kn);
2795 int result = 0;
2796 int error = 0;
2797
2798 bzero(&kev, sizeof(kev));
2799
2800 /*
2801 * Must be active or stayactive
2802 * Must be queued and not disabled/suppressed
2803 */
2804 assert(kn->kn_status & KN_QUEUED);
2805 assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE));
2806 assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING)));
2807
2808 /*
2809 * For deferred-drop or vanished events, we just create a fake
2810 * event to acknowledge end-of-life. Otherwise, we call the
2811 * filter's process routine to snapshot the kevent state under
2812 * the filter's locking protocol.
2813 */
2814 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
2815 /* create fake event */
2816 kev.filter = kn->kn_filter;
2817 kev.ident = kn->kn_id;
2818 kev.qos = kn->kn_qos;
2819 kev.flags = (kn->kn_status & KN_DEFERDELETE) ?
2820 EV_DELETE : EV_VANISHED;
2821 kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
2822 kev.udata = kn->kn_udata;
2823 result = 1;
2824
2825 knote_suppress(kn);
2826 } else {
2827
2828 /* deactivate - so new activations indicate a wakeup */
2829 knote_deactivate(kn);
2830
2831 /* suppress knotes to avoid returning the same event multiple times in a single call. */
2832 knote_suppress(kn);
2833
2834 /* convert lock to a knote use reference */
2835 if (!kqlock2knoteuse(kq, kn))
2836 panic("dropping knote found on queue\n");
2837
2838 /* call out to the filter to process with just a ref */
2839 result = knote_fops(kn)->f_process(kn, process_data, &kev);
2840
2841 /*
2842 * convert our reference back to a lock. accept drop
2843 * responsibility from others if we've committed to
2844 * delivering event data.
2845 */
2846 if (!knoteuse2kqlock(kq, kn, result)) {
2847 /* knote dropped */
2848 kn = NULL;
2849 }
2850 }
2851
2852 if (kn != NULL) {
2853 /*
2854 * Determine how to dispatch the knote for future event handling.
2855 * not-fired: just return (do not call out, leave deactivated).
2856 * One-shot: If dispatch2, enter deferred-delete mode (unless this
2857 * is the deferred delete event delivery itself). Otherwise,
2858 * drop it.
2859 * stolendrop: We took responsibility for someone else's drop attempt.
2860 * treat this just like one-shot and prepare to turn it back
2861 * into a deferred delete if required.
2862 * Dispatch: don't clear state, just mark it disabled.
2863 * Cleared: just leave it deactivated.
2864 * Others: re-activate as there may be more events to handle.
2865 * This will not wake up more handlers right now, but
2866 * at the completion of handling events it may trigger
2867 * more handler threads (TODO: optimize based on more than
2868 * just this one event being detected by the filter).
2869 */
2870
2871 if (result == 0)
2872 return (EJUSTRETURN);
2873
2874 if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) {
2875 if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
2876 /* defer dropping non-delete oneshot dispatch2 events */
2877 kn->kn_status |= KN_DEFERDELETE;
2878 knote_disable(kn);
2879
2880 /* if we took over another's drop clear those flags here */
2881 if (kn->kn_status & KN_STOLENDROP) {
2882 assert(kn->kn_status & KN_DROPPING);
2883 /*
2884 * the knote will be dropped when the
2885 * deferred deletion occurs
2886 */
2887 kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP);
2888 }
2889 } else if (kn->kn_status & KN_STOLENDROP) {
2890 /* We now own the drop of the knote. */
2891 assert(kn->kn_status & KN_DROPPING);
2892 knote_unsuppress(kn);
2893 kqunlock(kq);
2894 knote_drop(kn, p);
2895 kqlock(kq);
2896 } else if (kqlock2knotedrop(kq, kn)) {
2897 /* just EV_ONESHOT, _not_ DISPATCH2 */
2898 knote_drop(kn, p);
2899 kqlock(kq);
2900 }
2901 } else if (kn->kn_status & KN_DISPATCH) {
2902 /* disable all dispatch knotes */
2903 knote_disable(kn);
2904 } else if ((kev.flags & EV_CLEAR) == 0) {
2905 /* re-activate in case there are more events */
2906 knote_activate(kn);
2907 }
2908 }
2909
2910 /*
2911 * callback to handle each event as we find it.
2912 * If we have to detach and drop the knote, do
2913 * it while we have the kq unlocked.
2914 */
2915 if (result) {
2916 kqunlock(kq);
2917 error = (callback)(kq, &kev, callback_data);
2918 kqlock(kq);
2919 }
2920 return (error);
2921 }
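/*
 * Illustrative sketch (comment only) of the dispatch case described
 * above: an EV_DISPATCH knote is disabled after delivery and must be
 * explicitly re-enabled by the handler before it can fire again. kq and
 * fd are hypothetical.
 *
 *	struct kevent kev;
 *
 *	// register: delivered once per wakeup, then auto-disabled
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_CLEAR | EV_DISPATCH, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// ... after the delivered event has been fully handled ...
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE | EV_DISPATCH, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	// re-arm
 */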
2922
2923
2924 /*
2925 * Return 0 to indicate that processing should proceed,
2926 * -1 if there is nothing to process.
2927 *
2928 * Called with kqueue locked and returns the same way,
2929 * but may drop lock temporarily.
2930 */
2931 static int
2932 kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
2933 {
2934 struct kqrequest *kqr;
2935 thread_t self = current_thread();
2936 __assert_only struct uthread *ut = get_bsdthread_info(self);
2937 thread_t thread;
2938
2939 assert(kqwq->kqwq_state & KQ_WORKQ);
2940 assert(qos_index < KQWQ_NQOS);
2941
2942 kqwq_req_lock(kqwq);
2943 kqr = kqworkq_get_request(kqwq, qos_index);
2944
2945 thread = kqr->kqr_thread;
2946
2947 /* manager skips buckets that haven't asked for its help */
2948 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
2949
2950 /* If nothing for manager to do, just return */
2951 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
2952 assert(kqr->kqr_thread != self);
2953 kqwq_req_unlock(kqwq);
2954 return -1;
2955 }
2956
2957 /* bind manager thread from this time on */
2958 kqworkq_bind_thread(kqwq, qos_index, self, flags);
2959
2960 } else {
2961 /* must have been bound by now */
2962 assert(thread == self);
2963 assert(ut->uu_kqueue_bound == qos_index);
2964 assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags);
2965 }
2966
2967 /* nobody else should still be processing */
2968 assert(kqr->kqr_state & KQWQ_THREQUESTED);
2969 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
2970
2971 /* anything left to process? */
2972 if (kqueue_queue_empty(&kqwq->kqwq_kqueue, qos_index)) {
2973 kqwq_req_unlock(kqwq);
2974 return -1;
2975 }
2976
2977 /* convert to processing mode */
2978 /* reset workq triggers and thread requests - maybe processing */
2979 kqr->kqr_state &= ~(KQWQ_HOOKCALLED | KQWQ_WAKEUP);
2980 kqr->kqr_state |= KQWQ_PROCESSING;
2981 kqwq_req_unlock(kqwq);
2982 return 0;
2983 }
2984
2985 /*
2986 * Return 0 to indicate that processing should proceed,
2987 * -1 if there is nothing to process.
2988 *
2989 * Called with kqueue locked and returns the same way,
2990 * but may drop lock temporarily.
2991 * May block.
2992 */
2993 static int
2994 kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
2995 {
2996 struct kqtailq *suppressq;
2997
2998 if (kq->kq_state & KQ_WORKQ)
2999 return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags);
3000
3001 assert(qos_index == QOS_INDEX_KQFILE);
3002
3003 /* wait to become the exclusive processing thread */
3004 for (;;) {
3005 if (kq->kq_state & KQ_DRAIN)
3006 return -1;
3007
3008 if ((kq->kq_state & KQ_PROCESSING) == 0)
3009 break;
3010
3011 /* if someone else is processing the queue, wait */
3012 kq->kq_state |= KQ_PROCWAIT;
3013 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3014 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
3015 CAST_EVENT64_T(suppressq),
3016 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
3017
3018 kqunlock(kq);
3019 thread_block(THREAD_CONTINUE_NULL);
3020 kqlock(kq);
3021 }
3022
3023 /* Nobody else processing */
3024
3025 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
3026 waitq_set_clear_preposts(&kq->kq_wqs);
3027 kq->kq_state &= ~KQ_WAKEUP;
3028
3029 /* anything left to process? */
3030 if (kqueue_queue_empty(kq, qos_index))
3031 return -1;
3032
3033 /* convert to processing mode */
3034 kq->kq_state |= KQ_PROCESSING;
3035
3036 return 0;
3037 }
3038
3039 /*
3040 * kqworkq_end_processing - Complete the processing of a workq kqueue
3041 *
3042 * We may have to request new threads.
3043 * This can happen when there are no waiting processing threads and:
3044 * - there were active events we never got to (count > 0)
3045 * - we pended waitq hook callouts during processing
3046 * - we pended wakeups while processing (or unsuppressing)
3047 *
3048 * Called with kqueue lock held.
3049 */
3050 static void
3051 kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
3052 {
3053 #pragma unused(flags)
3054
3055 struct kqueue *kq = &kqwq->kqwq_kqueue;
3056 struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3057
3058 thread_t self = current_thread();
3059 __assert_only struct uthread *ut = get_bsdthread_info(self);
3060 struct knote *kn;
3061 struct kqrequest *kqr;
3062 int queued_events;
3063 uint16_t pended;
3064 thread_t thread;
3065
3066 assert(kqwq->kqwq_state & KQ_WORKQ);
3067 assert(qos_index < KQWQ_NQOS);
3068
3069 /* leave early if we are not even processing */
3070 kqwq_req_lock(kqwq);
3071 kqr = kqworkq_get_request(kqwq, qos_index);
3072 thread = kqr->kqr_thread;
3073
3074 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
3075 assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER);
3076 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
3077
3078 /* if this bucket didn't need manager help, bail */
3079 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
3080 assert(thread != self);
3081 kqwq_req_unlock(kqwq);
3082 return;
3083 }
3084
3085 assert(kqr->kqr_state & KQWQ_THREQUESTED);
3086
3087 /* unbound bucket - see if still needs servicing */
3088 if (thread == THREAD_NULL) {
3089 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
3090 assert(TAILQ_EMPTY(suppressq));
3091 } else {
3092 assert(thread == self);
3093 }
3094
3095 } else {
3096 assert(thread == self);
3097 assert(ut->uu_kqueue_bound == qos_index);
3098 assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0);
3099 }
3100
3101 kqwq_req_unlock(kqwq);
3102
3103 /* Any events queued before we put suppressed ones back? */
3104 queued_events = !kqueue_queue_empty(kq, qos_index);
3105
3106 /*
3107 * Return suppressed knotes to their original state.
3108 * For workq kqueues, suppressed ones that are still
3109 * truly active (not just forced into the queue) will
3110 * set flags we check below to see if anything got
3111 * woken up.
3112 */
3113 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
3114 assert(kn->kn_status & KN_SUPPRESSED);
3115 knote_unsuppress(kn);
3116 }
3117
3118 kqwq_req_lock(kqwq);
3119
3120 /* Determine if wakeup-type events were pended during servicing */
3121 pended = (kqr->kqr_state & (KQWQ_HOOKCALLED | KQWQ_WAKEUP));
3122
3123 /* unbind the thread */
3124 kqworkq_unbind_thread(kqwq, qos_index, self, flags);
3125
3126 /* Indicate that we are done processing */
3127 kqr->kqr_state &= ~(KQWQ_PROCESSING | \
3128 KQWQ_THREQUESTED | KQWQ_THMANAGER);
3129
3130 /*
3131 * request a new thread if events have happened
3132 * (not just putting stay-active events back).
3133 */
3134 if ((queued_events || pended) &&
3135 !kqueue_queue_empty(kq, qos_index)) {
3136 kqworkq_request_thread(kqwq, qos_index);
3137 }
3138
3139 kqwq_req_unlock(kqwq);
3140 }
3141
3142 /*
3143 * Called with kqueue lock held.
3144 */
3145 static void
3146 kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
3147 {
3148 struct knote *kn;
3149 struct kqtailq *suppressq;
3150 int procwait;
3151
3152 if (kq->kq_state & KQ_WORKQ) {
3153 kqworkq_end_processing((struct kqworkq *)kq, qos_index, flags);
3154 return;
3155 }
3156
3157 assert(qos_index == QOS_INDEX_KQFILE);
3158
3159 /*
3160 * Return suppressed knotes to their original state.
3161 * For workq kqueues, suppressed ones that are still
3162 * truly active (not just forced into the queue) will
3163 * set flags we check below to see if anything got
3164 * woken up.
3165 */
3166 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3167 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
3168 assert(kn->kn_status & KN_SUPPRESSED);
3169 knote_unsuppress(kn);
3170 }
3171
3172 procwait = (kq->kq_state & KQ_PROCWAIT);
3173 kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
3174
3175 if (procwait) {
3176 /* first wake up any thread already waiting to process */
3177 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
3178 CAST_EVENT64_T(suppressq),
3179 THREAD_AWAKENED,
3180 WAITQ_ALL_PRIORITIES);
3181 }
3182 }
3183
3184 /*
3185 * kevent_qos_internal_bind - bind thread to processing kqueue
3186 *
3187 * Indicates that the provided thread will be responsible for
3188 * servicing the particular QoS class index specified in the
3189 * parameters. Once the binding is done, any overrides that may
3190 * be associated with the corresponding events can be applied.
3191 *
3192 * This should be called as soon as the thread identity is known,
3193 * preferably while still at high priority during creation.
3194 *
3195 * - caller holds a reference on the kqueue.
3196 * - the thread MUST call kevent_qos_internal after being bound
3197 * or the bucket of events may never be delivered.
3198 * - Nothing locked (may take mutex or block).
3199 */
3200
3201 int
3202 kevent_qos_internal_bind(
3203 struct proc *p,
3204 int qos_class,
3205 thread_t thread,
3206 unsigned int flags)
3207 {
3208 struct fileproc *fp = NULL;
3209 struct kqueue *kq = NULL;
3210 struct kqworkq *kqwq;
3211 struct kqrequest *kqr;
3212 struct uthread *ut;
3213 kq_index_t qos_index;
3214 int res = 0;
3215
3216 assert(thread != THREAD_NULL);
3217 assert(flags & KEVENT_FLAG_WORKQ);
3218
3219 if (thread == THREAD_NULL ||
3220 (flags & KEVENT_FLAG_WORKQ) == 0) {
3221 return EINVAL;
3222 }
3223
3224 ut = get_bsdthread_info(thread);
3225
3226 /* find the kqueue */
3227 res = kevent_get_kq(p, -1, flags, &fp, &kq);
3228 assert(fp == NULL);
3229 if (res)
3230 return res;
3231
3232 /* get the qos index we're going to service */
3233 qos_index = qos_index_for_servicer(qos_class, thread, flags);
3234
3235 /* No need to bind the manager thread to any bucket */
3236 if (qos_index == KQWQ_QOS_MANAGER) {
3237 assert(ut->uu_kqueue_bound == 0);
3238 ut->uu_kqueue_bound = qos_index;
3239 ut->uu_kqueue_flags = flags;
3240 return 0;
3241 }
3242
3243 kqlock(kq);
3244 assert(kq->kq_state & KQ_WORKQ);
3245
3246 kqwq = (struct kqworkq *)kq;
3247 kqr = kqworkq_get_request(kqwq, qos_index);
3248
3249 kqwq_req_lock(kqwq);
3250
3251 /*
3252 * A (non-emergency) request should have been made
3253 * and nobody should already be servicing this bucket.
3254 */
3255 assert(kqr->kqr_state & KQWQ_THREQUESTED);
3256 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
3257 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
3258
3259 /* Is this an extraneous bind? */
3260 if (thread == kqr->kqr_thread) {
3261 assert(ut->uu_kqueue_bound == qos_index);
3262 goto out;
3263 }
3264
3265 /* nobody else bound and we're not bound elsewhere */
3266 assert(ut->uu_kqueue_bound == 0);
3267 assert(ut->uu_kqueue_flags == 0);
3268 assert(kqr->kqr_thread == THREAD_NULL);
3269
3270 /* Don't bind if there is a conflict */
3271 if (kqr->kqr_thread != THREAD_NULL ||
3272 (kqr->kqr_state & KQWQ_THMANAGER)) {
3273 res = EINPROGRESS;
3274 goto out;
3275 }
3276
3277 /* finally bind the thread */
3278 kqr->kqr_thread = thread;
3279 ut->uu_kqueue_bound = qos_index;
3280 ut->uu_kqueue_flags = flags;
3281
3282 /* add any pending overrides to the thread */
3283 if (kqr->kqr_override_delta) {
3284 thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta);
3285 }
3286
3287 out:
3288 kqwq_req_unlock(kqwq);
3289 kqunlock(kq);
3290
3291 return res;
3292 }
3293
3294 /*
3295 * kevent_qos_internal_unbind - unbind thread from processing kqueue
3296 *
3297 * End processing the per-QoS bucket of events and allow other threads
3298 * to be requested for future servicing.
3299 *
3300 * caller holds a reference on the kqueue.
3301 * thread is the current thread.
3302 */
3303
3304 int
3305 kevent_qos_internal_unbind(
3306 struct proc *p,
3307 int qos_class,
3308 thread_t thread,
3309 unsigned int flags)
3310 {
3311 struct kqueue *kq;
3312 struct uthread *ut;
3313 struct fileproc *fp = NULL;
3314 kq_index_t qos_index;
3315 kq_index_t end_index;
3316 int res;
3317
3318 assert(flags & KEVENT_FLAG_WORKQ);
3319 assert(thread == current_thread());
3320
3321 if (thread == THREAD_NULL ||
3322 (flags & KEVENT_FLAG_WORKQ) == 0)
3323 return EINVAL;
3324
3325 /* get the kq */
3326 res = kevent_get_kq(p, -1, flags, &fp, &kq);
3327 assert(fp == NULL);
3328 if (res)
3329 return res;
3330
3331 assert(kq->kq_state & KQ_WORKQ);
3332
3333 /* get the index we have been servicing */
3334 qos_index = qos_index_for_servicer(qos_class, thread, flags);
3335
3336 ut = get_bsdthread_info(thread);
3337
3338 /* early out if we were already unbound - or never bound */
3339 if (ut->uu_kqueue_bound != qos_index) {
3340 __assert_only struct kqworkq *kqwq = (struct kqworkq *)kq;
3341 __assert_only struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
3342
3343 assert(ut->uu_kqueue_bound == 0);
3344 assert(ut->uu_kqueue_flags == 0);
3345 assert(kqr->kqr_thread != thread);
3346 return EALREADY;
3347 }
3348
3349 /* unbind from all the buckets we might own */
3350 end_index = (qos_index == KQWQ_QOS_MANAGER) ?
3351 0 : qos_index;
3352 kqlock(kq);
3353 do {
3354 kqueue_end_processing(kq, qos_index, flags);
3355 } while (qos_index-- > end_index);
3356 kqunlock(kq);
3357
3358 /* indicate that we are done processing in the uthread */
3359 ut->uu_kqueue_bound = 0;
3360 ut->uu_kqueue_flags = 0;
3361
3362 return 0;
3363 }
3364
3365 /*
3366 * kqueue_process - process the triggered events in a kqueue
3367 *
3368 * Walk the queued knotes and validate that they are
3369 * really still triggered events by calling the filter
3370 * routines (if necessary). Hold a use reference on
3371 * the knote to avoid it being detached. For each event
3372 * that is still considered triggered, invoke the
3373 * callback routine provided.
3374 *
3375 * caller holds a reference on the kqueue.
3376 * kqueue locked on entry and exit - but may be dropped
3377 * kqueue list locked (held for duration of call)
3378 */
3379
3380 static int
3381 kqueue_process(struct kqueue *kq,
3382 kevent_callback_t callback,
3383 void *callback_data,
3384 struct filt_process_s *process_data,
3385 kq_index_t servicer_qos_index,
3386 int *countp,
3387 struct proc *p)
3388 {
3389 unsigned int flags = process_data ? process_data->fp_flags : 0;
3390 kq_index_t start_index, end_index, i;
3391 struct knote *kn;
3392 int nevents = 0;
3393 int error = 0;
3394
3395 /*
3396 * Based on the native QoS of the servicer,
3397 * determine the range of QoSes that need checking
3398 */
3399 start_index = servicer_qos_index;
3400 end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index;
3401
3402 i = start_index;
3403
3404 do {
3405 if (kqueue_begin_processing(kq, i, flags) == -1) {
3406 *countp = 0;
3407 /* Nothing to process */
3408 continue;
3409 }
3410
3411 /*
3412 * loop through the enqueued knotes, processing each one and
3413 * revalidating those that need it. As they are processed,
3414 * they get moved to the inprocess queue (so the loop can end).
3415 */
3416 error = 0;
3417
3418 struct kqtailq *base_queue = kqueue_get_base_queue(kq, i);
3419 struct kqtailq *queue = kqueue_get_high_queue(kq, i);
3420 do {
3421 while (error == 0 &&
3422 (kn = TAILQ_FIRST(queue)) != NULL) {
3423 /* Process the knote */
3424 error = knote_process(kn, callback, callback_data, process_data, p);
3425 if (error == EJUSTRETURN)
3426 error = 0;
3427 else
3428 nevents++;
3429
3430 /* break out if no more space for additional events */
3431 if (error == EWOULDBLOCK) {
3432 if ((kq->kq_state & KQ_WORKQ) == 0)
3433 kqueue_end_processing(kq, i, flags);
3434 error = 0;
3435 goto out;
3436 }
3437 }
3438 } while (error == 0 && queue-- > base_queue);
3439
3440 /* let somebody else process events if we're not in workq mode */
3441 if ((kq->kq_state & KQ_WORKQ) == 0)
3442 kqueue_end_processing(kq, i, flags);
3443
3444 } while (i-- > end_index);
3445
3446 out:
3447 *countp = nevents;
3448 return (error);
3449 }
3450
3451 static void
3452 kqueue_scan_continue(void *data, wait_result_t wait_result)
3453 {
3454 thread_t self = current_thread();
3455 uthread_t ut = (uthread_t)get_bsdthread_info(self);
3456 struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
3457 struct kqueue *kq = (struct kqueue *)data;
3458 struct filt_process_s *process_data = cont_args->process_data;
3459 int error;
3460 int count;
3461
3462 /* convert the (previous) wait_result to a proper error */
3463 switch (wait_result) {
3464 case THREAD_AWAKENED: {
3465 kqlock(kq);
3466 retry:
3467 error = kqueue_process(kq, cont_args->call, cont_args->data,
3468 process_data, cont_args->servicer_qos_index,
3469 &count, current_proc());
3470 if (error == 0 && count == 0) {
3471 if (kq->kq_state & KQ_WAKEUP)
3472 goto retry;
3473 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
3474 KQ_EVENT, THREAD_ABORTSAFE,
3475 cont_args->deadline);
3476 kq->kq_state |= KQ_SLEEP;
3477 kqunlock(kq);
3478 thread_block_parameter(kqueue_scan_continue, kq);
3479 /* NOTREACHED */
3480 }
3481 kqunlock(kq);
3482 } break;
3483 case THREAD_TIMED_OUT:
3484 error = EWOULDBLOCK;
3485 break;
3486 case THREAD_INTERRUPTED:
3487 error = EINTR;
3488 break;
3489 case THREAD_RESTART:
3490 error = EBADF;
3491 break;
3492 default:
3493 panic("%s: - invalid wait_result (%d)", __func__,
3494 wait_result);
3495 error = 0;
3496 }
3497
3498 /* call the continuation with the results */
3499 assert(cont_args->cont != NULL);
3500 (cont_args->cont)(kq, cont_args->data, error);
3501 }
3502
3503
3504 /*
3505 * kqueue_scan - scan and wait for events in a kqueue
3506 *
3507 * Process the triggered events in a kqueue.
3508 *
3509 * If there are no events triggered, arrange to
3510 * wait for them. If the caller provided a
3511 * continuation routine, then kqueue_scan does not
3512 * return after blocking; the continuation is invoked instead.
3513 *
3514 * The callback routine must be valid.
3515 * The caller must hold a use-count reference on the kq.
3516 */
3517
3518 int
3519 kqueue_scan(struct kqueue *kq,
3520 kevent_callback_t callback,
3521 kqueue_continue_t continuation,
3522 void *callback_data,
3523 struct filt_process_s *process_data,
3524 struct timeval *atvp,
3525 struct proc *p)
3526 {
3527 thread_continue_t cont = THREAD_CONTINUE_NULL;
3528 kq_index_t servicer_qos_index;
3529 unsigned int flags;
3530 uint64_t deadline;
3531 int error;
3532 int first;
3533 int fd;
3534
3535 assert(callback != NULL);
3536
3537 /*
3538 * Determine which QoS index we are servicing
3539 */
3540 flags = (process_data) ? process_data->fp_flags : 0;
3541 fd = (process_data) ? process_data->fp_fd : -1;
3542 servicer_qos_index = (kq->kq_state & KQ_WORKQ) ?
3543 qos_index_for_servicer(fd, current_thread(), flags) :
3544 QOS_INDEX_KQFILE;
3545
3546 first = 1;
3547 for (;;) {
3548 wait_result_t wait_result;
3549 int count;
3550
3551 /*
3552 * Make a pass through the kq to find events already
3553 * triggered.
3554 */
3555 kqlock(kq);
3556 error = kqueue_process(kq, callback, callback_data,
3557 process_data, servicer_qos_index,
3558 &count, p);
3559 if (error || count)
3560 break; /* lock still held */
3561
3562 /* looks like we have to consider blocking */
3563 if (first) {
3564 first = 0;
3565 /* convert the timeout to a deadline once */
3566 if (atvp->tv_sec || atvp->tv_usec) {
3567 uint64_t now;
3568
3569 clock_get_uptime(&now);
3570 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
3571 atvp->tv_usec * (long)NSEC_PER_USEC,
3572 &deadline);
3573 if (now >= deadline) {
3574 /* non-blocking call */
3575 error = EWOULDBLOCK;
3576 break; /* lock still held */
3577 }
3578 deadline -= now;
3579 clock_absolutetime_interval_to_deadline(deadline, &deadline);
3580 } else {
3581 deadline = 0; /* block forever */
3582 }
3583
3584 if (continuation) {
3585 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
3586 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
3587
3588 cont_args->call = callback;
3589 cont_args->cont = continuation;
3590 cont_args->deadline = deadline;
3591 cont_args->data = callback_data;
3592 cont_args->process_data = process_data;
3593 cont_args->servicer_qos_index = servicer_qos_index;
3594 cont = kqueue_scan_continue;
3595 }
3596 }
3597
3598 /* If awakened during processing, try again */
3599 if (kq->kq_state & KQ_WAKEUP) {
3600 kqunlock(kq);
3601 continue;
3602 }
3603
3604 /* go ahead and wait */
3605 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
3606 KQ_EVENT, THREAD_ABORTSAFE,
3607 TIMEOUT_URGENCY_USER_NORMAL,
3608 deadline, TIMEOUT_NO_LEEWAY);
3609 kq->kq_state |= KQ_SLEEP;
3610 kqunlock(kq);
3611 wait_result = thread_block_parameter(cont, kq);
3612 /* NOTREACHED if (continuation != NULL) */
3613
3614 switch (wait_result) {
3615 case THREAD_AWAKENED:
3616 continue;
3617 case THREAD_TIMED_OUT:
3618 return EWOULDBLOCK;
3619 case THREAD_INTERRUPTED:
3620 return EINTR;
3621 case THREAD_RESTART:
3622 return EBADF;
3623 default:
3624 panic("%s: - bad wait_result (%d)", __func__,
3625 wait_result);
3626 error = 0;
3627 }
3628 }
3629 kqunlock(kq);
3630 return (error);
3631 }
3632
3633
3634 /*
3635 * XXX
3636 * This could be expanded to call kqueue_scan, if desired.
3637 */
3638 /*ARGSUSED*/
3639 static int
3640 kqueue_read(__unused struct fileproc *fp,
3641 __unused struct uio *uio,
3642 __unused int flags,
3643 __unused vfs_context_t ctx)
3644 {
3645 return (ENXIO);
3646 }
3647
3648 /*ARGSUSED*/
3649 static int
3650 kqueue_write(__unused struct fileproc *fp,
3651 __unused struct uio *uio,
3652 __unused int flags,
3653 __unused vfs_context_t ctx)
3654 {
3655 return (ENXIO);
3656 }
3657
3658 /*ARGSUSED*/
3659 static int
3660 kqueue_ioctl(__unused struct fileproc *fp,
3661 __unused u_long com,
3662 __unused caddr_t data,
3663 __unused vfs_context_t ctx)
3664 {
3665 return (ENOTTY);
3666 }
3667
3668 /*ARGSUSED*/
3669 static int
3670 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
3671 __unused vfs_context_t ctx)
3672 {
3673 struct kqueue *kq = (struct kqueue *)fp->f_data;
3674 struct kqtailq *queue;
3675 struct kqtailq *suppressq;
3676 struct knote *kn;
3677 int retnum = 0;
3678
3679 if (which != FREAD)
3680 return (0);
3681
3682 kqlock(kq);
3683
3684 assert((kq->kq_state & KQ_WORKQ) == 0);
3685
3686 /*
3687 * If this is the first pass, link the wait queue associated with
3688 * the kqueue onto the wait queue set for the select(). Normally we
3689 * use selrecord() for this, but it uses the wait queue within the
3690 * selinfo structure and we need to use the main one for the kqueue to
3691 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
3692 * (The select() call will unlink them when it ends).
3693 */
3694 if (wq_link_id != NULL) {
3695 thread_t cur_act = current_thread();
3696 struct uthread * ut = get_bsdthread_info(cur_act);
3697
3698 kq->kq_state |= KQ_SEL;
3699 waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
3700 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
3701
3702 /* always consume the reserved link object */
3703 waitq_link_release(*(uint64_t *)wq_link_id);
3704 *(uint64_t *)wq_link_id = 0;
3705
3706 /*
3707 * selprocess() is expecting that we send it back the waitq
3708 * that was just added to the thread's waitq set. In order
3709 * to not change the selrecord() API (which is exported to
3710 * kexts), we pass this value back through the
3711 * void *wq_link_id pointer we were passed. We need to use
3712 * memcpy here because the pointer may not be properly aligned
3713 * on 32-bit systems.
3714 */
3715 void *wqptr = &kq->kq_wqs;
3716 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
3717 }
3718
3719 if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) {
3720 kqunlock(kq);
3721 return (0);
3722 }
3723
3724 queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE);
3725 if (!TAILQ_EMPTY(queue)) {
3726 /*
3727 * there is something queued - but it might be a
3728 * KN_STAYACTIVE knote, which may or may not have
3729 * any events pending. Otherwise, we have to walk
3730 * the list of knotes to see, and peek at the
3731 * (non-vanished) stay-active ones to be really sure.
3732 */
3733 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
3734 if (kn->kn_status & KN_ACTIVE) {
3735 retnum = 1;
3736 goto out;
3737 }
3738 assert(kn->kn_status & KN_STAYACTIVE);
3739 knote_suppress(kn);
3740 }
3741
3742 /*
3743 * There were no regular events on the queue, so take
3744 * a deeper look at the stay-queued ones we suppressed.
3745 */
3746 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
3747 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
3748 unsigned peek = 1;
3749
3750 /* If it didn't vanish while suppressed - peek at it */
3751 if (kqlock2knoteuse(kq, kn)) {
3752
3753 peek = knote_fops(kn)->f_peek(kn);
3754
3755 /* if it dropped while getting lock - move on */
3756 if (!knoteuse2kqlock(kq, kn, 0))
3757 continue;
3758 }
3759
3760 /* unsuppress it */
3761 knote_unsuppress(kn);
3762
3763 /* has data or it has to report a vanish */
3764 if (peek > 0) {
3765 retnum = 1;
3766 goto out;
3767 }
3768 }
3769 }
3770
3771 out:
3772 kqueue_end_processing(kq, QOS_INDEX_KQFILE, 0);
3773 kqunlock(kq);
3774 return (retnum);
3775 }
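/*
 * Illustrative sketch (comment only): because kqueue_select() reports
 * FREAD readiness when events are pending, a kqueue descriptor can be
 * multiplexed with select(2) alongside ordinary descriptors. kq and tv
 * are hypothetical.
 *
 *	fd_set rfds;
 *
 *	FD_ZERO(&rfds);
 *	FD_SET(kq, &rfds);
 *	if (select(kq + 1, &rfds, NULL, NULL, &tv) > 0 && FD_ISSET(kq, &rfds)) {
 *		// drain with a zero-timeout kevent() call
 *	}
 */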
3776
3777 /*
3778 * kqueue_close -
3779 */
3780 /*ARGSUSED*/
3781 static int
3782 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
3783 {
3784 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
3785
3786 assert((kqf->kqf_state & KQ_WORKQ) == 0);
3787 kqueue_dealloc(&kqf->kqf_kqueue);
3788 fg->fg_data = NULL;
3789 return (0);
3790 }
3791
3792 /*ARGSUSED*/
3793 /*
3794 * The caller has taken a use-count reference on this kqueue and will donate it
3795 * to the kqueue we are being added to. This keeps the kqueue from closing until
3796 * that relationship is torn down.
3797 */
3798 static int
3799 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
3800 {
3801 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
3802 struct kqueue *kq = &kqf->kqf_kqueue;
3803 struct kqueue *parentkq = knote_get_kq(kn);
3804
3805 assert((kqf->kqf_state & KQ_WORKQ) == 0);
3806
3807 if (parentkq == kq ||
3808 kn->kn_filter != EVFILT_READ) {
3809 kn->kn_flags = EV_ERROR;
3810 kn->kn_data = EINVAL;
3811 return 0;
3812 }
3813
3814 /*
3815 * We have to avoid creating a cycle when nesting kqueues
3816 * inside another. Rather than trying to walk the whole
3817 * potential DAG of nested kqueues, we just use a simple
3818 * ceiling protocol. When a kqueue is inserted into another,
3819 * we check that the (future) parent is not already nested
3820 * into another kqueue at a lower level than the potential
3821 * child (because it could indicate a cycle). If that test
3822 * passes, we just mark the nesting levels accordingly.
3823 */
3824
3825 kqlock(parentkq);
3826 if (parentkq->kq_level > 0 &&
3827 parentkq->kq_level < kq->kq_level)
3828 {
3829 kqunlock(parentkq);
3830 kn->kn_flags = EV_ERROR;
3831 kn->kn_data = EINVAL;
3832 return 0;
3833 } else {
3834 /* set parent level appropriately */
3835 if (parentkq->kq_level == 0)
3836 parentkq->kq_level = 2;
3837 if (parentkq->kq_level < kq->kq_level + 1)
3838 parentkq->kq_level = kq->kq_level + 1;
3839 kqunlock(parentkq);
3840
3841 kn->kn_filtid = EVFILTID_KQREAD;
3842 kqlock(kq);
3843 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
3844 /* indicate nesting in child, if needed */
3845 if (kq->kq_level == 0)
3846 kq->kq_level = 1;
3847
3848 int count = kq->kq_count;
3849 kqunlock(kq);
3850 return (count > 0);
3851 }
3852 }
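/*
 * Illustrative sketch (comment only) of the nesting this filter enables:
 * registering a child kqueue with EVFILT_READ on a parent kqueue makes
 * the parent report readability whenever the child has pending events.
 *
 *	int parent = kqueue();
 *	int child  = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, child, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(parent, &kev, 1, NULL, 0, NULL);
 *	// a later kevent(parent, ...) wait fires when the child has events queued
 */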
3853
3854 /*
3855 * kqueue_drain - called when kq is closed
3856 */
3857 /*ARGSUSED*/
3858 static int
3859 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
3860 {
3861 struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
3862
3863 assert((kq->kq_state & KQ_WORKQ) == 0);
3864
3865 kqlock(kq);
3866 kq->kq_state |= KQ_DRAIN;
3867 kqueue_interrupt(kq);
3868 kqunlock(kq);
3869 return (0);
3870 }
3871
3872 /*ARGSUSED*/
3873 int
3874 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
3875 {
3876 assert((kq->kq_state & KQ_WORKQ) == 0);
3877
3878 kqlock(kq);
3879 if (isstat64 != 0) {
3880 struct stat64 *sb64 = (struct stat64 *)ub;
3881
3882 bzero((void *)sb64, sizeof(*sb64));
3883 sb64->st_size = kq->kq_count;
3884 if (kq->kq_state & KQ_KEV_QOS)
3885 sb64->st_blksize = sizeof(struct kevent_qos_s);
3886 else if (kq->kq_state & KQ_KEV64)
3887 sb64->st_blksize = sizeof(struct kevent64_s);
3888 else if (IS_64BIT_PROCESS(p))
3889 sb64->st_blksize = sizeof(struct user64_kevent);
3890 else
3891 sb64->st_blksize = sizeof(struct user32_kevent);
3892 sb64->st_mode = S_IFIFO;
3893 } else {
3894 struct stat *sb = (struct stat *)ub;
3895
3896 bzero((void *)sb, sizeof(*sb));
3897 sb->st_size = kq->kq_count;
3898 if (kq->kq_state & KQ_KEV_QOS)
3899 sb->st_blksize = sizeof(struct kevent_qos_s);
3900 else if (kq->kq_state & KQ_KEV64)
3901 sb->st_blksize = sizeof(struct kevent64_s);
3902 else if (IS_64BIT_PROCESS(p))
3903 sb->st_blksize = sizeof(struct user64_kevent);
3904 else
3905 sb->st_blksize = sizeof(struct user32_kevent);
3906 sb->st_mode = S_IFIFO;
3907 }
3908 kqunlock(kq);
3909 return (0);
3910 }
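/*
 * Minimal userspace sketch (illustrative only): because kqueue_stat() backs
 * fstat() on a kqueue descriptor, the count of pending events and the size
 * of the kevent structure currently in use come back as st_size and
 * st_blksize respectively.
 *
 *	struct stat sb;
 *	int kq = kqueue();
 *	if (fstat(kq, &sb) == 0)
 *		printf("pending=%lld blksize=%d\n",
 *		    (long long)sb.st_size, (int)sb.st_blksize);
 */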
3911
3912
3913 /*
3914 * Interact with the pthread kext to request a servicing there.
3915 * Eventually, this will request threads at specific QoS levels.
3916 * For now, it only requests a dispatch-manager-QoS thread, and
3917 * only one-at-a-time.
3918 *
3919 * - Caller holds the workq request lock
3920 *
3921 * - May be called with the kqueue's wait queue set locked,
3922 * so cannot do anything that could recurse on that.
3923 */
3924 static void
3925 kqworkq_request_thread(
3926 struct kqworkq *kqwq,
3927 kq_index_t qos_index)
3928 {
3929 struct kqrequest *kqr;
3930
3931 assert(kqwq->kqwq_state & KQ_WORKQ);
3932 assert(qos_index < KQWQ_NQOS);
3933
3934 kqr = kqworkq_get_request(kqwq, qos_index);
3935
3936 /*
3937 * If we have already requested a thread, and it hasn't
3938 * started processing yet, there's no use hammering away
3939 * on the pthread kext.
3940 */
3941 if (kqr->kqr_state & KQWQ_THREQUESTED)
3942 return;
3943
3944 assert(kqr->kqr_thread == THREAD_NULL);
3945
3946 /* request additional workq threads if appropriate */
3947 if (pthread_functions != NULL &&
3948 pthread_functions->workq_reqthreads != NULL) {
3949 unsigned int flags = KEVENT_FLAG_WORKQ;
3950
3951 /* Compute a priority based on qos_index. */
3952 struct workq_reqthreads_req_s request = {
3953 .priority = qos_from_qos_index(qos_index),
3954 .count = 1
3955 };
3956
3957 thread_t wqthread;
3958 wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request);
3959 kqr->kqr_state |= KQWQ_THREQUESTED;
3960
3961 /* Have we been switched to the emergency/manager thread? */
3962 if (wqthread == (thread_t)-1) {
3963 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3964 wqthread = THREAD_NULL;
3965 } else if (qos_index == KQWQ_QOS_MANAGER)
3966 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3967
3968 /* bind the thread */
3969 kqworkq_bind_thread(kqwq, qos_index, wqthread, flags);
3970 }
3971 }
3972
3973 /*
3974 * If we aren't already busy processing events [for this QoS],
3975 * request workq thread support as appropriate.
3976 *
3977 * TBD - for now, we don't segregate out processing by QoS.
3978 *
3979 * - May be called with the kqueue's wait queue set locked,
3980 * so cannot do anything that could recurse on that.
3981 */
3982 static void
3983 kqworkq_request_help(
3984 struct kqworkq *kqwq,
3985 kq_index_t qos_index,
3986 uint32_t type)
3987 {
3988 struct kqrequest *kqr;
3989
3990 /* convert to thread qos value */
3991 assert(qos_index < KQWQ_NQOS);
3992
3993 kqwq_req_lock(kqwq);
3994 kqr = kqworkq_get_request(kqwq, qos_index);
3995
3996 /*
3997 * If someone is processing the queue, just mark what type
3998 * of attempt this was (from a kq wakeup or from a waitq hook).
3999 * They'll be noticed at the end of servicing and a new thread
4000 * will be requested at that point.
4001 */
4002 if (kqr->kqr_state & KQWQ_PROCESSING) {
4003 kqr->kqr_state |= type;
4004 kqwq_req_unlock(kqwq);
4005 return;
4006 }
4007
4008 kqworkq_request_thread(kqwq, qos_index);
4009 kqwq_req_unlock(kqwq);
4010 }
4011
4012 /*
4013 * These arrays describe the low and high qindexes for a given qos_index.
4014 * The values come from the chart in <sys/eventvar.h> (must stay in sync).
4015 */
4016 static kq_index_t _kq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21};
4017 static kq_index_t _kq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21};
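/*
 * Worked example from the tables above: for qos_index 3 the base queue is
 * kq_queue[11] and the highest allowed slot is kq_queue[14].  A knote at
 * qos_index 3 whose QoS override index is 5 is therefore queued at
 * 11 + (5 - 3) = 13, which still falls within the [11, 14] range that
 * knote_get_queue_index() asserts below.
 */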
4018
4019 static struct kqtailq *
4020 kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index)
4021 {
4022 assert(qos_index < KQWQ_NQOS);
4023 return &kq->kq_queue[_kq_base_index[qos_index]];
4024 }
4025
4026 static struct kqtailq *
4027 kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index)
4028 {
4029 assert(qos_index < KQWQ_NQOS);
4030 return &kq->kq_queue[_kq_high_index[qos_index]];
4031 }
4032
4033 static int
4034 kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
4035 {
4036 struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index);
4037 struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index);
4038
4039 do {
4040 if (!TAILQ_EMPTY(queue))
4041 return 0;
4042 } while (queue-- > base_queue);
4043 return 1;
4044 }
4045
4046 static struct kqtailq *
4047 kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index)
4048 {
4049 if (kq->kq_state & KQ_WORKQ) {
4050 struct kqworkq *kqwq = (struct kqworkq *)kq;
4051 struct kqrequest *kqr;
4052
4053 kqr = kqworkq_get_request(kqwq, qos_index);
4054 return &kqr->kqr_suppressed;
4055 } else {
4056 struct kqfile *kqf = (struct kqfile *)kq;
4057 return &kqf->kqf_suppressed;
4058 }
4059 }
4060
4061 static kq_index_t
4062 knote_get_queue_index(struct knote *kn)
4063 {
4064 kq_index_t override_index = knote_get_qos_override_index(kn);
4065 kq_index_t qos_index = knote_get_qos_index(kn);
4066 struct kqueue *kq = knote_get_kq(kn);
4067 kq_index_t res;
4068
4069 if ((kq->kq_state & KQ_WORKQ) == 0) {
4070 assert(qos_index == 0);
4071 assert(override_index == 0);
4072 }
4073 res = _kq_base_index[qos_index];
4074 if (override_index > qos_index)
4075 res += override_index - qos_index;
4076
4077 assert(res <= _kq_high_index[qos_index]);
4078 return res;
4079 }
4080
4081 static struct kqtailq *
4082 knote_get_queue(struct knote *kn)
4083 {
4084 kq_index_t qindex = knote_get_queue_index(kn);
4085
4086 return &(knote_get_kq(kn))->kq_queue[qindex];
4087 }
4088
4089 static struct kqtailq *
4090 knote_get_suppressed_queue(struct knote *kn)
4091 {
4092 kq_index_t qos_index = knote_get_qos_index(kn);
4093 struct kqueue *kq = knote_get_kq(kn);
4094
4095 return kqueue_get_suppressed_queue(kq, qos_index);
4096 }
4097
4098 static kq_index_t
4099 knote_get_req_index(struct knote *kn)
4100 {
4101 return kn->kn_req_index;
4102 }
4103
4104 static kq_index_t
4105 knote_get_qos_index(struct knote *kn)
4106 {
4107 return kn->kn_qos_index;
4108 }
4109
4110 static void
4111 knote_set_qos_index(struct knote *kn, kq_index_t qos_index)
4112 {
4113 struct kqueue *kq = knote_get_kq(kn);
4114
4115 assert(qos_index < KQWQ_NQOS);
4116 assert((kn->kn_status & KN_QUEUED) == 0);
4117
4118 if (kq->kq_state & KQ_WORKQ)
4119 assert(qos_index > QOS_INDEX_KQFILE);
4120 else
4121 assert(qos_index == QOS_INDEX_KQFILE);
4122
4123 /* always set requested */
4124 kn->kn_req_index = qos_index;
4125
4126 /* only adjust in-use qos index when not suppressed */
4127 if ((kn->kn_status & KN_SUPPRESSED) == 0)
4128 kn->kn_qos_index = qos_index;
4129 }
4130
4131 static kq_index_t
4132 knote_get_qos_override_index(struct knote *kn)
4133 {
4134 return kn->kn_qos_override;
4135 }
4136
4137 static void
4138 knote_set_qos_override_index(struct knote *kn, kq_index_t override_index)
4139 {
4140 struct kqueue *kq = knote_get_kq(kn);
4141 kq_index_t qos_index = knote_get_qos_index(kn);
4142
4143 assert((kn->kn_status & KN_QUEUED) == 0);
4144
4145 if (override_index == KQWQ_QOS_MANAGER)
4146 assert(qos_index == KQWQ_QOS_MANAGER);
4147 else
4148 assert(override_index < KQWQ_QOS_MANAGER);
4149
4150 kn->kn_qos_override = override_index;
4151
4152 /*
4153 * If this is a workq kqueue, apply the override to the
4154 * workq servicing thread.
4155 */
4156 if (kq->kq_state & KQ_WORKQ) {
4157 struct kqworkq *kqwq = (struct kqworkq *)kq;
4158
4159 assert(qos_index > QOS_INDEX_KQFILE);
4160 kqworkq_update_override(kqwq, qos_index, override_index);
4161 }
4162 }
4163
4164 static void
4165 kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index)
4166 {
4167 struct kqrequest *kqr;
4168 kq_index_t new_delta;
4169 kq_index_t old_delta;
4170
4171 new_delta = (override_index > qos_index) ?
4172 override_index - qos_index : 0;
4173
4174 kqr = kqworkq_get_request(kqwq, qos_index);
4175
4176 kqwq_req_lock(kqwq);
4177 old_delta = kqr->kqr_override_delta;
4178
4179 if (new_delta > old_delta) {
4180 thread_t wqthread = kqr->kqr_thread;
4181
4182 /* store the new override delta */
4183 kqr->kqr_override_delta = new_delta;
4184
4185 /* apply the override to the servicing thread, if one is bound */
4186 if (wqthread) {
4187 /* only apply if non-manager */
4188 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4189 if (old_delta)
4190 thread_update_ipc_override(wqthread, override_index);
4191 else
4192 thread_add_ipc_override(wqthread, override_index);
4193 }
4194 }
4195 }
4196 kqwq_req_unlock(kqwq);
4197 }
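/*
 * Worked example of the delta bookkeeping above: a knote requested at
 * qos_index 3 whose override index rises to 5 yields new_delta = 2.  If that
 * exceeds the stored kqr_override_delta, the new delta is recorded and, when
 * a non-manager servicing thread is bound, an IPC override at index 5 is
 * added (or updated, if a smaller delta had already installed one).
 */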
4198
4199 /* called with the kqworkq lock held */
4200 static void
4201 kqworkq_bind_thread(
4202 struct kqworkq *kqwq,
4203 kq_index_t qos_index,
4204 thread_t thread,
4205 unsigned int flags)
4206 {
4207 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
4208 thread_t old_thread = kqr->kqr_thread;
4209 struct uthread *ut;
4210
4211 assert(kqr->kqr_state & KQWQ_THREQUESTED);
4212
4213 /* If no identity yet, just set flags as needed */
4214 if (thread == THREAD_NULL) {
4215 assert(old_thread == THREAD_NULL);
4216
4217 /* emergency or unidentified */
4218 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
4219 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
4220 kqr->kqr_state |= KQWQ_THMANAGER;
4221 }
4222 return;
4223 }
4224
4225 /* Known thread identity */
4226 ut = get_bsdthread_info(thread);
4227
4228 /*
4229 * If this is a manager, and the manager request bit is
4230 * not set, ensure no other thread is bound. If the bit
4231 * is set, make sure the old thread is us (or not set).
4232 */
4233 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
4234 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4235 assert(old_thread == THREAD_NULL);
4236 kqr->kqr_state |= KQWQ_THMANAGER;
4237 } else if (old_thread == THREAD_NULL) {
4238 kqr->kqr_thread = thread;
4239 ut->uu_kqueue_bound = KQWQ_QOS_MANAGER;
4240 ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ |
4241 KEVENT_FLAG_WORKQ_MANAGER);
4242 } else {
4243 assert(thread == old_thread);
4244 assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER);
4245 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
4246 }
4247 return;
4248 }
4249
4250 /* Just a normal one-queue servicing thread */
4251 assert(old_thread == THREAD_NULL);
4252 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
4253
4254 kqr->kqr_thread = thread;
4255
4256 /* apply an ipc QoS override if one is needed */
4257 if (kqr->kqr_override_delta)
4258 thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta);
4259
4260 /* indicate that we are processing in the uthread */
4261 ut->uu_kqueue_bound = qos_index;
4262 ut->uu_kqueue_flags = flags;
4263 }
4264
4265 /* called with the kqworkq lock held */
4266 static void
4267 kqworkq_unbind_thread(
4268 struct kqworkq *kqwq,
4269 kq_index_t qos_index,
4270 thread_t thread,
4271 __unused unsigned int flags)
4272 {
4273 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
4274 kq_index_t override = 0;
4275
4276 assert(thread == current_thread());
4277
4278 /*
4279 * If there is an override, drop it from the current thread
4280 * and then we are free to recompute (a potentially lower)
4281 * minimum override to apply to the next thread request.
4282 */
4283 if (kqr->kqr_override_delta) {
4284 struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index);
4285 struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index);
4286
4287 /* if not bound to a manager thread, drop the current ipc override */
4288 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4289 assert(thread == kqr->kqr_thread);
4290 thread_drop_ipc_override(thread);
4291 }
4292
4293 /* recompute the new override */
4294 do {
4295 if (!TAILQ_EMPTY(queue)) {
4296 override = queue - base_queue;
4297 break;
4298 }
4299 } while (queue-- > base_queue);
4300 }
4301
4302 /* unbind the thread and apply the new override */
4303 kqr->kqr_thread = THREAD_NULL;
4304 kqr->kqr_override_delta = override;
4305 }
4306
4307 struct kqrequest *
4308 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
4309 {
4310 assert(qos_index < KQWQ_NQOS);
4311 return &kqwq->kqwq_request[qos_index];
4312 }
4313
4314 void
4315 knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override)
4316 {
4317 if (knote_get_kq(kn)->kq_state & KQ_WORKQ) {
4318 kq_index_t new_qos_index;
4319 kq_index_t new_override_index;
4320 kq_index_t servicer_qos_index;
4321
4322 new_qos_index = qos_index_from_qos(new_qos, FALSE);
4323 new_override_index = qos_index_from_qos(new_override, TRUE);
4324
4325 /* make sure the servicer qos acts as a floor */
4326 servicer_qos_index = qos_index_from_qos(kn->kn_qos, FALSE);
4327 if (servicer_qos_index > new_qos_index)
4328 new_qos_index = servicer_qos_index;
4329 if (servicer_qos_index > new_override_index)
4330 new_override_index = servicer_qos_index;
4331
4332 kqlock(knote_get_kq(kn));
4333 if (new_qos_index != knote_get_req_index(kn) ||
4334 new_override_index != knote_get_qos_override_index(kn)) {
4335 if (kn->kn_status & KN_QUEUED) {
4336 knote_dequeue(kn);
4337 knote_set_qos_index(kn, new_qos_index);
4338 knote_set_qos_override_index(kn, new_override_index);
4339 knote_enqueue(kn);
4340 knote_wakeup(kn);
4341 } else {
4342 knote_set_qos_index(kn, new_qos_index);
4343 knote_set_qos_override_index(kn, new_override_index);
4344 }
4345 }
4346 kqunlock(knote_get_kq(kn));
4347 }
4348 }
4349
4350 static void
4351 knote_wakeup(struct knote *kn)
4352 {
4353 struct kqueue *kq = knote_get_kq(kn);
4354
4355 if (kq->kq_state & KQ_WORKQ) {
4356 /* request a servicing thread */
4357 struct kqworkq *kqwq = (struct kqworkq *)kq;
4358 kq_index_t qos_index = knote_get_qos_index(kn);
4359
4360 kqworkq_request_help(kqwq, qos_index, KQWQ_WAKEUP);
4361
4362 } else {
4363 struct kqfile *kqf = (struct kqfile *)kq;
4364
4365 /* flag wakeups during processing */
4366 if (kq->kq_state & KQ_PROCESSING)
4367 kq->kq_state |= KQ_WAKEUP;
4368
4369 /* wakeup a thread waiting on this queue */
4370 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
4371 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
4372 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4373 KQ_EVENT,
4374 THREAD_AWAKENED,
4375 WAITQ_ALL_PRIORITIES);
4376 }
4377
4378 /* wakeup other kqueues/select sets we're inside */
4379 KNOTE(&kqf->kqf_sel.si_note, 0);
4380 }
4381 }
4382
4383 /*
4384 * Called with the kqueue locked
4385 */
4386 static void
4387 kqueue_interrupt(struct kqueue *kq)
4388 {
4389 assert((kq->kq_state & KQ_WORKQ) == 0);
4390
4391 /* wakeup sleeping threads */
4392 if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
4393 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
4394 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4395 KQ_EVENT,
4396 THREAD_RESTART,
4397 WAITQ_ALL_PRIORITIES);
4398 }
4399
4400 /* wakeup threads waiting their turn to process */
4401 if (kq->kq_state & KQ_PROCWAIT) {
4402 struct kqtailq *suppressq;
4403
4404 assert(kq->kq_state & KQ_PROCESSING);
4405
4406 kq->kq_state &= ~KQ_PROCWAIT;
4407 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
4408 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4409 CAST_EVENT64_T(suppressq),
4410 THREAD_RESTART,
4411 WAITQ_ALL_PRIORITIES);
4412 }
4413 }
4414
4415 /*
4416 * Called back from waitq code when no threads are waiting and the hook was set.
4417 *
4418 * Interrupts are likely disabled and spin locks are held - minimal work
4419 * can be done in this context!!!
4420 *
4421 * JMM - in the future, this will try to determine which knotes match the
4422 * wait queue wakeup and apply these wakeups against those knotes themselves.
4423 * For now, all the events dispatched this way are dispatch-manager handled,
4424 * so hard-code that here.
4425 */
4426 void
4427 waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
4428 {
4429 #pragma unused(knote_hook, qos)
4430
4431 struct kqworkq *kqwq = (struct kqworkq *)kq_hook;
4432
4433 assert(kqwq->kqwq_state & KQ_WORKQ);
4434 kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER, KQWQ_HOOKCALLED);
4435 }
4436
4437 void
4438 klist_init(struct klist *list)
4439 {
4440 SLIST_INIT(list);
4441 }
4442
4443
4444 /*
4445 * Query/Post each knote in the object's list
4446 *
4447 * The object lock protects the list. It is assumed
4448 * that the filter/event routine for the object can
4449 * determine that the object is already locked (via
4450 * the hint) and not deadlock itself.
4451 *
4452 * The object lock should also hold off pending
4453 * detach/drop operations. But we'll prevent them here
4454 * too (by taking a use reference) - just in case.
4455 */
4456 void
4457 knote(struct klist *list, long hint)
4458 {
4459 struct knote *kn;
4460
4461 SLIST_FOREACH(kn, list, kn_selnext) {
4462 struct kqueue *kq = knote_get_kq(kn);
4463
4464 kqlock(kq);
4465
4466 /* If we can get a use reference - deliver event */
4467 if (kqlock2knoteuse(kq, kn)) {
4468 int result;
4469
4470 /* call the event with only a use count */
4471 result = knote_fops(kn)->f_event(kn, hint);
4472
4473 /* if it's not going away and triggered */
4474 if (knoteuse2kqlock(kq, kn, 0) && result)
4475 knote_activate(kn);
4476 /* kq lock held */
4477 }
4478 kqunlock(kq);
4479 }
4480 }
4481
4482 /*
4483 * attach a knote to the specified list. Return true if this is the first entry.
4484 * The list is protected by whatever lock the object it is associated with uses.
4485 */
4486 int
4487 knote_attach(struct klist *list, struct knote *kn)
4488 {
4489 int ret = SLIST_EMPTY(list);
4490 SLIST_INSERT_HEAD(list, kn, kn_selnext);
4491 return (ret);
4492 }
4493
4494 /*
4495 * detach a knote from the specified list. Return true if that was the last entry.
4496 * The list is protected by whatever lock the object it is associated with uses.
4497 */
4498 int
4499 knote_detach(struct klist *list, struct knote *kn)
4500 {
4501 SLIST_REMOVE(list, kn, knote, kn_selnext);
4502 return (SLIST_EMPTY(list));
4503 }
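/*
 * Illustrative sketch of the attach/detach/post pattern (my_notes, kn and
 * hint are hypothetical, and the object-lock discipline described above is
 * assumed): an event source embeds a struct klist, attaches and detaches
 * knotes from its filter's f_attach/f_detach routines, and posts events
 * with KNOTE() whenever its state changes.
 *
 *	struct klist my_notes;				// embedded in the source object
 *
 *	klist_init(&my_notes);				// at source init time
 *	int first = knote_attach(&my_notes, kn);	// in f_attach, object locked
 *	...
 *	KNOTE(&my_notes, hint);				// on state change, object locked
 *	...
 *	int last = knote_detach(&my_notes, kn);		// in f_detach, object locked
 */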
4504
4505 /*
4506 * knote_vanish - Indicate that the source has vanished
4507 *
4508 * If the knote has requested EV_VANISHED delivery,
4509 * arrange for that. Otherwise, deliver a NOTE_REVOKE
4510 * event for backward compatibility.
4511 *
4512 * The knote is marked as having vanished, but is not
4513 * actually detached from the source in this instance.
4514 * The actual detach is deferred until the knote drop.
4515 *
4516 * Our caller already has the object lock held. Calling
4517 * the detach routine would try to take that lock
4518 * recursively - which likely is not supported.
4519 */
4520 void
4521 knote_vanish(struct klist *list)
4522 {
4523 struct knote *kn;
4524 struct knote *kn_next;
4525
4526 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
4527 struct kqueue *kq = knote_get_kq(kn);
4528 int result;
4529
4530 kqlock(kq);
4531 if ((kn->kn_status & KN_DROPPING) == 0) {
4532
4533 /* If EV_VANISH supported - prepare to deliver one */
4534 if (kn->kn_status & KN_REQVANISH) {
4535 kn->kn_status |= KN_VANISHED;
4536 knote_activate(kn);
4537
4538 } else if (kqlock2knoteuse(kq, kn)) {
4539 /* call the event with only a use count */
4540 result = knote_fops(kn)->f_event(kn, NOTE_REVOKE);
4541
4542 /* if it's not going away and triggered */
4543 if (knoteuse2kqlock(kq, kn, 0) && result)
4544 knote_activate(kn);
4545 /* lock held again */
4546 }
4547 }
4548 kqunlock(kq);
4549 }
4550 }
4551
4552 /*
4553 * For a given knote, link a provided wait queue directly with the kqueue.
4554 * Wakeups will happen via recursive wait queue support. But nothing will move
4555 * the knote to the active list at wakeup (nothing calls knote()). Instead,
4556 * we permanently enqueue the knote here (by marking it stay-active).
4557 *
4558 * kqueue and knote references are held by caller.
4559 * waitq locked by caller.
4560 *
4561 * caller provides the wait queue link structure.
4562 */
4563 int
4564 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
4565 {
4566 struct kqueue *kq = knote_get_kq(kn);
4567 kern_return_t kr;
4568
4569 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
4570 if (kr == KERN_SUCCESS) {
4571 knote_markstayactive(kn);
4572 return (0);
4573 } else {
4574 return (EINVAL);
4575 }
4576 }
4577
4578 /*
4579 * Unlink the provided wait queue from the kqueue associated with a knote.
4580 * Also remove it from the magic list of directly attached knotes.
4581 *
4582 * Note that the unlink may have already happened from the other side, so
4583 * ignore any failures to unlink and just remove it from the kqueue list.
4584 *
4585 * On success, caller is responsible for the link structure
4586 */
4587 int
4588 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
4589 {
4590 struct kqueue *kq = knote_get_kq(kn);
4591 kern_return_t kr;
4592
4593 kr = waitq_unlink(wq, &kq->kq_wqs);
4594 knote_clearstayactive(kn);
4595 return ((kr != KERN_SUCCESS) ? EINVAL : 0);
4596 }
4597
4598 /*
4599 * remove all knotes referencing a specified fd
4600 *
4601 * Essentially an inlined knote_remove & knote_drop
4602 * when we know for sure that the underlying object is a file descriptor
4603 *
4604 * Entered with the proc_fd lock already held.
4605 * It returns the same way, but may drop it temporarily.
4606 */
4607 void
4608 knote_fdclose(struct proc *p, int fd, int force)
4609 {
4610 struct klist *list;
4611 struct knote *kn;
4612
4613 restart:
4614 list = &p->p_fd->fd_knlist[fd];
4615 SLIST_FOREACH(kn, list, kn_link) {
4616 struct kqueue *kq = knote_get_kq(kn);
4617
4618 kqlock(kq);
4619
4620 if (kq->kq_p != p)
4621 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
4622 __func__, kq->kq_p, p);
4623
4624 /*
4625 * If the knote supports EV_VANISHED delivery,
4626 * transition it to vanished mode (or skip over
4627 * it if already vanished).
4628 */
4629 if (!force && (kn->kn_status & KN_REQVANISH)) {
4630
4631 if ((kn->kn_status & KN_VANISHED) == 0) {
4632 proc_fdunlock(p);
4633
4634 /* get detach reference (also marks vanished) */
4635 if (kqlock2knotedetach(kq, kn)) {
4636
4637 /* detach knote and drop fp use reference */
4638 knote_fops(kn)->f_detach(kn);
4639 if (knote_fops(kn)->f_isfd)
4640 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
4641
4642 /* activate it if it's still in existence */
4643 if (knoteuse2kqlock(kq, kn, 0)) {
4644 knote_activate(kn);
4645 }
4646 kqunlock(kq);
4647 }
4648 proc_fdlock(p);
4649 goto restart;
4650 } else {
4651 kqunlock(kq);
4652 continue;
4653 }
4654 }
4655
4656 proc_fdunlock(p);
4657
4658 /*
4659 * Convert the kq lock to a drop ref.
4660 * If we get it, go ahead and drop it.
4661 * Otherwise, we waited for the blocking
4662 * condition to complete. Either way,
4663 * we dropped the fdlock so start over.
4664 */
4665 if (kqlock2knotedrop(kq, kn)) {
4666 knote_drop(kn, p);
4667 }
4668
4669 proc_fdlock(p);
4670 goto restart;
4671 }
4672 }
4673
4674 /*
4675 * knote_fdadd - Add knote to the fd table for process
4676 *
4677 * All file-based filters associate a list of knotes by file
4678 * descriptor index. All other filters hash the knote by ident.
4679 *
4680 * May have to grow the table of knote lists to cover the
4681 * file descriptor index presented.
4682 *
4683 * proc_fdlock held on entry (and exit)
4684 */
4685 static int
4686 knote_fdadd(struct knote *kn, struct proc *p)
4687 {
4688 struct filedesc *fdp = p->p_fd;
4689 struct klist *list = NULL;
4690
4691 if (! knote_fops(kn)->f_isfd) {
4692 if (fdp->fd_knhashmask == 0)
4693 fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
4694 &fdp->fd_knhashmask);
4695 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
4696 } else {
4697 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
4698 u_int size = 0;
4699
4700 if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
4701 || kn->kn_id >= (uint64_t)maxfiles)
4702 return (EINVAL);
4703
4704 /* have to grow the fd_knlist */
4705 size = fdp->fd_knlistsize;
4706 while (size <= kn->kn_id)
4707 size += KQEXTENT;
4708
4709 if (size >= (UINT_MAX/sizeof(struct klist *)))
4710 return (EINVAL);
4711
4712 MALLOC(list, struct klist *,
4713 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
4714 if (list == NULL)
4715 return (ENOMEM);
4716
4717 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
4718 fdp->fd_knlistsize * sizeof(struct klist *));
4719 bzero((caddr_t)list +
4720 fdp->fd_knlistsize * sizeof(struct klist *),
4721 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
4722 FREE(fdp->fd_knlist, M_KQUEUE);
4723 fdp->fd_knlist = list;
4724 fdp->fd_knlistsize = size;
4725 }
4726 list = &fdp->fd_knlist[kn->kn_id];
4727 }
4728 SLIST_INSERT_HEAD(list, kn, kn_link);
4729 return (0);
4730 }
4731
4732 /*
4733 * knote_fdremove - remove a knote from the fd table for process
4734 *
4735 * If the filter is file-based, remove based on fd index.
4736 * Otherwise remove from the hash based on the ident.
4737 *
4738 * proc_fdlock held on entry (and exit)
4739 */
4740 static void
4741 knote_fdremove(struct knote *kn, struct proc *p)
4742 {
4743 struct filedesc *fdp = p->p_fd;
4744 struct klist *list = NULL;
4745
4746 if (knote_fops(kn)->f_isfd) {
4747 assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
4748 list = &fdp->fd_knlist[kn->kn_id];
4749 } else {
4750 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
4751 }
4752 SLIST_REMOVE(list, kn, knote, kn_link);
4753 }
4754
4755 /*
4756 * knote_fdfind - lookup a knote in the fd table for process
4757 *
4758 * If the filter is file-based, lookup based on fd index.
4759 * Otherwise use a hash based on the ident.
4760 *
4761 * Matching is based on kq, filter, and ident. Optionally,
4762 * it may also be based on the udata field in the kevent -
4763 * allowing multiple event registrations for the file object
4764 * per kqueue.
4765 *
4766 * proc_fdlock held on entry (and exit)
4767 */
4768 static struct knote *
4769 knote_fdfind(struct kqueue *kq,
4770 struct kevent_internal_s *kev,
4771 struct proc *p)
4772 {
4773 struct filedesc *fdp = p->p_fd;
4774 struct klist *list = NULL;
4775 struct knote *kn = NULL;
4776 struct filterops *fops;
4777
4778 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
4779
4780 /*
4781 * determine where to look for the knote
4782 */
4783 if (fops->f_isfd) {
4784 /* fd-based knotes are linked off the fd table */
4785 if (kev->ident < (u_int)fdp->fd_knlistsize) {
4786 list = &fdp->fd_knlist[kev->ident];
4787 }
4788 } else if (fdp->fd_knhashmask != 0) {
4789 /* hash non-fd knotes here too */
4790 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
4791 }
4792
4793 /*
4794 * scan the selected list looking for a match
4795 */
4796 if (list != NULL) {
4797 SLIST_FOREACH(kn, list, kn_link) {
4798 if (kq == knote_get_kq(kn) &&
4799 kev->ident == kn->kn_id &&
4800 kev->filter == kn->kn_filter) {
4801 if (kev->flags & EV_UDATA_SPECIFIC) {
4802 if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
4803 kev->udata == kn->kn_udata) {
4804 break; /* matching udata-specific knote */
4805 }
4806 } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
4807 break; /* matching non-udata-specific knote */
4808 }
4809 }
4810 }
4811 }
4812 return kn;
4813 }
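/*
 * Userspace sketch (illustrative only) of the udata-specific matching above:
 * with EV_UDATA_SPECIFIC, two registrations for the same (ident, filter)
 * pair coexist on one kqueue and are told apart here by their udata values.
 *
 *	struct kevent ev[2];
 *	EV_SET(&ev[0], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_a);
 *	EV_SET(&ev[1], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_b);
 *	kevent(kq, ev, 2, NULL, 0, NULL);
 */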
4814
4815 /*
4816 * knote_drop - disconnect and drop the knote
4817 *
4818 * Called with the kqueue unlocked and holding a
4819 * "drop reference" on the knote in question.
4820 * This reference is most often acquired through a call
4821 * to kqlock2knotedrop(). But it can also be acquired
4822 * through stealing a drop reference via a call to
4823 * knoteuse2knotedrop() or during the initial attach
4824 * of the knote.
4825 *
4826 * The knote may have already been detached from
4827 * (or not yet attached to) its source object.
4828 *
4829 * should be called at spl == 0, since we don't want to hold spl
4830 * while calling fdrop and free.
4831 */
4832 static void
4833 knote_drop(struct knote *kn, __unused struct proc *ctxp)
4834 {
4835 struct kqueue *kq = knote_get_kq(kn);
4836 struct proc *p = kq->kq_p;
4837 int needswakeup;
4838
4839 /* We have to have a dropping reference on the knote */
4840 assert(kn->kn_status & KN_DROPPING);
4841
4842 /* If we are attached, disconnect from the source first */
4843 if (kn->kn_status & KN_ATTACHED) {
4844 knote_fops(kn)->f_detach(kn);
4845 }
4846
4847 proc_fdlock(p);
4848
4849 /* Remove the source from the appropriate hash */
4850 knote_fdremove(kn, p);
4851
4852 /* trade fdlock for kq lock */
4853 kqlock(kq);
4854 proc_fdunlock(p);
4855
4856 /* determine if anyone needs to know about the drop */
4857 assert((kn->kn_status & (KN_SUPPRESSED | KN_QUEUED)) == 0);
4858 needswakeup = (kn->kn_status & KN_USEWAIT);
4859 kqunlock(kq);
4860
4861 if (needswakeup)
4862 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4863 CAST_EVENT64_T(&kn->kn_status),
4864 THREAD_RESTART,
4865 WAITQ_ALL_PRIORITIES);
4866
4867 if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
4868 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
4869
4870 knote_free(kn);
4871 }
4872
4873 /* called with kqueue lock held */
4874 static void
4875 knote_activate(struct knote *kn)
4876 {
4877 if (kn->kn_status & KN_ACTIVE)
4878 return;
4879
4880 kn->kn_status |= KN_ACTIVE;
4881 if (knote_enqueue(kn))
4882 knote_wakeup(kn);
4883 }
4884
4885 /* called with kqueue lock held */
4886 static void
4887 knote_deactivate(struct knote *kn)
4888 {
4889 kn->kn_status &= ~KN_ACTIVE;
4890 if ((kn->kn_status & KN_STAYACTIVE) == 0)
4891 knote_dequeue(kn);
4892 }
4893
4894 /* called with kqueue lock held */
4895 static void
4896 knote_enable(struct knote *kn)
4897 {
4898 if ((kn->kn_status & KN_DISABLED) == 0)
4899 return;
4900
4901 kn->kn_status &= ~KN_DISABLED;
4902 if (knote_enqueue(kn))
4903 knote_wakeup(kn);
4904 }
4905
4906 /* called with kqueue lock held */
4907 static void
4908 knote_disable(struct knote *kn)
4909 {
4910 if (kn->kn_status & KN_DISABLED)
4911 return;
4912
4913 kn->kn_status |= KN_DISABLED;
4914 knote_dequeue(kn);
4915 }
4916
4917 /* called with kqueue lock held */
4918 static void
4919 knote_suppress(struct knote *kn)
4920 {
4921 struct kqtailq *suppressq;
4922
4923 if (kn->kn_status & KN_SUPPRESSED)
4924 return;
4925
4926 knote_dequeue(kn);
4927 kn->kn_status |= KN_SUPPRESSED;
4928 suppressq = knote_get_suppressed_queue(kn);
4929 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
4930 }
4931
4932 /* called with kqueue lock held */
4933 static void
4934 knote_unsuppress(struct knote *kn)
4935 {
4936 struct kqtailq *suppressq;
4937
4938 if ((kn->kn_status & KN_SUPPRESSED) == 0)
4939 return;
4940
4941 kn->kn_status &= ~KN_SUPPRESSED;
4942 suppressq = knote_get_suppressed_queue(kn);
4943 TAILQ_REMOVE(suppressq, kn, kn_tqe);
4944
4945 /* update in-use qos to equal requested qos */
4946 kn->kn_qos_index = kn->kn_req_index;
4947
4948 /* don't wakeup if unsuppressing just a stay-active knote */
4949 if (knote_enqueue(kn) &&
4950 (kn->kn_status & KN_ACTIVE))
4951 knote_wakeup(kn);
4952 }
4953
4954 /* called with kqueue lock held */
4955 static int
4956 knote_enqueue(struct knote *kn)
4957 {
4958 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
4959 (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)))
4960 return 0;
4961
4962 if ((kn->kn_status & KN_QUEUED) == 0) {
4963 struct kqtailq *queue = knote_get_queue(kn);
4964 struct kqueue *kq = knote_get_kq(kn);
4965
4966 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
4967 kn->kn_status |= KN_QUEUED;
4968 kq->kq_count++;
4969 return 1;
4970 }
4971 return ((kn->kn_status & KN_STAYACTIVE) != 0);
4972 }
4973
4974
4975 /* called with kqueue lock held */
4976 static void
4977 knote_dequeue(struct knote *kn)
4978 {
4979 struct kqueue *kq = knote_get_kq(kn);
4980 struct kqtailq *queue;
4981
4982 if ((kn->kn_status & KN_QUEUED) == 0)
4983 return;
4984
4985 queue = knote_get_queue(kn);
4986 TAILQ_REMOVE(queue, kn, kn_tqe);
4987 kn->kn_status &= ~KN_QUEUED;
4988 kq->kq_count--;
4989 }
4990
4991 void
4992 knote_init(void)
4993 {
4994 knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
4995 8192, "knote zone");
4996
4997 kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile),
4998 8192, "kqueue file zone");
4999
5000 kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq),
5001 8192, "kqueue workq zone");
5002
5003 /* allocate kq lock group attribute and group */
5004 kq_lck_grp_attr = lck_grp_attr_alloc_init();
5005
5006 kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
5007
5008 /* Allocate kq lock attribute */
5009 kq_lck_attr = lck_attr_alloc_init();
5010
5011 /* Initialize the timer filter lock */
5012 lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
5013
5014 /* Initialize the user filter lock */
5015 lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr);
5016
5017 #if CONFIG_MEMORYSTATUS
5018 /* Initialize the memorystatus list lock */
5019 memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
5020 #endif
5021 }
5022 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
5023
5024 struct filterops *
5025 knote_fops(struct knote *kn)
5026 {
5027 return sysfilt_ops[kn->kn_filtid];
5028 }
5029
5030 static struct knote *
5031 knote_alloc(void)
5032 {
5033 return ((struct knote *)zalloc(knote_zone));
5034 }
5035
5036 static void
5037 knote_free(struct knote *kn)
5038 {
5039 zfree(knote_zone, kn);
5040 }
5041
5042 #if SOCKETS
5043 #include <sys/param.h>
5044 #include <sys/socket.h>
5045 #include <sys/protosw.h>
5046 #include <sys/domain.h>
5047 #include <sys/mbuf.h>
5048 #include <sys/kern_event.h>
5049 #include <sys/malloc.h>
5050 #include <sys/sys_domain.h>
5051 #include <sys/syslog.h>
5052
5053 #ifndef ROUNDUP64
5054 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
5055 #endif
5056
5057 #ifndef ADVANCE64
5058 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
5059 #endif
5060
5061 static lck_grp_attr_t *kev_lck_grp_attr;
5062 static lck_attr_t *kev_lck_attr;
5063 static lck_grp_t *kev_lck_grp;
5064 static decl_lck_rw_data(,kev_lck_data);
5065 static lck_rw_t *kev_rwlock = &kev_lck_data;
5066
5067 static int kev_attach(struct socket *so, int proto, struct proc *p);
5068 static int kev_detach(struct socket *so);
5069 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
5070 struct ifnet *ifp, struct proc *p);
5071 static lck_mtx_t * event_getlock(struct socket *, int);
5072 static int event_lock(struct socket *, int, void *);
5073 static int event_unlock(struct socket *, int, void *);
5074
5075 static int event_sofreelastref(struct socket *);
5076 static void kev_delete(struct kern_event_pcb *);
5077
5078 static struct pr_usrreqs event_usrreqs = {
5079 .pru_attach = kev_attach,
5080 .pru_control = kev_control,
5081 .pru_detach = kev_detach,
5082 .pru_soreceive = soreceive,
5083 };
5084
5085 static struct protosw eventsw[] = {
5086 {
5087 .pr_type = SOCK_RAW,
5088 .pr_protocol = SYSPROTO_EVENT,
5089 .pr_flags = PR_ATOMIC,
5090 .pr_usrreqs = &event_usrreqs,
5091 .pr_lock = event_lock,
5092 .pr_unlock = event_unlock,
5093 .pr_getlock = event_getlock,
5094 }
5095 };
5096
5097 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
5098 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
5099
5100 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
5101 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
5102
5103 struct kevtstat kevtstat;
5104 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
5105 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5106 kevt_getstat, "S,kevtstat", "");
5107
5108 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
5109 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5110 kevt_pcblist, "S,xkevtpcb", "");
5111
5112 static lck_mtx_t *
5113 event_getlock(struct socket *so, int locktype)
5114 {
5115 #pragma unused(locktype)
5116 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
5117
5118 if (so->so_pcb != NULL) {
5119 if (so->so_usecount < 0)
5120 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
5121 so, so->so_usecount, solockhistory_nr(so));
5122 /* NOTREACHED */
5123 } else {
5124 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
5125 so, solockhistory_nr(so));
5126 /* NOTREACHED */
5127 }
5128 return (&ev_pcb->evp_mtx);
5129 }
5130
5131 static int
5132 event_lock(struct socket *so, int refcount, void *lr)
5133 {
5134 void *lr_saved;
5135
5136 if (lr == NULL)
5137 lr_saved = __builtin_return_address(0);
5138 else
5139 lr_saved = lr;
5140
5141 if (so->so_pcb != NULL) {
5142 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
5143 } else {
5144 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
5145 so, lr_saved, solockhistory_nr(so));
5146 /* NOTREACHED */
5147 }
5148
5149 if (so->so_usecount < 0) {
5150 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
5151 so, so->so_pcb, lr_saved, so->so_usecount,
5152 solockhistory_nr(so));
5153 /* NOTREACHED */
5154 }
5155
5156 if (refcount)
5157 so->so_usecount++;
5158
5159 so->lock_lr[so->next_lock_lr] = lr_saved;
5160 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
5161 return (0);
5162 }
5163
5164 static int
5165 event_unlock(struct socket *so, int refcount, void *lr)
5166 {
5167 void *lr_saved;
5168 lck_mtx_t *mutex_held;
5169
5170 if (lr == NULL)
5171 lr_saved = __builtin_return_address(0);
5172 else
5173 lr_saved = lr;
5174
5175 if (refcount)
5176 so->so_usecount--;
5177
5178 if (so->so_usecount < 0) {
5179 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
5180 so, so->so_usecount, solockhistory_nr(so));
5181 /* NOTREACHED */
5182 }
5183 if (so->so_pcb == NULL) {
5184 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
5185 so, so->so_usecount, (void *)lr_saved,
5186 solockhistory_nr(so));
5187 /* NOTREACHED */
5188 }
5189 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
5190
5191 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5192 so->unlock_lr[so->next_unlock_lr] = lr_saved;
5193 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5194
5195 if (so->so_usecount == 0) {
5196 VERIFY(so->so_flags & SOF_PCBCLEARING);
5197 event_sofreelastref(so);
5198 } else {
5199 lck_mtx_unlock(mutex_held);
5200 }
5201
5202 return (0);
5203 }
5204
5205 static int
5206 event_sofreelastref(struct socket *so)
5207 {
5208 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
5209
5210 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
5211
5212 so->so_pcb = NULL;
5213
5214 /*
5215 * Disable upcall in the event another thread is in kev_post_msg()
5216 * appending a record to the receive socket buffer, since sbwakeup()
5217 * may release the socket lock otherwise.
5218 */
5219 so->so_rcv.sb_flags &= ~SB_UPCALL;
5220 so->so_snd.sb_flags &= ~SB_UPCALL;
5221 so->so_event = sonullevent;
5222 lck_mtx_unlock(&(ev_pcb->evp_mtx));
5223
5224 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
5225 lck_rw_lock_exclusive(kev_rwlock);
5226 LIST_REMOVE(ev_pcb, evp_link);
5227 kevtstat.kes_pcbcount--;
5228 kevtstat.kes_gencnt++;
5229 lck_rw_done(kev_rwlock);
5230 kev_delete(ev_pcb);
5231
5232 sofreelastref(so, 1);
5233 return (0);
5234 }
5235
5236 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
5237
5238 static
5239 struct kern_event_head kern_event_head;
5240
5241 static u_int32_t static_event_id = 0;
5242
5243 #define EVPCB_ZONE_MAX 65536
5244 #define EVPCB_ZONE_NAME "kerneventpcb"
5245 static struct zone *ev_pcb_zone;
5246
5247 /*
5248 * Install the protosw's for the NKE manager. Invoked at extension load time
5249 */
5250 void
5251 kern_event_init(struct domain *dp)
5252 {
5253 struct protosw *pr;
5254 int i;
5255
5256 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
5257 VERIFY(dp == systemdomain);
5258
5259 kev_lck_grp_attr = lck_grp_attr_alloc_init();
5260 if (kev_lck_grp_attr == NULL) {
5261 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
5262 /* NOTREACHED */
5263 }
5264
5265 kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
5266 kev_lck_grp_attr);
5267 if (kev_lck_grp == NULL) {
5268 panic("%s: lck_grp_alloc_init failed\n", __func__);
5269 /* NOTREACHED */
5270 }
5271
5272 kev_lck_attr = lck_attr_alloc_init();
5273 if (kev_lck_attr == NULL) {
5274 panic("%s: lck_attr_alloc_init failed\n", __func__);
5275 /* NOTREACHED */
5276 }
5277
5278 lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
5279 if (kev_rwlock == NULL) {
5280 panic("%s: lck_mtx_alloc_init failed\n", __func__);
5281 /* NOTREACHED */
5282 }
5283
5284 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
5285 net_add_proto(pr, dp, 1);
5286
5287 ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
5288 EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
5289 if (ev_pcb_zone == NULL) {
5290 panic("%s: failed allocating ev_pcb_zone", __func__);
5291 /* NOTREACHED */
5292 }
5293 zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
5294 zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
5295 }
5296
5297 static int
5298 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
5299 {
5300 int error = 0;
5301 struct kern_event_pcb *ev_pcb;
5302
5303 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
5304 if (error != 0)
5305 return (error);
5306
5307 if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
5308 return (ENOBUFS);
5309 }
5310 bzero(ev_pcb, sizeof(struct kern_event_pcb));
5311 lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
5312
5313 ev_pcb->evp_socket = so;
5314 ev_pcb->evp_vendor_code_filter = 0xffffffff;
5315
5316 so->so_pcb = (caddr_t) ev_pcb;
5317 lck_rw_lock_exclusive(kev_rwlock);
5318 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
5319 kevtstat.kes_pcbcount++;
5320 kevtstat.kes_gencnt++;
5321 lck_rw_done(kev_rwlock);
5322
5323 return (error);
5324 }
5325
5326 static void
5327 kev_delete(struct kern_event_pcb *ev_pcb)
5328 {
5329 VERIFY(ev_pcb != NULL);
5330 lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
5331 zfree(ev_pcb_zone, ev_pcb);
5332 }
5333
5334 static int
5335 kev_detach(struct socket *so)
5336 {
5337 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5338
5339 if (ev_pcb != NULL) {
5340 soisdisconnected(so);
5341 so->so_flags |= SOF_PCBCLEARING;
5342 }
5343
5344 return (0);
5345 }
5346
5347 /*
5348 * For now, kev_vendor_code and mbuf_tags use the same
5349 * mechanism.
5350 */
5351 errno_t kev_vendor_code_find(
5352 const char *string,
5353 u_int32_t *out_vendor_code)
5354 {
5355 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
5356 return (EINVAL);
5357 }
5358 return (net_str_id_find_internal(string, out_vendor_code,
5359 NSI_VENDOR_CODE, 1));
5360 }
5361
5362 errno_t
5363 kev_msg_post(struct kev_msg *event_msg)
5364 {
5365 mbuf_tag_id_t min_vendor, max_vendor;
5366
5367 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
5368
5369 if (event_msg == NULL)
5370 return (EINVAL);
5371
5372 /*
5373 * Limit third parties to posting events for registered vendor codes
5374 * only
5375 */
5376 if (event_msg->vendor_code < min_vendor ||
5377 event_msg->vendor_code > max_vendor) {
5378 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
5379 return (EINVAL);
5380 }
5381 return (kev_post_msg(event_msg));
5382 }
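/*
 * Minimal in-kernel sketch (illustrative; my_vendor_code, my_class,
 * my_subclass and my_payload are hypothetical): a caller fills in only the
 * struct kev_msg fields consumed by kev_post_msg() below -- vendor_code,
 * kev_class, kev_subclass, event_code and up to five data vectors -- and
 * hands the message off.
 *
 *	struct kev_msg ev_msg;
 *
 *	bzero(&ev_msg, sizeof(ev_msg));
 *	ev_msg.vendor_code  = my_vendor_code;	// from kev_vendor_code_find()
 *	ev_msg.kev_class    = my_class;
 *	ev_msg.kev_subclass = my_subclass;
 *	ev_msg.event_code   = 1;
 *	ev_msg.dv[0].data_ptr    = &my_payload;
 *	ev_msg.dv[0].data_length = sizeof(my_payload);
 *	(void)kev_msg_post(&ev_msg);
 */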
5383
5384 int
5385 kev_post_msg(struct kev_msg *event_msg)
5386 {
5387 struct mbuf *m, *m2;
5388 struct kern_event_pcb *ev_pcb;
5389 struct kern_event_msg *ev;
5390 char *tmp;
5391 u_int32_t total_size;
5392 int i;
5393
5394 /* Verify the message is small enough to fit in one mbuf w/o cluster */
5395 total_size = KEV_MSG_HEADER_SIZE;
5396
5397 for (i = 0; i < 5; i++) {
5398 if (event_msg->dv[i].data_length == 0)
5399 break;
5400 total_size += event_msg->dv[i].data_length;
5401 }
5402
5403 if (total_size > MLEN) {
5404 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
5405 return (EMSGSIZE);
5406 }
5407
5408 m = m_get(M_DONTWAIT, MT_DATA);
5409 if (m == 0) {
5410 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
5411 return (ENOMEM);
5412 }
5413 ev = mtod(m, struct kern_event_msg *);
5414 total_size = KEV_MSG_HEADER_SIZE;
5415
5416 tmp = (char *) &ev->event_data[0];
5417 for (i = 0; i < 5; i++) {
5418 if (event_msg->dv[i].data_length == 0)
5419 break;
5420
5421 total_size += event_msg->dv[i].data_length;
5422 bcopy(event_msg->dv[i].data_ptr, tmp,
5423 event_msg->dv[i].data_length);
5424 tmp += event_msg->dv[i].data_length;
5425 }
5426
5427 ev->id = ++static_event_id;
5428 ev->total_size = total_size;
5429 ev->vendor_code = event_msg->vendor_code;
5430 ev->kev_class = event_msg->kev_class;
5431 ev->kev_subclass = event_msg->kev_subclass;
5432 ev->event_code = event_msg->event_code;
5433
5434 m->m_len = total_size;
5435 lck_rw_lock_shared(kev_rwlock);
5436 for (ev_pcb = LIST_FIRST(&kern_event_head);
5437 ev_pcb;
5438 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
5439 lck_mtx_lock(&ev_pcb->evp_mtx);
5440 if (ev_pcb->evp_socket->so_pcb == NULL) {
5441 lck_mtx_unlock(&ev_pcb->evp_mtx);
5442 continue;
5443 }
5444 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
5445 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
5446 lck_mtx_unlock(&ev_pcb->evp_mtx);
5447 continue;
5448 }
5449
5450 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
5451 if (ev_pcb->evp_class_filter != ev->kev_class) {
5452 lck_mtx_unlock(&ev_pcb->evp_mtx);
5453 continue;
5454 }
5455
5456 if ((ev_pcb->evp_subclass_filter !=
5457 KEV_ANY_SUBCLASS) &&
5458 (ev_pcb->evp_subclass_filter !=
5459 ev->kev_subclass)) {
5460 lck_mtx_unlock(&ev_pcb->evp_mtx);
5461 continue;
5462 }
5463 }
5464 }
5465
5466 m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
5467 if (m2 == 0) {
5468 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
5469 m_free(m);
5470 lck_mtx_unlock(&ev_pcb->evp_mtx);
5471 lck_rw_done(kev_rwlock);
5472 return (ENOMEM);
5473 }
5474 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
5475 /*
5476 * We use "m" for the socket stats as it would be
5477 * unsafe to use "m2"
5478 */
5479 so_inc_recv_data_stat(ev_pcb->evp_socket,
5480 1, m->m_len, MBUF_TC_BE);
5481
5482 sorwakeup(ev_pcb->evp_socket);
5483 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
5484 } else {
5485 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
5486 }
5487 lck_mtx_unlock(&ev_pcb->evp_mtx);
5488 }
5489 m_free(m);
5490 lck_rw_done(kev_rwlock);
5491
5492 return (0);
5493 }
5494
5495 static int
5496 kev_control(struct socket *so,
5497 u_long cmd,
5498 caddr_t data,
5499 __unused struct ifnet *ifp,
5500 __unused struct proc *p)
5501 {
5502 struct kev_request *kev_req = (struct kev_request *) data;
5503 struct kern_event_pcb *ev_pcb;
5504 struct kev_vendor_code *kev_vendor;
5505 u_int32_t *id_value = (u_int32_t *) data;
5506
5507 switch (cmd) {
5508 case SIOCGKEVID:
5509 *id_value = static_event_id;
5510 break;
5511 case SIOCSKEVFILT:
5512 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5513 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
5514 ev_pcb->evp_class_filter = kev_req->kev_class;
5515 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
5516 break;
5517 case SIOCGKEVFILT:
5518 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5519 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
5520 kev_req->kev_class = ev_pcb->evp_class_filter;
5521 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
5522 break;
5523 case SIOCGKEVVENDOR:
5524 kev_vendor = (struct kev_vendor_code *)data;
5525 /* Make sure string is NULL terminated */
5526 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
5527 return (net_str_id_find_internal(kev_vendor->vendor_string,
5528 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
5529 default:
5530 return (ENOTSUP);
5531 }
5532
5533 return (0);
5534 }
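/*
 * Userspace sketch (illustrative only) of driving kev_control() above: a
 * kernel event socket is opened in the system domain and SIOCSKEVFILT
 * narrows delivery to one vendor/class/subclass tuple (the KEV_ANY_* values
 * leave a dimension unfiltered).
 *
 *	struct kev_request req;
 *	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *
 *	bzero(&req, sizeof(req));
 *	req.vendor_code  = KEV_VENDOR_APPLE;
 *	req.kev_class    = KEV_NETWORK_CLASS;
 *	req.kev_subclass = KEV_ANY_SUBCLASS;
 *	ioctl(s, SIOCSKEVFILT, &req);
 */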
5535
5536 int
5537 kevt_getstat SYSCTL_HANDLER_ARGS
5538 {
5539 #pragma unused(oidp, arg1, arg2)
5540 int error = 0;
5541
5542 lck_rw_lock_shared(kev_rwlock);
5543
5544 if (req->newptr != USER_ADDR_NULL) {
5545 error = EPERM;
5546 goto done;
5547 }
5548 if (req->oldptr == USER_ADDR_NULL) {
5549 req->oldidx = sizeof(struct kevtstat);
5550 goto done;
5551 }
5552
5553 error = SYSCTL_OUT(req, &kevtstat,
5554 MIN(sizeof(struct kevtstat), req->oldlen));
5555 done:
5556 lck_rw_done(kev_rwlock);
5557
5558 return (error);
5559 }
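/*
 * Illustrative sketch (assuming the nodes registered above resolve to the
 * MIB name "net.systm.kevt.stats" and that struct kevtstat is visible to
 * the caller): the counters can be read from userspace with sysctlbyname().
 *
 *	struct kevtstat st;
 *	size_t len = sizeof(st);
 *	if (sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0) == 0)
 *		printf("posted=%llu fullsock=%llu\n",
 *		    (unsigned long long)st.kes_posted,
 *		    (unsigned long long)st.kes_fullsock);
 */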
5560
5561 __private_extern__ int
5562 kevt_pcblist SYSCTL_HANDLER_ARGS
5563 {
5564 #pragma unused(oidp, arg1, arg2)
5565 int error = 0;
5566 int n, i;
5567 struct xsystmgen xsg;
5568 void *buf = NULL;
5569 size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
5570 ROUNDUP64(sizeof (struct xsocket_n)) +
5571 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
5572 ROUNDUP64(sizeof (struct xsockstat_n));
5573 struct kern_event_pcb *ev_pcb;
5574
5575 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
5576 if (buf == NULL)
5577 return (ENOMEM);
5578
5579 lck_rw_lock_shared(kev_rwlock);
5580
5581 n = kevtstat.kes_pcbcount;
5582
5583 if (req->oldptr == USER_ADDR_NULL) {
5584 req->oldidx = (n + n/8) * item_size;
5585 goto done;
5586 }
5587 if (req->newptr != USER_ADDR_NULL) {
5588 error = EPERM;
5589 goto done;
5590 }
5591 bzero(&xsg, sizeof (xsg));
5592 xsg.xg_len = sizeof (xsg);
5593 xsg.xg_count = n;
5594 xsg.xg_gen = kevtstat.kes_gencnt;
5595 xsg.xg_sogen = so_gencnt;
5596 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
5597 if (error) {
5598 goto done;
5599 }
5600 /*
5601 * We are done if there is no pcb
5602 */
5603 if (n == 0) {
5604 goto done;
5605 }
5606
5607 i = 0;
5608 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
5609 i < n && ev_pcb != NULL;
5610 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
5611 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
5612 struct xsocket_n *xso = (struct xsocket_n *)
5613 ADVANCE64(xk, sizeof (*xk));
5614 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
5615 ADVANCE64(xso, sizeof (*xso));
5616 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
5617 ADVANCE64(xsbrcv, sizeof (*xsbrcv));
5618 struct xsockstat_n *xsostats = (struct xsockstat_n *)
5619 ADVANCE64(xsbsnd, sizeof (*xsbsnd));
5620
5621 bzero(buf, item_size);
5622
5623 lck_mtx_lock(&ev_pcb->evp_mtx);
5624
5625 xk->kep_len = sizeof(struct xkevtpcb);
5626 xk->kep_kind = XSO_EVT;
5627 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
5628 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
5629 xk->kep_class_filter = ev_pcb->evp_class_filter;
5630 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
5631
5632 sotoxsocket_n(ev_pcb->evp_socket, xso);
5633 sbtoxsockbuf_n(ev_pcb->evp_socket ?
5634 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
5635 sbtoxsockbuf_n(ev_pcb->evp_socket ?
5636 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
5637 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
5638
5639 lck_mtx_unlock(&ev_pcb->evp_mtx);
5640
5641 error = SYSCTL_OUT(req, buf, item_size);
5642 }
5643
5644 if (error == 0) {
5645 /*
5646 * Give the user an updated idea of our state.
5647 * If the generation differs from what we told
5648 * her before, she knows that something happened
5649 * while we were processing this request, and it
5650 * might be necessary to retry.
5651 */
5652 bzero(&xsg, sizeof (xsg));
5653 xsg.xg_len = sizeof (xsg);
5654 xsg.xg_count = n;
5655 xsg.xg_gen = kevtstat.kes_gencnt;
5656 xsg.xg_sogen = so_gencnt;
5657 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
5658 if (error) {
5659 goto done;
5660 }
5661 }
5662
5663 done:
5664 lck_rw_done(kev_rwlock);
5665
5666 return (error);
5667 }
5668
5669 #endif /* SOCKETS */
5670
5671
5672 int
5673 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
5674 {
5675 struct vinfo_stat * st;
5676
5677 st = &kinfo->kq_stat;
5678
5679 st->vst_size = kq->kq_count;
5680 if (kq->kq_state & KQ_KEV_QOS)
5681 st->vst_blksize = sizeof(struct kevent_qos_s);
5682 else if (kq->kq_state & KQ_KEV64)
5683 st->vst_blksize = sizeof(struct kevent64_s);
5684 else
5685 st->vst_blksize = sizeof(struct kevent);
5686 st->vst_mode = S_IFIFO;
5687
5688 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
5689 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS)
5690 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
5691
5692 return (0);
5693 }
5694
5695
5696 void
5697 knote_markstayactive(struct knote *kn)
5698 {
5699 kqlock(knote_get_kq(kn));
5700 kn->kn_status |= KN_STAYACTIVE;
5701
5702 /* handle all stayactive knotes on the manager */
5703 if (knote_get_kq(kn)->kq_state & KQ_WORKQ)
5704 knote_set_qos_index(kn, KQWQ_QOS_MANAGER);
5705
5706 knote_activate(kn);
5707 kqunlock(knote_get_kq(kn));
5708 }
5709
5710 void
5711 knote_clearstayactive(struct knote *kn)
5712 {
5713 kqlock(knote_get_kq(kn));
5714 kn->kn_status &= ~KN_STAYACTIVE;
5715 knote_deactivate(kn);
5716 kqunlock(knote_get_kq(kn));
5717 }
5718
5719 static unsigned long
5720 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
5721 unsigned long buflen, unsigned long nknotes)
5722 {
5723 struct kevent_internal_s *kevp;
5724 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
5725 if (kq == knote_get_kq(kn)) {
5726 if (nknotes < buflen) {
5727 struct kevent_extinfo *info = &buf[nknotes];
5728 struct kevent_qos_s kevqos;
5729
5730 kqlock(kq);
5731 kevp = &(kn->kn_kevent);
5732
5733 bzero(&kevqos, sizeof(kevqos));
5734 kevqos.ident = kevp->ident;
5735 kevqos.filter = kevp->filter;
5736 kevqos.flags = kevp->flags;
5737 kevqos.fflags = kevp->fflags;
5738 kevqos.data = (int64_t) kevp->data;
5739 kevqos.udata = kevp->udata;
5740 kevqos.ext[0] = kevp->ext[0];
5741 kevqos.ext[1] = kevp->ext[1];
5742
5743 memcpy(&info->kqext_kev, &kevqos, sizeof(info->kqext_kev));
5744 info->kqext_sdata = kn->kn_sdata;
5745 info->kqext_status = kn->kn_status;
5746 info->kqext_sfflags = kn->kn_sfflags;
5747
5748 kqunlock(kq);
5749 }
5750
5751 /* we return the total number of knotes, which may be more than requested */
5752 nknotes++;
5753 }
5754 }
5755
5756 return nknotes;
5757 }
5758
5759 int
5760 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
5761 uint32_t bufsize, int32_t *retval)
5762 {
5763 struct knote *kn;
5764 int i;
5765 int err = 0;
5766 struct filedesc *fdp = p->p_fd;
5767 unsigned long nknotes = 0;
5768 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
5769 struct kevent_extinfo *kqext = NULL;
5770
5771 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
5772 buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
5773
5774 kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
5775 if (kqext == NULL) {
5776 err = ENOMEM;
5777 goto out;
5778 }
5779 bzero(kqext, buflen * sizeof(struct kevent_extinfo));
5780
5781 proc_fdlock(p);
5782
5783 for (i = 0; i < fdp->fd_knlistsize; i++) {
5784 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
5785 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
5786 }
5787
5788 if (fdp->fd_knhashmask != 0) {
5789 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
5790 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
5791 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
5792 }
5793 }
5794
5795 proc_fdunlock(p);
5796
5797 assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
5798 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
5799
5800 out:
5801 if (kqext) {
5802 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
5803 kqext = NULL;
5804 }
5805
5806 if (!err) {
5807 *retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
5808 }
5809 return err;
5810 }
5811
5812 static unsigned long
5813 kevent_udatainfo_emit(struct kqueue *kq, struct knote *kn, uint64_t *buf,
5814 unsigned long buflen, unsigned long nknotes)
5815 {
5816 struct kevent_internal_s *kevp;
5817 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
5818 if (kq == knote_get_kq(kn)) {
5819 if (nknotes < buflen) {
5820 kqlock(kq);
5821 kevp = &(kn->kn_kevent);
5822 buf[nknotes] = kevp->udata;
5823 kqunlock(kq);
5824 }
5825
5826 /* we return the total number of knotes, which may be more than requested */
5827 nknotes++;
5828 }
5829 }
5830
5831 return nknotes;
5832 }
5833
5834 int
5835 pid_kqueue_udatainfo(proc_t p, struct kqueue *kq, uint64_t *buf,
5836 uint32_t bufsize)
5837 {
5838 struct knote *kn;
5839 int i;
5840 struct filedesc *fdp = p->p_fd;
5841 unsigned long nknotes = 0;
5842 unsigned long buflen = bufsize / sizeof(uint64_t);
5843
5844 proc_fdlock(p);
5845
5846 for (i = 0; i < fdp->fd_knlistsize; i++) {
5847 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
5848 nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes);
5849 }
5850
5851 if (fdp->fd_knhashmask != 0) {
5852 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
5853 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
5854 nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes);
5855 }
5856 }
5857
5858 proc_fdunlock(p);
5859 return (int)nknotes;
5860 }
5861