apple/xnu (xnu-7195.81.3): bsd/kern/kern_event.c
1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <machine/atomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/syscall.h> // SYS_* constants
79 #include <sys/sysctl.h>
80 #include <sys/uio.h>
81 #include <sys/sysproto.h>
82 #include <sys/user.h>
83 #include <sys/vnode_internal.h>
84 #include <string.h>
85 #include <sys/proc_info.h>
86 #include <sys/codesign.h>
87 #include <sys/pthread_shims.h>
88 #include <sys/kdebug.h>
89 #include <os/base.h>
90 #include <pexpert/pexpert.h>
91
92 #include <kern/locks.h>
93 #include <kern/clock.h>
94 #include <kern/cpu_data.h>
95 #include <kern/policy_internal.h>
96 #include <kern/thread_call.h>
97 #include <kern/sched_prim.h>
98 #include <kern/waitq.h>
99 #include <kern/zalloc.h>
100 #include <kern/kalloc.h>
101 #include <kern/assert.h>
102 #include <kern/ast.h>
103 #include <kern/thread.h>
104 #include <kern/kcdata.h>
105
106 #include <pthread/priority_private.h>
107 #include <pthread/workqueue_syscalls.h>
108 #include <pthread/workqueue_internal.h>
109 #include <libkern/libkern.h>
110
111 #include "net/net_str_id.h"
112
113 #include <mach/task.h>
114 #include <libkern/section_keywords.h>
115
116 #if CONFIG_MEMORYSTATUS
117 #include <sys/kern_memorystatus.h>
118 #endif
119
120 #if DEVELOPMENT || DEBUG
121 #define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK (1U << 0)
122 #define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS (1U << 1)
123 TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
124 #endif
125
126 static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
127 SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
128 VM_PACKING_PARAMS(KNOTE_KQ_PACKED);
129
130 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
131 extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */
132
133 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
134
135 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
136
137 #define KQ_EVENT NO_EVENT64
138
139 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
140 vfs_context_t ctx);
141 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
142 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
143 struct kevent_qos_s *kev);
144 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
145
146 static const struct fileops kqueueops = {
147 .fo_type = DTYPE_KQUEUE,
148 .fo_read = fo_no_read,
149 .fo_write = fo_no_write,
150 .fo_ioctl = fo_no_ioctl,
151 .fo_select = kqueue_select,
152 .fo_close = kqueue_close,
153 .fo_drain = kqueue_drain,
154 .fo_kqfilter = kqueue_kqfilter,
155 };
156
157 static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
158 static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
159 static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
160 thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
161 static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
162 static void kevent_register_wait_cleanup(struct knote *kn);
163
164 static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
165 static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);
166
167 static void kqworkq_unbind(proc_t p, workq_threadreq_t);
168 static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
169 static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
170
171 static void kqworkloop_unbind(struct kqworkloop *kwql);
172
173 enum kqwl_unbind_locked_mode {
174 KQWL_OVERRIDE_DROP_IMMEDIATELY,
175 KQWL_OVERRIDE_DROP_DELAYED,
176 };
177 static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
178 enum kqwl_unbind_locked_mode how);
179 static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
180 static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
181 static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
182 enum {
183 KQWL_UTQ_NONE,
184 /*
185 * The wakeup qos is the qos of QUEUED knotes.
186 *
187 * This QoS is accounted for with the events override in the
188 * kqr_override_index field. It is raised each time a new knote is queued at
189 * a given QoS. The kqwl_wakeup_indexes field is a superset of the non-empty
190 * knote buckets and is recomputed after each event delivery.
191 */
192 KQWL_UTQ_UPDATE_WAKEUP_QOS,
193 KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
194 KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
195 KQWL_UTQ_UNBINDING, /* attempt to rebind */
196 KQWL_UTQ_PARKING,
197 /*
198 * The wakeup override is for suppressed knotes that have fired again at
199 * a higher QoS than the one for which they are suppressed already.
200 * This override is cleared when the knote suppressed list becomes empty.
201 */
202 KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
203 KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
204 /*
205 * The QoS is the maximum QoS of an event enqueued on this workloop in
206 * userland. It is copied from the only EVFILT_WORKLOOP knote with
207 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
208 * such knote, this QoS is 0.
209 */
210 KQWL_UTQ_SET_QOS_INDEX,
211 KQWL_UTQ_REDRIVE_EVENTS,
212 };
213 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
214 static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
215
216 static struct knote *knote_alloc(void);
217 static void knote_free(struct knote *kn);
218 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
219 struct knote_lock_ctx *knlc, struct proc *p);
220 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
221 struct kevent_qos_s *kev, bool is_fd, struct proc *p);
222
223 static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
224 static void knote_dequeue(kqueue_t kqu, struct knote *kn);
225
226 static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
227 struct kevent_qos_s *kev, int result);
228 static void knote_suppress(kqueue_t kqu, struct knote *kn);
229 static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
230 static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);
231
232 // both of these functions may dequeue the knote; it is up to the caller
233 // to enqueue the knote again
234 static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
235 static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);
236
237 static ZONE_DECLARE(knote_zone, "knote zone",
238 sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
239 static ZONE_DECLARE(kqfile_zone, "kqueue file zone",
240 sizeof(struct kqfile), ZC_ZFREE_CLEARMEM);
241 static ZONE_DECLARE(kqworkq_zone, "kqueue workq zone",
242 sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM);
243 static ZONE_DECLARE(kqworkloop_zone, "kqueue workloop zone",
244 sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM);
245
246 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
247
248 static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
249 static void filt_no_detach(struct knote *kn);
250 static int filt_bad_event(struct knote *kn, long hint);
251 static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
252 static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);
253
254 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
255 .f_attach = filt_no_attach,
256 .f_detach = filt_no_detach,
257 .f_event = filt_bad_event,
258 .f_touch = filt_bad_touch,
259 .f_process = filt_bad_process,
260 };
261
262 #if CONFIG_MEMORYSTATUS
263 extern const struct filterops memorystatus_filtops;
264 #endif /* CONFIG_MEMORYSTATUS */
265 extern const struct filterops fs_filtops;
266 extern const struct filterops sig_filtops;
267 extern const struct filterops machport_filtops;
268 extern const struct filterops pipe_nfiltops;
269 extern const struct filterops pipe_rfiltops;
270 extern const struct filterops pipe_wfiltops;
271 extern const struct filterops ptsd_kqops;
272 extern const struct filterops ptmx_kqops;
273 extern const struct filterops soread_filtops;
274 extern const struct filterops sowrite_filtops;
275 extern const struct filterops sock_filtops;
276 extern const struct filterops soexcept_filtops;
277 extern const struct filterops spec_filtops;
278 extern const struct filterops bpfread_filtops;
279 extern const struct filterops necp_fd_rfiltops;
280 extern const struct filterops fsevent_filtops;
281 extern const struct filterops vnode_filtops;
282 extern const struct filterops tty_filtops;
283
284 const static struct filterops file_filtops;
285 const static struct filterops kqread_filtops;
286 const static struct filterops proc_filtops;
287 const static struct filterops timer_filtops;
288 const static struct filterops user_filtops;
289 const static struct filterops workloop_filtops;
290
291 /*
292 *
293 * Rules for adding new filters to the system:
294 * Public filters:
295 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
296 * in the exported section of the header
297 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
298 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
299 * of the Public Filters section in the array.
300 * Private filters:
301 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
302 * in the XNU_KERNEL_PRIVATE section of the header
303 * - Update the EVFILTID_MAX value to reflect the new addition
304 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
305 * the Private filters section of the array.
306 */
307 static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
308 static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
309 /* Public Filters */
310 [~EVFILT_READ] = &file_filtops,
311 [~EVFILT_WRITE] = &file_filtops,
312 [~EVFILT_AIO] = &bad_filtops,
313 [~EVFILT_VNODE] = &file_filtops,
314 [~EVFILT_PROC] = &proc_filtops,
315 [~EVFILT_SIGNAL] = &sig_filtops,
316 [~EVFILT_TIMER] = &timer_filtops,
317 [~EVFILT_MACHPORT] = &machport_filtops,
318 [~EVFILT_FS] = &fs_filtops,
319 [~EVFILT_USER] = &user_filtops,
320 [~EVFILT_UNUSED_11] = &bad_filtops,
321 [~EVFILT_VM] = &bad_filtops,
322 [~EVFILT_SOCK] = &file_filtops,
323 #if CONFIG_MEMORYSTATUS
324 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
325 #else
326 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
327 #endif
328 [~EVFILT_EXCEPT] = &file_filtops,
329 [~EVFILT_WORKLOOP] = &workloop_filtops,
330
331 /* Private filters */
332 [EVFILTID_KQREAD] = &kqread_filtops,
333 [EVFILTID_PIPE_N] = &pipe_nfiltops,
334 [EVFILTID_PIPE_R] = &pipe_rfiltops,
335 [EVFILTID_PIPE_W] = &pipe_wfiltops,
336 [EVFILTID_PTSD] = &ptsd_kqops,
337 [EVFILTID_SOREAD] = &soread_filtops,
338 [EVFILTID_SOWRITE] = &sowrite_filtops,
339 [EVFILTID_SCK] = &sock_filtops,
340 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
341 [EVFILTID_SPEC] = &spec_filtops,
342 [EVFILTID_BPFREAD] = &bpfread_filtops,
343 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
344 [EVFILTID_FSEVENT] = &fsevent_filtops,
345 [EVFILTID_VN] = &vnode_filtops,
346 [EVFILTID_TTY] = &tty_filtops,
347 [EVFILTID_PTMX] = &ptmx_kqops,
348
349 /* fake filter for detached knotes, keep last */
350 [EVFILTID_DETACHED] = &bad_filtops,
351 };
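/*
 * Illustrative sketch (hypothetical helper, not used by this file): how the
 * index expressions above work out. Public EVFILT_* values are small negative
 * numbers, so bitwise NOT folds them onto the low slots of sysfilt_ops[],
 * while private EVFILTID_* values are positive and index directly.
 */
#if 0
static const struct filterops *
sysfilt_lookup_public(int16_t filter)
{
	/* EVFILT_READ  == -1  ->  ~(-1) == 0  ->  sysfilt_ops[0]
	 * EVFILT_WRITE == -2  ->  ~(-2) == 1  ->  sysfilt_ops[1], and so on */
	return sysfilt_ops[~filter];
}
#endif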
352
353 /* waitq prepost callback */
354 void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook);
355
356 static inline bool
357 kqr_thread_bound(workq_threadreq_t kqr)
358 {
359 return kqr->tr_state == WORKQ_TR_STATE_BOUND;
360 }
361
362 static inline bool
363 kqr_thread_requested_pending(workq_threadreq_t kqr)
364 {
365 workq_tr_state_t tr_state = kqr->tr_state;
366 return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
367 }
368
369 static inline bool
370 kqr_thread_requested(workq_threadreq_t kqr)
371 {
372 return kqr->tr_state != WORKQ_TR_STATE_IDLE;
373 }
374
375 static inline thread_t
376 kqr_thread_fast(workq_threadreq_t kqr)
377 {
378 assert(kqr_thread_bound(kqr));
379 return kqr->tr_thread;
380 }
381
382 static inline thread_t
383 kqr_thread(workq_threadreq_t kqr)
384 {
385 return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
386 }
387
388 static inline struct kqworkloop *
389 kqr_kqworkloop(workq_threadreq_t kqr)
390 {
391 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
392 return __container_of(kqr, struct kqworkloop, kqwl_request);
393 }
394 return NULL;
395 }
396
397 static inline kqueue_t
398 kqr_kqueue(proc_t p, workq_threadreq_t kqr)
399 {
400 kqueue_t kqu;
401 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
402 kqu.kqwl = kqr_kqworkloop(kqr);
403 } else {
404 kqu.kqwq = p->p_fd->fd_wqkqueue;
405 assert(kqr >= kqu.kqwq->kqwq_request &&
406 kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
407 }
408 return kqu;
409 }
410
411 /*
412 * kqueue/note lock implementations
413 *
414 * The kqueue lock guards the kq state, the state of its queues,
415 * and the kqueue-aware status and locks of individual knotes.
416 *
417 * The kqueue workq lock is used to protect state guarding the
418 * interaction of the kqueue with the workq. This state cannot
419 * be guarded by the kq lock - as it needs to be taken when we
420 * already have the waitq set lock held (during the waitq hook
421 * callback). It might be better to use the waitq lock itself
422 * for this, but the IRQ requirements make that difficult.
423 *
424 * Knote flags, filter flags, and associated data are protected
425 * by the underlying object lock - and are only ever looked at
426 * by calling the filter to get a [consistent] snapshot of that
427 * data.
428 */
429
430 static inline void
431 kqlock(kqueue_t kqu)
432 {
433 lck_spin_lock(&kqu.kq->kq_lock);
434 }
435
436 static inline void
437 kqlock_held(__assert_only kqueue_t kqu)
438 {
439 LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
440 }
441
442 static inline void
443 kqunlock(kqueue_t kqu)
444 {
445 lck_spin_unlock(&kqu.kq->kq_lock);
446 }
447
448 static inline void
449 knhash_lock(struct filedesc *fdp)
450 {
451 lck_mtx_lock(&fdp->fd_knhashlock);
452 }
453
454 static inline void
455 knhash_unlock(struct filedesc *fdp)
456 {
457 lck_mtx_unlock(&fdp->fd_knhashlock);
458 }
459
460 /* wait event for knote locks */
461 static inline event_t
462 knote_lock_wev(struct knote *kn)
463 {
464 return (event_t)(&kn->kn_hook);
465 }
466
467 /* wait event for kevent_register_wait_* */
468 static inline event64_t
469 knote_filt_wev64(struct knote *kn)
470 {
471 /* kdp_workloop_sync_wait_find_owner knows about this */
472 return CAST_EVENT64_T(kn);
473 }
474
475 /* wait event for knote_post/knote_drop */
476 static inline event64_t
477 knote_post_wev64(struct knote *kn)
478 {
479 return CAST_EVENT64_T(&kn->kn_kevent);
480 }
481
482 /*!
483 * @function knote_has_qos
484 *
485 * @brief
486 * Whether the knote has a regular QoS.
487 *
488 * @discussion
489 * kn_qos_override is:
490 * - 0 on kqfiles
491 * - THREAD_QOS_LAST for special buckets (stayactive, manager)
492 *
493 * Other values mean the knote participates in QoS propagation.
494 */
495 static inline bool
496 knote_has_qos(struct knote *kn)
497 {
498 return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
499 }
500
501 #pragma mark knote locks
502
503 /*
504 * Enum used by the knote_lock_* functions.
505 *
506 * KNOTE_KQ_LOCK_ALWAYS
507 * The function will always return with the kq lock held.
508 *
509 * KNOTE_KQ_LOCK_ON_SUCCESS
510 * The function will return with the kq lock held if it was successful
511 * (knote_lock() is the only function that can fail).
512 *
513 * KNOTE_KQ_LOCK_ON_FAILURE
514 * The function will return with the kq lock held if it was unsuccessful
515 * (knote_lock() is the only function that can fail).
516 *
517 * KNOTE_KQ_UNLOCK
518 * The function returns with the kq unlocked.
519 */
520 enum kqlocking {
521 KNOTE_KQ_LOCK_ALWAYS,
522 KNOTE_KQ_LOCK_ON_SUCCESS,
523 KNOTE_KQ_LOCK_ON_FAILURE,
524 KNOTE_KQ_UNLOCK,
525 };
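/*
 * Illustrative sketch (hypothetical caller): how the KNOTE_KQ_* modes compose
 * with knote_lock()/knote_unlock() below. Only knote_lock() can fail, and the
 * mode argument states which lock the caller wants back in each outcome.
 */
#if 0
static void
knote_lock_usage_sketch(struct kqueue *kq, struct knote *kn)
{
	/* simplified declaration; on DEBUG builds the real callers also
	 * initialize knlc_state to KNOTE_LOCK_CTX_UNLOCKED */
	struct knote_lock_ctx knlc;

	kqlock(kq);
	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/* failed: the knote was dropped concurrently, and because we
		 * asked for LOCK_ON_SUCCESS the kq lock is already released */
		return;
	}

	/* success: both the kq lock and the knote lock are held here,
	 * serializing us against concurrent register/drop of this knote */

	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	/* knote lock released, kq left unlocked as requested */
}
#endif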
526
527 static struct knote_lock_ctx *
528 knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
529 {
530 struct knote_lock_ctx *ctx;
531 LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
532 if (ctx->knlc_knote == kn) {
533 return ctx;
534 }
535 }
536 panic("knote lock context not found: %p", kn);
537 __builtin_trap();
538 }
539
540 /* slowpath of knote_lock() */
541 __attribute__((noinline))
542 static bool __result_use_check
543 knote_lock_slow(kqueue_t kqu, struct knote *kn,
544 struct knote_lock_ctx *knlc, int kqlocking)
545 {
546 struct knote_lock_ctx *owner_lc;
547 struct uthread *uth = current_uthread();
548 wait_result_t wr;
549
550 kqlock_held(kqu);
551
552 owner_lc = knote_lock_ctx_find(kqu, kn);
553 #if DEBUG || DEVELOPMENT
554 knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
555 #endif
556 owner_lc->knlc_waiters++;
557
558 /*
559 * Make our lock context visible to knote_unlock()
560 */
561 uth->uu_knlock = knlc;
562
563 wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
564 knote_lock_wev(kn), owner_lc->knlc_thread,
565 THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
566
567 if (wr == THREAD_RESTART) {
568 /*
569 * We haven't been woken up by knote_unlock() but by knote_unlock_cancel().
570 * We need to clean up the state since no one else did.
571 */
572 uth->uu_knlock = NULL;
573 #if DEBUG || DEVELOPMENT
574 assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
575 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
576 #endif
577
578 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
579 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
580 kqlock(kqu);
581 }
582 return false;
583 } else {
584 if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
585 kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
586 kqlock(kqu);
587 #if DEBUG || DEVELOPMENT
588 /*
589 * This state is set under the lock so we can't
590 * really assert this unless we hold the lock.
591 */
592 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
593 #endif
594 }
595 return true;
596 }
597 }
598
599 /*
600 * Attempts to take the "knote" lock.
601 *
602 * Called with the kqueue lock held.
603 *
604 * Returns true if the knote lock is acquired, false if it has been dropped
605 */
606 static bool __result_use_check
607 knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
608 enum kqlocking kqlocking)
609 {
610 kqlock_held(kqu);
611
612 #if DEBUG || DEVELOPMENT
613 assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
614 #endif
615 knlc->knlc_knote = kn;
616 knlc->knlc_thread = current_thread();
617 knlc->knlc_waiters = 0;
618
619 if (__improbable(kn->kn_status & KN_LOCKED)) {
620 return knote_lock_slow(kqu, kn, knlc, kqlocking);
621 }
622
623 /*
624 * When a knote is dropped, the knote lock is taken before
625 * KN_DROPPING is set, and the knote is then removed from any
626 * hash table that references it before the lock is canceled.
627 */
628 assert((kn->kn_status & KN_DROPPING) == 0);
629 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
630 kn->kn_status |= KN_LOCKED;
631 #if DEBUG || DEVELOPMENT
632 knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
633 #endif
634
635 if (kqlocking == KNOTE_KQ_UNLOCK ||
636 kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
637 kqunlock(kqu);
638 }
639 return true;
640 }
641
642 /*
643 * Unlocks a knote successfully locked with knote_lock().
644 *
645 * Called with the kqueue lock held.
646 *
647 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
648 */
649 static void
650 knote_unlock(kqueue_t kqu, struct knote *kn,
651 struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
652 {
653 kqlock_held(kqu);
654
655 assert(knlc->knlc_knote == kn);
656 assert(kn->kn_status & KN_LOCKED);
657 #if DEBUG || DEVELOPMENT
658 assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
659 #endif
660
661 LIST_REMOVE(knlc, knlc_link);
662
663 if (knlc->knlc_waiters) {
664 thread_t thread = THREAD_NULL;
665
666 wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
667 LCK_WAKE_DEFAULT, &thread);
668
669 /*
670 * knote_lock_slow() publishes the lock context of waiters
671 * in uthread::uu_knlock.
672 *
673 * Reach out and make this context the new owner.
674 */
675 struct uthread *ut = get_bsdthread_info(thread);
676 struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;
677
678 assert(next_owner_lc->knlc_knote == kn);
679 next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
680 LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
681 #if DEBUG || DEVELOPMENT
682 next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
683 #endif
684 ut->uu_knlock = NULL;
685 thread_deallocate_safe(thread);
686 } else {
687 kn->kn_status &= ~KN_LOCKED;
688 }
689
690 if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
691 /*
692 * No f_event() in flight anymore, we can leave QoS "Merge" mode
693 *
694 * See knote_adjust_qos()
695 */
696 kn->kn_status &= ~KN_MERGE_QOS;
697 }
698 if (kqlocking == KNOTE_KQ_UNLOCK) {
699 kqunlock(kqu);
700 }
701 #if DEBUG || DEVELOPMENT
702 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
703 #endif
704 }
705
706 /*
707 * Aborts all waiters for a knote lock, and unlocks the knote.
708 *
709 * Called with the kqueue lock held.
710 *
711 * Returns with the kqueue unlocked.
712 */
713 static void
714 knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
715 struct knote_lock_ctx *knlc)
716 {
717 kqlock_held(kq);
718
719 assert(knlc->knlc_knote == kn);
720 assert(kn->kn_status & KN_LOCKED);
721 assert(kn->kn_status & KN_DROPPING);
722
723 LIST_REMOVE(knlc, knlc_link);
724 kn->kn_status &= ~KN_LOCKED;
725 kqunlock(kq);
726
727 if (knlc->knlc_waiters) {
728 wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
729 }
730 #if DEBUG || DEVELOPMENT
731 knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
732 #endif
733 }
734
735 /*
736 * Call the f_event hook of a given filter.
737 *
738 * Takes a use count to protect against concurrent drops.
739 */
740 static void
741 knote_post(struct knote *kn, long hint)
742 {
743 struct kqueue *kq = knote_get_kq(kn);
744 int dropping, result;
745
746 kqlock(kq);
747
748 if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
749 return kqunlock(kq);
750 }
751
752 if (__improbable(kn->kn_status & KN_POSTING)) {
753 panic("KNOTE() called concurrently on knote %p", kn);
754 }
755
756 kn->kn_status |= KN_POSTING;
757
758 kqunlock(kq);
759 result = filter_call(knote_fops(kn), f_event(kn, hint));
760 kqlock(kq);
761
762 dropping = (kn->kn_status & KN_DROPPING);
763
764 if (!dropping && (result & FILTER_ACTIVE)) {
765 knote_activate(kq, kn, result);
766 }
767
768 if ((kn->kn_status & KN_LOCKED) == 0) {
769 /*
770 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
771 *
772 * See knote_adjust_qos()
773 */
774 kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
775 } else {
776 kn->kn_status &= ~KN_POSTING;
777 }
778
779 if (__improbable(dropping)) {
780 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, knote_post_wev64(kn),
781 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
782 }
783
784 kqunlock(kq);
785 }
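/*
 * Illustrative sketch (hypothetical driver, not part of this subsystem):
 * kernel code does not call knote_post() directly. Subsystems keep attached
 * knotes on a struct klist and publish events through the KNOTE() macro,
 * which walks the list and funnels each knote through knote_post() with the
 * subsystem's hint.
 */
#if 0
struct my_device_softc {		/* hypothetical driver state */
	struct klist	sc_note;	/* knotes added by the filter's f_attach */
};

static void
my_device_data_ready(struct my_device_softc *sc, long bytes_ready)
{
	/* each attached knote ends up in knote_post() -> f_event(kn, hint) */
	KNOTE(&sc->sc_note, bytes_ready);
}
#endif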
786
787 /*
788 * Called by knote_drop() to wait for the last f_event() caller to be done.
789 *
790 * - kq locked at entry
791 * - kq unlocked at exit
792 */
793 static void
794 knote_wait_for_post(struct kqueue *kq, struct knote *kn)
795 {
796 wait_result_t wr = THREAD_NOT_WAITING;
797
798 kqlock_held(kq);
799
800 assert(kn->kn_status & KN_DROPPING);
801
802 if (kn->kn_status & KN_POSTING) {
803 wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
804 knote_post_wev64(kn), THREAD_UNINT | THREAD_WAIT_NOREPORT,
805 TIMEOUT_WAIT_FOREVER);
806 }
807 kqunlock(kq);
808 if (wr == THREAD_WAITING) {
809 thread_block(THREAD_CONTINUE_NULL);
810 }
811 }
812
813 #pragma mark knote helpers for filters
814
815 OS_ALWAYS_INLINE
816 void
817 knote_set_error(struct knote *kn, int error)
818 {
819 kn->kn_flags |= EV_ERROR;
820 kn->kn_sdata = error;
821 }
822
823 OS_ALWAYS_INLINE
824 int64_t
825 knote_low_watermark(const struct knote *kn)
826 {
827 return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
828 }
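/*
 * Illustrative sketch (hypothetical userspace client): NOTE_LOWAT is how the
 * low watermark consumed by knote_low_watermark() gets set. The flag goes in
 * fflags and the byte threshold rides in the kevent's data field.
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
watch_socket_lowat(int kq, int sock_fd)
{
	struct kevent kev;

	/* only report the socket readable once at least 512 bytes are queued */
	EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD | EV_CLEAR, NOTE_LOWAT, 512, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent");
	}
}
#endif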
829
830 /*!
831 * @function knote_fill_kevent_with_sdata
832 *
833 * @brief
834 * Fills in a kevent from the current content of a knote.
835 *
836 * @discussion
837 * This is meant to be called from filter's f_event hooks.
838 * The kevent data is filled with kn->kn_sdata.
839 *
840 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
841 *
842 * Using knote_fill_kevent is typically preferred.
843 */
844 OS_ALWAYS_INLINE
845 void
846 knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
847 {
848 #define knote_assert_aliases(name1, offs1, name2) \
849 static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
850 offsetof(struct kevent_internal_s, name2), \
851 "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 "need to alias")
852 /*
853 * All the code makes assumptions on these aliasing,
854 * so make sure we fail the build if we ever ever ever break them.
855 */
856 knote_assert_aliases(ident, 0, kei_ident);
857 #ifdef __LITTLE_ENDIAN__
858 knote_assert_aliases(filter, 0, kei_filter); // non trivial overlap
859 knote_assert_aliases(filter, 1, kei_filtid); // non trivial overlap
860 #else
861 knote_assert_aliases(filter, 0, kei_filtid); // non trivial overlap
862 knote_assert_aliases(filter, 1, kei_filter); // non trivial overlap
863 #endif
864 knote_assert_aliases(flags, 0, kei_flags);
865 knote_assert_aliases(qos, 0, kei_qos);
866 knote_assert_aliases(udata, 0, kei_udata);
867 knote_assert_aliases(fflags, 0, kei_fflags);
868 knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
869 knote_assert_aliases(data, 0, kei_sdata); // non trivial overlap
870 knote_assert_aliases(ext, 0, kei_ext);
871 #undef knote_assert_aliases
872
873 /*
874 * Fix the differences between kevent_qos_s and kevent_internal_s:
875 * - xflags is where kn_sfflags lives, we need to zero it
876 * - fixup the high bits of `filter` where kn_filtid lives
877 */
878 *kev = *(struct kevent_qos_s *)&kn->kn_kevent;
879 kev->xflags = 0;
880 kev->filter |= 0xff00;
881 if (kn->kn_flags & EV_CLEAR) {
882 kn->kn_fflags = 0;
883 }
884 }
885
886 /*!
887 * @function knote_fill_kevent
888 *
889 * @brief
890 * Fills in a kevent from the current content of a knote.
891 *
892 * @discussion
893 * This is meant to be called from filter's f_event hooks.
894 * The kevent data is filled with the passed in data.
895 *
896 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
897 */
898 OS_ALWAYS_INLINE
899 void
900 knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
901 {
902 knote_fill_kevent_with_sdata(kn, kev);
903 kev->filter = kn->kn_filter;
904 kev->data = data;
905 }
906
907
908 #pragma mark file_filtops
909
910 static int
911 filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
912 {
913 return fo_kqfilter(kn->kn_fp, kn, kev);
914 }
915
916 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
917 .f_isfd = 1,
918 .f_attach = filt_fileattach,
919 };
920
921 #pragma mark kqread_filtops
922
923 #define f_flag fp_glob->fg_flag
924 #define f_ops fp_glob->fg_ops
925 #define f_data fp_glob->fg_data
926 #define f_lflags fp_glob->fg_lflags
927
928 static void
929 filt_kqdetach(struct knote *kn)
930 {
931 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
932 struct kqueue *kq = &kqf->kqf_kqueue;
933
934 kqlock(kq);
935 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
936 kqunlock(kq);
937 }
938
939 static int
940 filt_kqueue(struct knote *kn, __unused long hint)
941 {
942 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
943
944 return kq->kq_count > 0;
945 }
946
947 static int
948 filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
949 {
950 #pragma unused(kev)
951 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
952 int res;
953
954 kqlock(kq);
955 res = (kq->kq_count > 0);
956 kqunlock(kq);
957
958 return res;
959 }
960
961 static int
962 filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
963 {
964 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
965 int res = 0;
966
967 kqlock(kq);
968 if (kq->kq_count) {
969 knote_fill_kevent(kn, kev, kq->kq_count);
970 res = 1;
971 }
972 kqunlock(kq);
973
974 return res;
975 }
976
977 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
978 .f_isfd = 1,
979 .f_detach = filt_kqdetach,
980 .f_event = filt_kqueue,
981 .f_touch = filt_kqtouch,
982 .f_process = filt_kqprocess,
983 };
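/*
 * Illustrative sketch (hypothetical userspace client): kqread_filtops backs
 * EVFILT_READ on a kqueue file descriptor itself, so one kqueue can watch
 * another become readable; the returned data is the inner queue's pending
 * event count (kq_count).
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
watch_inner_kqueue(void)
{
	int outer = kqueue();
	int inner = kqueue();
	struct kevent kev;

	if (outer == -1 || inner == -1) {
		err(1, "kqueue");
	}

	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(outer, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent(register)");
	}

	/* blocks until at least one event is pending on `inner`;
	 * kev.data then holds the number of pending events */
	(void)kevent(outer, NULL, 0, &kev, 1, NULL);
}
#endif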
984
985 #pragma mark proc_filtops
986
987 static int
988 filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
989 {
990 struct proc *p;
991
992 assert(PID_MAX < NOTE_PDATAMASK);
993
994 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
995 knote_set_error(kn, ENOTSUP);
996 return 0;
997 }
998
999 p = proc_find((int)kn->kn_id);
1000 if (p == NULL) {
1001 knote_set_error(kn, ESRCH);
1002 return 0;
1003 }
1004
1005 const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
1006
1007 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
1008 do {
1009 pid_t selfpid = proc_selfpid();
1010
1011 if (p->p_ppid == selfpid) {
1012 break; /* parent => ok */
1013 }
1014 if ((p->p_lflag & P_LTRACED) != 0 &&
1015 (p->p_oppid == selfpid)) {
1016 break; /* parent-in-waiting => ok */
1017 }
1018 if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
1019 break; /* allowed to signal => ok */
1020 }
1021 proc_rele(p);
1022 knote_set_error(kn, EACCES);
1023 return 0;
1024 } while (0);
1025 }
1026
1027 kn->kn_proc = p;
1028 kn->kn_flags |= EV_CLEAR; /* automatically set */
1029 kn->kn_sdata = 0; /* incoming data is ignored */
1030
1031 proc_klist_lock();
1032
1033 KNOTE_ATTACH(&p->p_klist, kn);
1034
1035 proc_klist_unlock();
1036
1037 proc_rele(p);
1038
1039 /*
1040 * only captures edge-triggered events after this point
1041 * so it can't already be fired.
1042 */
1043 return 0;
1044 }
1045
1046
1047 /*
1048 * The knote may be attached to a different process, which may exit,
1049 * leaving nothing for the knote to be attached to. In that case,
1050 * the pointer to the process will have already been nulled out.
1051 */
1052 static void
1053 filt_procdetach(struct knote *kn)
1054 {
1055 struct proc *p;
1056
1057 proc_klist_lock();
1058
1059 p = kn->kn_proc;
1060 if (p != PROC_NULL) {
1061 kn->kn_proc = PROC_NULL;
1062 KNOTE_DETACH(&p->p_klist, kn);
1063 }
1064
1065 proc_klist_unlock();
1066 }
1067
1068 static int
1069 filt_procevent(struct knote *kn, long hint)
1070 {
1071 u_int event;
1072
1073 /* ALWAYS CALLED WITH proc_klist_lock */
1074
1075 /*
1076 * Note: a lot of bits in hint may be obtained from the knote
1077 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1078 * bits in hint for filt_procevent
1079 *
1080 * mask off extra data
1081 */
1082 event = (u_int)hint & NOTE_PCTRLMASK;
1083
1084 /*
1085 * termination lifecycle events can happen while a debugger
1086 * has reparented a process, in which case notifications
1087 * should be quashed except to the tracing parent. When
1088 * the debugger reaps the child (either via wait4(2) or
1089 * process exit), the child will be reparented to the original
1090 * parent and these knotes re-fired.
1091 */
1092 if (event & NOTE_EXIT) {
1093 if ((kn->kn_proc->p_oppid != 0)
1094 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_proc->p_ppid)) {
1095 /*
1096 * This knote is not for the current ptrace(2) parent, ignore.
1097 */
1098 return 0;
1099 }
1100 }
1101
1102 /*
1103 * if the user is interested in this event, record it.
1104 */
1105 if (kn->kn_sfflags & event) {
1106 kn->kn_fflags |= event;
1107 }
1108
1109 #pragma clang diagnostic push
1110 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1111 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1112 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1113 }
1114 #pragma clang diagnostic pop
1115
1116
1117 /*
1118 * The kernel has a wrapper in place that returns the same data
1119 * as is collected here, in kn_hook32. Any changes to how
1120 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1121 * should also be reflected in the proc_pidnoteexit() wrapper.
1122 */
1123 if (event == NOTE_EXIT) {
1124 kn->kn_hook32 = 0;
1125 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1126 kn->kn_fflags |= NOTE_EXITSTATUS;
1127 kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
1128 }
1129 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1130 kn->kn_fflags |= NOTE_EXIT_DETAIL;
1131 if ((kn->kn_proc->p_lflag &
1132 P_LTERM_DECRYPTFAIL) != 0) {
1133 kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
1134 }
1135 if ((kn->kn_proc->p_lflag &
1136 P_LTERM_JETSAM) != 0) {
1137 kn->kn_hook32 |= NOTE_EXIT_MEMORY;
1138 switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
1139 case P_JETSAM_VMPAGESHORTAGE:
1140 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1141 break;
1142 case P_JETSAM_VMTHRASHING:
1143 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
1144 break;
1145 case P_JETSAM_FCTHRASHING:
1146 kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
1147 break;
1148 case P_JETSAM_VNODE:
1149 kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
1150 break;
1151 case P_JETSAM_HIWAT:
1152 kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
1153 break;
1154 case P_JETSAM_PID:
1155 kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
1156 break;
1157 case P_JETSAM_IDLEEXIT:
1158 kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
1159 break;
1160 }
1161 }
1162 if ((kn->kn_proc->p_csflags &
1163 CS_KILLED) != 0) {
1164 kn->kn_hook32 |= NOTE_EXIT_CSERROR;
1165 }
1166 }
1167 }
1168
1169 /* if we have any matching state, activate the knote */
1170 return kn->kn_fflags != 0;
1171 }
1172
1173 static int
1174 filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
1175 {
1176 int res;
1177
1178 proc_klist_lock();
1179
1180 /* accept new filter flags and mask off output events no longer interesting */
1181 kn->kn_sfflags = kev->fflags;
1182
1183 /* restrict the current results to the (smaller?) set of new interest */
1184 /*
1185 * For compatibility with previous implementations, we leave kn_fflags
1186 * as they were before.
1187 */
1188 //kn->kn_fflags &= kn->kn_sfflags;
1189
1190 res = (kn->kn_fflags != 0);
1191
1192 proc_klist_unlock();
1193
1194 return res;
1195 }
1196
1197 static int
1198 filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
1199 {
1200 int res = 0;
1201
1202 proc_klist_lock();
1203 if (kn->kn_fflags) {
1204 knote_fill_kevent(kn, kev, kn->kn_hook32);
1205 kn->kn_hook32 = 0;
1206 res = 1;
1207 }
1208 proc_klist_unlock();
1209 return res;
1210 }
1211
1212 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
1213 .f_attach = filt_procattach,
1214 .f_detach = filt_procdetach,
1215 .f_event = filt_procevent,
1216 .f_touch = filt_proctouch,
1217 .f_process = filt_procprocess,
1218 };
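/*
 * Illustrative sketch (hypothetical userspace client): watching a child with
 * the proc filter above. NOTE_EXITSTATUS makes filt_procevent() stash the
 * wait(2)-style status in the hook so it is returned in kev.data, and the
 * attach-time checks in filt_procattach() require the caller to be the
 * parent (or otherwise allowed to signal the target).
 */
#if 0
#include <sys/event.h>
#include <sys/wait.h>
#include <unistd.h>
#include <err.h>

static void
watch_child_exit(int kq, pid_t child)
{
	struct kevent kev;

	EV_SET(&kev, child, EVFILT_PROC, EV_ADD | EV_ONESHOT,
	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent(register)");
	}

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1 && (kev.fflags & NOTE_EXIT)) {
		int status = (int)kev.data;	/* wait(2)-style status word */
		if (WIFEXITED(status)) {
			/* WEXITSTATUS(status) is the child's exit code */
		}
	}
}
#endif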
1219
1220 #pragma mark timer_filtops
1221
1222 struct filt_timer_params {
1223 uint64_t deadline; /* deadline in abs/cont time
1224 * (or 0 if NOTE_ABSOLUTE and deadline is in past) */
1225 uint64_t leeway; /* leeway in abstime, or 0 if none */
1226 uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
1227 };
1228
1229 /*
1230 * Values stored in the knote at rest (using Mach absolute time units)
1231 *
1232 * kn->kn_thcall where the thread_call object is stored
1233 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1234 * kn->kn_ext[1] leeway value
1235 * kn->kn_sdata interval timer: the interval
1236 * absolute/deadline timer: 0
1237 * kn->kn_hook32 timer state (with gencount)
1238 *
1239 * TIMER_IDLE:
1240 * The timer has either never been scheduled or been cancelled.
1241 * It is safe to schedule a new one in this state.
1242 *
1243 * TIMER_ARMED:
1244 * The timer has been scheduled
1245 *
1246 * TIMER_FIRED
1247 * The timer has fired and an event needs to be delivered.
1248 * When in this state, the callout may still be running.
1249 *
1250 * TIMER_IMMEDIATE
1251 * The timer has fired at registration time, and the callout was never
1252 * dispatched.
1253 */
1254 #define TIMER_IDLE 0x0
1255 #define TIMER_ARMED 0x1
1256 #define TIMER_FIRED 0x2
1257 #define TIMER_IMMEDIATE 0x3
1258 #define TIMER_STATE_MASK 0x3
1259 #define TIMER_GEN_INC 0x4
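/*
 * Illustrative sketch of the kn_hook32 encoding above: the low two bits carry
 * the TIMER_* state, and everything above them is a generation counter bumped
 * by TIMER_GEN_INC on every re-arm. A late filt_timerexpire() callout armed
 * with an older value fails its compare-and-swap and is ignored.
 */
#if 0
static uint32_t
timer_rearm_state(uint32_t cur)
{
	/* the same transition filt_timerarm() performs: keep the generation
	 * bits, bump the generation, and set the state bits to ARMED */
	return (cur & ~TIMER_STATE_MASK) + TIMER_GEN_INC + TIMER_ARMED;
}

/* e.g. 0x0 (gen 0, IDLE)  -> 0x5 (gen 1, ARMED)
 *      0x6 (gen 1, FIRED) -> 0x9 (gen 2, ARMED) */
#endif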
1260
1261 static void
1262 filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
1263 {
1264 kn->kn_ext[0] = params->deadline;
1265 kn->kn_ext[1] = params->leeway;
1266 kn->kn_sdata = params->interval;
1267 }
1268
1269 /*
1270 * filt_timervalidate - process data from user
1271 *
1272 * Sets up the deadline, interval, and leeway from the provided user data
1273 *
1274 * Input:
1275 * kn_sdata timer deadline or interval time
1276 * kn_sfflags style of timer, unit of measurement
1277 *
1278 * Output:
1279 * struct filter_timer_params to apply to the filter with
1280 * filt_timer_set_params when changes are ready to be committed.
1281 *
1282 * Returns:
1283 * EINVAL Invalid user data parameters
1284 * ERANGE Various overflows with the parameters
1285 *
1286 * Called with timer filter lock held.
1287 */
1288 static int
1289 filt_timervalidate(const struct kevent_qos_s *kev,
1290 struct filt_timer_params *params)
1291 {
1292 /*
1293 * There are 5 knobs that need to be chosen for a timer registration:
1294 *
1295 * A) Units of time (what is the time duration of the specified number)
1296 * Absolute and interval take:
1297 * NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1298 * Defaults to milliseconds if not specified
1299 *
1300 * B) Clock epoch (what is the zero point of the specified number)
1301 * For interval, there is none
1302 * For absolute, defaults to the gettimeofday/calendar epoch
1303 * With NOTE_MACHTIME, uses mach_absolute_time()
1304 * With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1305 *
1306 * C) The knote's behavior on delivery
1307 * Interval timer causes the knote to arm for the next interval unless one-shot is set
1308 * Absolute is a forced one-shot timer which deletes on delivery
1309 * TODO: Add a way for absolute to be not forced one-shot
1310 *
1311 * D) Whether the time duration is relative to now or absolute
1312 * Interval fires at now + duration when it is set up
1313 * Absolute fires at now + difference between now walltime and passed in walltime
1314 * With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1315 *
1316 * E) Whether the timer continues to tick across sleep
1317 * By default all three do not.
1318 * For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1319 * With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1320 * expires when mach_continuous_time() is > the passed in value.
1321 */
1322
1323 uint64_t multiplier;
1324
1325 boolean_t use_abstime = FALSE;
1326
1327 switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
1328 case NOTE_SECONDS:
1329 multiplier = NSEC_PER_SEC;
1330 break;
1331 case NOTE_USECONDS:
1332 multiplier = NSEC_PER_USEC;
1333 break;
1334 case NOTE_NSECONDS:
1335 multiplier = 1;
1336 break;
1337 case NOTE_MACHTIME:
1338 multiplier = 0;
1339 use_abstime = TRUE;
1340 break;
1341 case 0: /* milliseconds (default) */
1342 multiplier = NSEC_PER_SEC / 1000;
1343 break;
1344 default:
1345 return EINVAL;
1346 }
1347
1348 /* transform the leeway in kn_ext[1] to same time scale */
1349 if (kev->fflags & NOTE_LEEWAY) {
1350 uint64_t leeway_abs;
1351
1352 if (use_abstime) {
1353 leeway_abs = (uint64_t)kev->ext[1];
1354 } else {
1355 uint64_t leeway_ns;
1356 if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
1357 return ERANGE;
1358 }
1359
1360 nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1361 }
1362
1363 params->leeway = leeway_abs;
1364 } else {
1365 params->leeway = 0;
1366 }
1367
1368 if (kev->fflags & NOTE_ABSOLUTE) {
1369 uint64_t deadline_abs;
1370
1371 if (use_abstime) {
1372 deadline_abs = (uint64_t)kev->data;
1373 } else {
1374 uint64_t calendar_deadline_ns;
1375
1376 if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
1377 return ERANGE;
1378 }
1379
1380 /* calendar_deadline_ns is in nanoseconds since the epoch */
1381
1382 clock_sec_t seconds;
1383 clock_nsec_t nanoseconds;
1384
1385 /*
1386 * Note that the conversion through wall-time is only done once.
1387 *
1388 * If the relationship between MAT and gettimeofday changes,
1389 * the underlying timer does not update.
1390 *
1391 * TODO: build a wall-time denominated timer_call queue
1392 * and a flag to request DTRTing with wall-time timers
1393 */
1394 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1395
1396 uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1397
1398 /* if deadline is in the future */
1399 if (calendar_now_ns < calendar_deadline_ns) {
1400 uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1401 uint64_t interval_abs;
1402
1403 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1404
1405 /*
1406 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1407 * causes the timer to keep ticking across sleep, but
1408 * it does not change the calendar timebase.
1409 */
1410
1411 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1412 clock_continuoustime_interval_to_deadline(interval_abs,
1413 &deadline_abs);
1414 } else {
1415 clock_absolutetime_interval_to_deadline(interval_abs,
1416 &deadline_abs);
1417 }
1418 } else {
1419 deadline_abs = 0; /* cause immediate expiration */
1420 }
1421 }
1422
1423 params->deadline = deadline_abs;
1424 params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
1425 } else if (kev->data < 0) {
1426 /*
1427 * Negative interval timers fire immediately, once.
1428 *
1429 * Ideally a negative interval would be an error, but certain clients
1430 * pass negative values by accident, and expect an event back.
1431 *
1432 * In the old implementation the timer would repeat with no delay
1433 * N times until mach_absolute_time() + (N * interval) underflowed,
1434 * then it would wait ~forever by accidentally arming a timer for the far future.
1435 *
1436 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1437 */
1438
1439 params->deadline = 0; /* expire immediately */
1440 params->interval = 0; /* non-repeating */
1441 } else {
1442 uint64_t interval_abs = 0;
1443
1444 if (use_abstime) {
1445 interval_abs = (uint64_t)kev->data;
1446 } else {
1447 uint64_t interval_ns;
1448 if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
1449 return ERANGE;
1450 }
1451
1452 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1453 }
1454
1455 uint64_t deadline = 0;
1456
1457 if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
1458 clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1459 } else {
1460 clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1461 }
1462
1463 params->deadline = deadline;
1464 params->interval = interval_abs;
1465 }
1466
1467 return 0;
1468 }
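/*
 * Illustrative sketch (hypothetical userspace client) of the NOTE_ABSOLUTE
 * path validated above: the deadline is expressed against the calendar
 * (gettimeofday) epoch in the unit selected by fflags, and is converted once
 * into a Mach-absolute deadline, so later wall-clock changes do not move it.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>

static void
arm_absolute_timer(int kq)
{
	struct timeval now;
	struct kevent kev;

	gettimeofday(&now, NULL);

	/* fire 30 wall-clock seconds from now; NOTE_ABSOLUTE implies
	 * EV_ONESHOT (see filt_timerattach()) */
	EV_SET(&kev, 1 /* arbitrary ident */, EVFILT_TIMER, EV_ADD,
	    NOTE_ABSOLUTE | NOTE_SECONDS, now.tv_sec + 30, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		err(1, "kevent");
	}
}
#endif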
1469
1470 /*
1471 * filt_timerexpire - the timer callout routine
1472 */
1473 static void
1474 filt_timerexpire(void *knx, void *state_on_arm)
1475 {
1476 struct knote *kn = knx;
1477
1478 uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
1479 uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;
1480
1481 if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
1482 // our f_event always would say FILTER_ACTIVE,
1483 // so be leaner and just do it.
1484 struct kqueue *kq = knote_get_kq(kn);
1485 kqlock(kq);
1486 knote_activate(kq, kn, FILTER_ACTIVE);
1487 kqunlock(kq);
1488 } else {
1489 /*
1490 * The timer has been reprogrammed or canceled since it was armed,
1491 * and this is a late firing for the timer, just ignore it.
1492 */
1493 }
1494 }
1495
1496 /*
1497 * Does this deadline need a timer armed for it, or has it expired?
1498 */
1499 static bool
1500 filt_timer_is_ready(struct knote *kn)
1501 {
1502 uint64_t now, deadline = kn->kn_ext[0];
1503
1504 if (deadline == 0) {
1505 return true;
1506 }
1507
1508 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1509 now = mach_continuous_time();
1510 } else {
1511 now = mach_absolute_time();
1512 }
1513 return deadline <= now;
1514 }
1515
1516 /*
1517 * Arm a timer
1518 *
1519 * It is the responsibility of the caller to make sure the timer call
1520 * has completed or been cancelled properly prior to arming it.
1521 */
1522 static void
1523 filt_timerarm(struct knote *kn)
1524 {
1525 uint64_t deadline = kn->kn_ext[0];
1526 uint64_t leeway = kn->kn_ext[1];
1527 uint32_t state;
1528
1529 int filter_flags = kn->kn_sfflags;
1530 unsigned int timer_flags = 0;
1531
1532 if (filter_flags & NOTE_CRITICAL) {
1533 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1534 } else if (filter_flags & NOTE_BACKGROUND) {
1535 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1536 } else {
1537 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1538 }
1539
1540 if (filter_flags & NOTE_LEEWAY) {
1541 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1542 }
1543
1544 if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
1545 timer_flags |= THREAD_CALL_CONTINUOUS;
1546 }
1547
1548 /*
1549 * Move to ARMED.
1550 *
1551 * We increase the gencount, and setup the thread call with this expected
1552 * state. It means that if there was a previous generation of the timer in
1553 * flight that needs to be ignored, then 3 things are possible:
1554 *
1555 * - the timer fires first, filt_timerexpire() runs and sets the state to FIRED
1556 * but we clobber it with ARMED and a new gencount. The knote will still
1557 * be activated, but filt_timerprocess() which is serialized with this
1558 * call will not see the FIRED bit set and will not deliver an event.
1559 *
1560 * - this code runs first, but filt_timerexpire() comes second. Because it
1561 * knows an old gencount, it will debounce and not activate the knote.
1562 *
1563 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
1564 * will just cancel it properly.
1565 *
1566 * This is important as userspace expects to never be woken up for past
1567 * timers after filt_timertouch ran.
1568 */
1569 state = os_atomic_load(&kn->kn_hook32, relaxed);
1570 state &= ~TIMER_STATE_MASK;
1571 state += TIMER_GEN_INC + TIMER_ARMED;
1572 os_atomic_store(&kn->kn_hook32, state, relaxed);
1573
1574 thread_call_enter_delayed_with_leeway(kn->kn_thcall,
1575 (void *)(uintptr_t)state, deadline, leeway, timer_flags);
1576 }
1577
1578 /*
1579 * Mark a timer as "already fired" when it is being reprogrammed
1580 *
1581 * If there is a timer in flight, this will do a best effort at canceling it,
1582 * but will not wait. If the thread call was in flight, having set the
1583 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
1584 * cancelation.
1585 */
1586 static void
1587 filt_timerfire_immediate(struct knote *kn)
1588 {
1589 uint32_t state;
1590
1591 static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
1592 "validate that this atomic or will transition to IMMEDIATE");
1593 state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1594
1595 if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
1596 thread_call_cancel(kn->kn_thcall);
1597 }
1598 }
1599
1600 /*
1601 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1602 */
1603 static int
1604 filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
1605 {
1606 thread_call_t callout;
1607 struct filt_timer_params params;
1608 int error;
1609
1610 if ((error = filt_timervalidate(kev, &params)) != 0) {
1611 knote_set_error(kn, error);
1612 return 0;
1613 }
1614
1615 callout = thread_call_allocate_with_options(filt_timerexpire,
1616 (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1617 THREAD_CALL_OPTIONS_ONCE);
1618
1619 if (NULL == callout) {
1620 knote_set_error(kn, ENOMEM);
1621 return 0;
1622 }
1623
1624 filt_timer_set_params(kn, &params);
1625 kn->kn_thcall = callout;
1626 kn->kn_flags |= EV_CLEAR;
1627 os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);
1628
1629 /* NOTE_ABSOLUTE implies EV_ONESHOT */
1630 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1631 kn->kn_flags |= EV_ONESHOT;
1632 }
1633
1634 if (filt_timer_is_ready(kn)) {
1635 os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
1636 return FILTER_ACTIVE;
1637 } else {
1638 filt_timerarm(kn);
1639 return 0;
1640 }
1641 }
1642
1643 /*
1644 * Shut down the timer if it's running, and free the callout.
1645 */
1646 static void
1647 filt_timerdetach(struct knote *kn)
1648 {
1649 __assert_only boolean_t freed;
1650
1651 /*
1652 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
1653 * running anymore.
1654 */
1655 thread_call_cancel_wait(kn->kn_thcall);
1656 freed = thread_call_free(kn->kn_thcall);
1657 assert(freed);
1658 }
1659
1660 /*
1661 * filt_timertouch - update timer knote with new user input
1662 *
1663 * Cancel and restart the timer based on new user data. When
1664 * the user picks up a knote, clear the count of how many timer
1665 * pops have gone off (in kn_data).
1666 */
1667 static int
1668 filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
1669 {
1670 struct filt_timer_params params;
1671 uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
1672 int error;
1673
1674 if (changed_flags & NOTE_ABSOLUTE) {
1675 kev->flags |= EV_ERROR;
1676 kev->data = EINVAL;
1677 return 0;
1678 }
1679
1680 if ((error = filt_timervalidate(kev, &params)) != 0) {
1681 kev->flags |= EV_ERROR;
1682 kev->data = error;
1683 return 0;
1684 }
1685
1686 /* capture the new values used to compute deadline */
1687 filt_timer_set_params(kn, &params);
1688 kn->kn_sfflags = kev->fflags;
1689
1690 if (filt_timer_is_ready(kn)) {
1691 filt_timerfire_immediate(kn);
1692 return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
1693 } else {
1694 filt_timerarm(kn);
1695 return FILTER_UPDATE_REQ_QOS;
1696 }
1697 }
1698
1699 /*
1700 * filt_timerprocess - query state of knote and snapshot event data
1701 *
1702 * Determine if the timer has fired in the past, snapshot the state
1703 * of the kevent for returning to user-space, and clear pending event
1704 * counters for the next time.
1705 */
1706 static int
1707 filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
1708 {
1709 uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);
1710
1711 /*
1712 * filt_timerprocess is serialized with any filter routine except for
1713 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
1714 * transition, and on success, activates the knote.
1715 *
1716 * Hence, we don't need atomic modifications of the state, only to peek at
1717 * whether we see any of the "FIRED" state, and if we do, it is safe to
1718 * do simple state machine transitions.
1719 */
1720 switch (state & TIMER_STATE_MASK) {
1721 case TIMER_IDLE:
1722 case TIMER_ARMED:
1723 /*
1724 * This can happen if a touch resets a timer that had fired
1725 * without being processed
1726 */
1727 return 0;
1728 }
1729
1730 os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);
1731
1732 /*
1733 * Copy out the interesting kevent state,
1734 * but don't leak out the raw time calculations.
1735 *
1736 * TODO: potential enhancements - tell the user about:
1737 * - deadline to which this timer thought it was expiring
1738 * - return kn_sfflags in the fflags field so the client can know
1739 * under what flags the timer fired
1740 */
1741 knote_fill_kevent(kn, kev, 1);
1742 kev->ext[0] = 0;
1743 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1744
1745 if (kn->kn_sdata != 0) {
1746 /*
1747 * This is a 'repeating' timer, so we have to emit
1748 * how many intervals expired between the arm
1749 * and the process.
1750 *
1751 * A very strange style of interface, because
1752 * this could easily be done in the client...
1753 */
1754
1755 uint64_t now;
1756
1757 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1758 now = mach_continuous_time();
1759 } else {
1760 now = mach_absolute_time();
1761 }
1762
1763 uint64_t first_deadline = kn->kn_ext[0];
1764 uint64_t interval_abs = kn->kn_sdata;
1765 uint64_t orig_arm_time = first_deadline - interval_abs;
1766
1767 assert(now > orig_arm_time);
1768 assert(now > first_deadline);
1769
1770 uint64_t elapsed = now - orig_arm_time;
1771
1772 uint64_t num_fired = elapsed / interval_abs;
1773
1774 /*
1775 * To reach this code, we must have seen the timer pop
1776 * and be in repeating mode, so therefore it must have been
1777 * more than 'interval' time since the attach or last
1778 * successful touch.
1779 */
1780 assert(num_fired > 0);
1781
1782 /* report how many intervals have elapsed to the user */
1783 kev->data = (int64_t)num_fired;
1784
1785 /* We only need to re-arm the timer if it's not about to be destroyed */
1786 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1787 /* fire at the end of the next interval */
1788 uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1789
1790 assert(new_deadline > now);
1791
1792 kn->kn_ext[0] = new_deadline;
1793
1794 /*
1795 * This can't shortcut setting up the thread call, because
1796 * knote_process deactivates EV_CLEAR knotes unconditionally.
1797 */
1798 filt_timerarm(kn);
1799 }
1800 }
1801
1802 return FILTER_ACTIVE;
1803 }
1804
1805 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1806 .f_extended_codes = true,
1807 .f_attach = filt_timerattach,
1808 .f_detach = filt_timerdetach,
1809 .f_event = filt_bad_event,
1810 .f_touch = filt_timertouch,
1811 .f_process = filt_timerprocess,
1812 };
1813
1814 #pragma mark user_filtops
1815
1816 static int
1817 filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
1818 {
1819 if (kn->kn_sfflags & NOTE_TRIGGER) {
1820 kn->kn_hook32 = FILTER_ACTIVE;
1821 } else {
1822 kn->kn_hook32 = 0;
1823 }
1824 return kn->kn_hook32;
1825 }
1826
1827 static int
1828 filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
1829 {
1830 uint32_t ffctrl;
1831 int fflags;
1832
1833 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1834 fflags = kev->fflags & NOTE_FFLAGSMASK;
1835 switch (ffctrl) {
1836 case NOTE_FFNOP:
1837 break;
1838 case NOTE_FFAND:
1839 kn->kn_sfflags &= fflags;
1840 break;
1841 case NOTE_FFOR:
1842 kn->kn_sfflags |= fflags;
1843 break;
1844 case NOTE_FFCOPY:
1845 kn->kn_sfflags = fflags;
1846 break;
1847 }
1848 kn->kn_sdata = kev->data;
1849
1850 if (kev->fflags & NOTE_TRIGGER) {
1851 kn->kn_hook32 = FILTER_ACTIVE;
1852 }
1853 return (int)kn->kn_hook32;
1854 }
1855
1856 static int
1857 filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
1858 {
1859 int result = (int)kn->kn_hook32;
1860
1861 if (result) {
1862 /* EVFILT_USER returns the data that was passed in */
1863 knote_fill_kevent_with_sdata(kn, kev);
1864 kev->fflags = kn->kn_sfflags;
1865 if (kn->kn_flags & EV_CLEAR) {
1866 /* knote_fill_kevent cleared kn_fflags */
1867 kn->kn_hook32 = 0;
1868 }
1869 }
1870
1871 return result;
1872 }
1873
1874 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
1875 .f_extended_codes = true,
1876 .f_attach = filt_userattach,
1877 .f_detach = filt_no_detach,
1878 .f_event = filt_bad_event,
1879 .f_touch = filt_usertouch,
1880 .f_process = filt_userprocess,
1881 };
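/*
 * Illustrative userspace sketch (editorial addition, not part of this file):
 * an EVFILT_USER round trip.  filt_userattach() leaves the knote inactive,
 * filt_usertouch() latches FILTER_ACTIVE when NOTE_TRIGGER comes in, and
 * filt_userprocess() delivers the event with the accumulated fflags.
 */
#if 0
#include <sys/event.h>
#include <assert.h>
#include <unistd.h>

static void
user_event_example(void)
{
	int kq = kqueue();
	struct kevent kev;

	/* attach an inactive user event */
	EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* trigger it, OR-ing an application-defined flag bit into fflags */
	EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* the event is now deliverable, carrying the accumulated flag bit */
	assert(kevent(kq, NULL, 0, &kev, 1, NULL) == 1);
	assert(kev.fflags & 0x1);

	close(kq);
}
#endif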
1882
1883 #pragma mark workloop_filtops
1884
1885 #define EPREEMPTDISABLED (-1)
1886
1887 static inline void
1888 filt_wllock(struct kqworkloop *kqwl)
1889 {
1890 lck_spin_lock(&kqwl->kqwl_statelock);
1891 }
1892
1893 static inline void
1894 filt_wlunlock(struct kqworkloop *kqwl)
1895 {
1896 lck_spin_unlock(&kqwl->kqwl_statelock);
1897 }
1898
1899 /*
1900 * Returns true when the interlock for the turnstile is the workqueue lock
1901 *
1902 * When this is the case, all turnstiles operations are delegated
1903 * to the workqueue subsystem.
1904 *
1905 * This is required because kqueue_threadreq_bind_prepost only holds the
1906 * workqueue lock but needs to move the inheritor from the workloop turnstile
1907 * away from the creator thread, so that this now fulfilled request cannot be
1908 * picked anymore by other threads.
1909 */
1910 static inline bool
1911 filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
1912 {
1913 return kqr_thread_requested_pending(&kqwl->kqwl_request);
1914 }
1915
1916 static void
1917 filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
1918 turnstile_update_flags_t flags)
1919 {
1920 turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
1921 workq_threadreq_t kqr = &kqwl->kqwl_request;
1922
1923 /*
1924 * binding to the workq should always happen through
1925 * workq_kern_threadreq_update_inheritor()
1926 */
1927 assert(!filt_wlturnstile_interlock_is_workq(kqwl));
1928
1929 if ((inheritor = kqwl->kqwl_owner)) {
1930 flags |= TURNSTILE_INHERITOR_THREAD;
1931 } else if ((inheritor = kqr_thread(kqr))) {
1932 flags |= TURNSTILE_INHERITOR_THREAD;
1933 }
1934
1935 turnstile_update_inheritor(ts, inheritor, flags);
1936 }
1937
1938 #define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
1939 #define FILT_WLATTACH 0
1940 #define FILT_WLTOUCH 1
1941 #define FILT_WLDROP 2
1942
1943 __result_use_check
1944 static int
1945 filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
1946 struct kevent_qos_s *kev, kq_index_t qos_index, int op)
1947 {
1948 user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
1949 workq_threadreq_t kqr = &kqwl->kqwl_request;
1950 thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
1951 kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
1952 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
1953 int action = KQWL_UTQ_NONE, error = 0;
1954 bool wl_inheritor_updated = false, needs_wake = false;
1955 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
1956 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
1957 uint64_t udata = 0;
1958 struct turnstile *ts = TURNSTILE_NULL;
1959
1960 filt_wllock(kqwl);
1961
1962 again:
1963 new_owner = cur_owner = kqwl->kqwl_owner;
1964
1965 /*
1966 * Phase 1:
1967 *
1968 * If asked, load the uint64 value at the user provided address and compare
1969 * it against the passed in mask and expected value.
1970 *
1971 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
1972 * a thread reference.
1973 *
1974 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
1975 * the current thread, then end ownership.
1976 *
1977 * Lastly decide whether we need to perform a QoS update.
1978 */
1979 if (uaddr) {
1980 /*
1981 * Until <rdar://problem/24999882> exists,
1982 * copyin with preemption disabled forces any
1983 * vm_fault we encounter to fail.
1984 */
1985 error = copyin_atomic64(uaddr, &udata);
1986
1987 /*
1988 * If we get EFAULT, drop locks, and retry.
1989 * If we still get an error report it,
1990 * else assume the memory has been faulted
1991 * and attempt to copyin under lock again.
1992 */
1993 switch (error) {
1994 case 0:
1995 break;
1996 case EFAULT:
1997 if (efault_retry-- > 0) {
1998 filt_wlunlock(kqwl);
1999 error = copyin_atomic64(uaddr, &udata);
2000 filt_wllock(kqwl);
2001 if (error == 0) {
2002 goto again;
2003 }
2004 }
2005 OS_FALLTHROUGH;
2006 default:
2007 goto out;
2008 }
2009
2010 /* Update state as copied in. */
2011 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2012
2013 if ((udata & mask) != (kdata & mask)) {
2014 error = ESTALE;
2015 } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
2016 /*
2017 * Decipher the owner port name, and translate accordingly.
2018 * The low 2 bits were borrowed for other flags, so mask them off.
2019 *
2020 * Then attempt translation to a thread reference or fail.
2021 */
2022 mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
2023 if (name != MACH_PORT_NULL) {
2024 name = ipc_entry_name_mask(name);
2025 extra_thread_ref = port_name_to_thread(name,
2026 PORT_TO_THREAD_IN_CURRENT_TASK);
2027 if (extra_thread_ref == THREAD_NULL) {
2028 error = EOWNERDEAD;
2029 goto out;
2030 }
2031 new_owner = extra_thread_ref;
2032 }
2033 }
2034 }
2035
2036 if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
2037 new_owner = THREAD_NULL;
2038 }
2039
2040 if (error == 0) {
2041 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
2042 action = KQWL_UTQ_SET_QOS_INDEX;
2043 } else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
2044 action = KQWL_UTQ_SET_QOS_INDEX;
2045 }
2046
2047 if (op == FILT_WLTOUCH) {
2048 /*
2049 * Save off any additional fflags/data we just accepted,
2050 * but only keep the last round of "update" bits we acted on,
2051 * which helps a lot when debugging.
2052 */
2053 kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2054 kn->kn_sfflags |= kev->fflags;
2055 if (kev->fflags & NOTE_WL_SYNC_WAKE) {
2056 needs_wake = (kn->kn_thread != THREAD_NULL);
2057 }
2058 } else if (op == FILT_WLDROP) {
2059 if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2060 NOTE_WL_SYNC_WAIT) {
2061 /*
2062 * When deleting a SYNC_WAIT knote that hasn't been woken up
2063 * explicitly, issue a wake up.
2064 */
2065 kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2066 needs_wake = (kn->kn_thread != THREAD_NULL);
2067 }
2068 }
2069 }
2070
2071 /*
2072 * Phase 2:
2073 *
2074 * Commit ownership and QoS changes if any, possibly wake up waiters
2075 */
2076
2077 if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
2078 goto out;
2079 }
2080
2081 kqlock(kqwl);
2082
2083 /* If already tracked as servicer, don't track as owner */
2084 if (new_owner == kqr_thread(kqr)) {
2085 new_owner = THREAD_NULL;
2086 }
2087
2088 if (cur_owner != new_owner) {
2089 kqwl->kqwl_owner = new_owner;
2090 if (new_owner == extra_thread_ref) {
2091 /* we just transferred this ref to kqwl_owner */
2092 extra_thread_ref = THREAD_NULL;
2093 }
2094 cur_override = kqworkloop_override(kqwl);
2095
2096 if (new_owner) {
2097 /* override it before we drop the old */
2098 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2099 thread_add_kevent_override(new_owner, cur_override);
2100 }
2101 if (kqr_thread_requested_pending(kqr)) {
2102 if (action == KQWL_UTQ_NONE) {
2103 action = KQWL_UTQ_REDRIVE_EVENTS;
2104 }
2105 }
2106 } else {
2107 if (!kqr_thread_requested(kqr) && kqr->tr_kq_wakeup) {
2108 if (action == KQWL_UTQ_NONE) {
2109 action = KQWL_UTQ_REDRIVE_EVENTS;
2110 }
2111 }
2112 }
2113 }
2114
2115 if (action != KQWL_UTQ_NONE) {
2116 kqworkloop_update_threads_qos(kqwl, action, qos_index);
2117 }
2118
2119 ts = kqwl->kqwl_turnstile;
2120 if (cur_owner != new_owner && ts) {
2121 if (action == KQWL_UTQ_REDRIVE_EVENTS) {
2122 /*
2123 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
2124 * the code went through workq_kern_threadreq_initiate()
2125 * and the workqueue has set the inheritor already
2126 */
2127 assert(filt_wlturnstile_interlock_is_workq(kqwl));
2128 } else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2129 workq_kern_threadreq_lock(kqwl->kqwl_p);
2130 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
2131 ts, TURNSTILE_IMMEDIATE_UPDATE);
2132 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2133 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2134 /*
2135 * If the workq is no longer the interlock, then
2136 * workq_kern_threadreq_update_inheritor() has finished a bind
2137 * and we need to fallback to the regular path.
2138 */
2139 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2140 }
2141 wl_inheritor_updated = true;
2142 } else {
2143 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
2144 wl_inheritor_updated = true;
2145 }
2146
2147 /*
2148 * We need a turnstile reference because we are dropping the interlock
2149 * and the caller has not called turnstile_prepare.
2150 */
2151 if (wl_inheritor_updated) {
2152 turnstile_reference(ts);
2153 }
2154 }
2155
2156 if (needs_wake && ts) {
2157 waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
2158 kn->kn_thread, THREAD_AWAKENED);
2159 if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
2160 disable_preemption();
2161 error = EPREEMPTDISABLED;
2162 }
2163 }
2164
2165 kqunlock(kqwl);
2166
2167 out:
2168 /*
2169 * Phase 3:
2170 *
2171 * Unlock and cleanup various lingering references and things.
2172 */
2173 filt_wlunlock(kqwl);
2174
2175 #if CONFIG_WORKLOOP_DEBUG
2176 KQWL_HISTORY_WRITE_ENTRY(kqwl, {
2177 .updater = current_thread(),
2178 .servicer = kqr_thread(kqr), /* Note: racy */
2179 .old_owner = cur_owner,
2180 .new_owner = new_owner,
2181
2182 .kev_ident = kev->ident,
2183 .error = (int16_t)error,
2184 .kev_flags = kev->flags,
2185 .kev_fflags = kev->fflags,
2186
2187 .kev_mask = mask,
2188 .kev_value = kdata,
2189 .in_value = udata,
2190 });
2191 #endif // CONFIG_WORKLOOP_DEBUG
2192
2193 if (wl_inheritor_updated) {
2194 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
2195 turnstile_deallocate_safe(ts);
2196 }
2197
2198 if (cur_owner && new_owner != cur_owner) {
2199 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2200 thread_drop_kevent_override(cur_owner);
2201 }
2202 thread_deallocate_safe(cur_owner);
2203 }
2204 if (extra_thread_ref) {
2205 thread_deallocate_safe(extra_thread_ref);
2206 }
2207 return error;
2208 }
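/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the copyin-under-spinlock pattern used by filt_wlupdate() above and
 * filt_wlupdate_sync_ipc() below.  copyin may not fault while the spinlock
 * is held, so on EFAULT the lock is dropped, the copy is retried to fault
 * the page in, and the whole sequence restarts.  lock_t, lock(), unlock()
 * and copy_from_user() are hypothetical stand-ins.
 */
#if 0
#include <errno.h>
#include <stdint.h>

typedef struct lock lock_t;
extern void lock(lock_t *);
extern void unlock(lock_t *);
extern int copy_from_user(const uint64_t *uaddr, uint64_t *out);

static int
copy_with_retry(lock_t *lck, const uint64_t *uaddr, uint64_t *out, int retries)
{
	int error;

	lock(lck);
again:
	error = copy_from_user(uaddr, out);	/* may fail: cannot fault under the lock */
	if (error == EFAULT && retries-- > 0) {
		unlock(lck);			/* let the fault be serviced */
		error = copy_from_user(uaddr, out);
		lock(lck);
		if (error == 0) {
			goto again;		/* state may have changed: re-validate */
		}
	}
	unlock(lck);
	return error;
}
#endif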
2209
2210 /*
2211 * Remembers the last update that came in from userspace for debugging reasons.
2212 * - fflags is mirrored from the userspace kevent
2213 * - ext[i, i != VALUE] is mirrored from the userspace kevent
2214 * - ext[VALUE] is set to what the kernel loaded atomically
2215 * - data is set to the error if any
2216 */
2217 static inline void
2218 filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
2219 int error)
2220 {
2221 kn->kn_fflags = kev->fflags;
2222 kn->kn_sdata = error;
2223 memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2224 }
2225
2226 static int
2227 filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
2228 struct kevent_qos_s *kev, int op)
2229 {
2230 user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
2231 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2232 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2233 uint64_t udata = 0;
2234 int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;
2235 int error = 0;
2236
2237 if (op == FILT_WLATTACH) {
2238 (void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
2239 } else if (uaddr == 0) {
2240 return 0;
2241 }
2242
2243 filt_wllock(kqwl);
2244
2245 again:
2246
2247 /*
2248 * Do the debounce thing, the lock serializing the state is the knote lock.
2249 */
2250 if (uaddr) {
2251 /*
2252 * Until <rdar://problem/24999882> exists,
2253 * copyin with preemption disabled forces any
2254 * vm_fault we encounter to fail.
2255 */
2256 error = copyin_atomic64(uaddr, &udata);
2257
2258 /*
2259 * If we get EFAULT, drop locks, and retry.
2260 * If we still get an error report it,
2261 * else assume the memory has been faulted
2262 * and attempt to copyin under lock again.
2263 */
2264 switch (error) {
2265 case 0:
2266 break;
2267 case EFAULT:
2268 if (efault_retry-- > 0) {
2269 filt_wlunlock(kqwl);
2270 error = copyin_atomic64(uaddr, &udata);
2271 filt_wllock(kqwl);
2272 if (error == 0) {
2273 goto again;
2274 }
2275 }
2276 OS_FALLTHROUGH;
2277 default:
2278 goto out;
2279 }
2280
2281 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2282 kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;
2283
2284 if ((udata & mask) != (kdata & mask)) {
2285 error = ESTALE;
2286 goto out;
2287 }
2288 }
2289
2290 if (op == FILT_WLATTACH) {
2291 error = filt_wlattach_sync_ipc(kn);
2292 if (error == 0) {
2293 disable_preemption();
2294 error = EPREEMPTDISABLED;
2295 }
2296 }
2297
2298 out:
2299 filt_wlunlock(kqwl);
2300 return error;
2301 }
2302
2303 static int
2304 filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
2305 {
2306 struct kqueue *kq = knote_get_kq(kn);
2307 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2308 int error = 0, result = 0;
2309 kq_index_t qos_index = 0;
2310
2311 if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {
2312 error = ENOTSUP;
2313 goto out;
2314 }
2315
2316 uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2317 switch (command) {
2318 case NOTE_WL_THREAD_REQUEST:
2319 if (kn->kn_id != kqwl->kqwl_dynamicid) {
2320 error = EINVAL;
2321 goto out;
2322 }
2323 qos_index = _pthread_priority_thread_qos(kn->kn_qos);
2324 if (qos_index == THREAD_QOS_UNSPECIFIED) {
2325 error = ERANGE;
2326 goto out;
2327 }
2328 if (kqwl->kqwl_request.tr_kq_qos_index) {
2329 /*
2330 * There already is a thread request, and well, you're only allowed
2331 * one per workloop, so fail the attach.
2332 */
2333 error = EALREADY;
2334 goto out;
2335 }
2336 break;
2337 case NOTE_WL_SYNC_WAIT:
2338 case NOTE_WL_SYNC_WAKE:
2339 if (kn->kn_id == kqwl->kqwl_dynamicid) {
2340 error = EINVAL;
2341 goto out;
2342 }
2343 if ((kn->kn_flags & EV_DISABLE) == 0) {
2344 error = EINVAL;
2345 goto out;
2346 }
2347 if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2348 error = EINVAL;
2349 goto out;
2350 }
2351 break;
2352
2353 case NOTE_WL_SYNC_IPC:
2354 if ((kn->kn_flags & EV_DISABLE) == 0) {
2355 error = EINVAL;
2356 goto out;
2357 }
2358 if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {
2359 error = EINVAL;
2360 goto out;
2361 }
2362 break;
2363 default:
2364 error = EINVAL;
2365 goto out;
2366 }
2367
2368 if (command == NOTE_WL_SYNC_IPC) {
2369 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
2370 } else {
2371 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
2372 }
2373
2374 if (error == EPREEMPTDISABLED) {
2375 error = 0;
2376 result = FILTER_THREADREQ_NODEFEER;
2377 }
2378 out:
2379 if (error) {
2380 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2381 if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2382 error = 0;
2383 }
2384 knote_set_error(kn, error);
2385 return result;
2386 }
2387 if (command == NOTE_WL_SYNC_WAIT) {
2388 return kevent_register_wait_prepare(kn, kev, result);
2389 }
2390 /* Just attaching the thread request successfully will fire it */
2391 if (command == NOTE_WL_THREAD_REQUEST) {
2392 /*
2393 * Thread Request knotes need an explicit touch to be active again,
2394 * so delivering an event needs to also consume it.
2395 */
2396 kn->kn_flags |= EV_CLEAR;
2397 return result | FILTER_ACTIVE;
2398 }
2399 return result;
2400 }
2401
2402 static void __dead2
2403 filt_wlwait_continue(void *parameter, wait_result_t wr)
2404 {
2405 struct _kevent_register *cont_args = parameter;
2406 struct kqworkloop *kqwl = cont_args->kqwl;
2407
2408 kqlock(kqwl);
2409 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2410 workq_kern_threadreq_lock(kqwl->kqwl_p);
2411 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2412 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2413 } else {
2414 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
2415 }
2416 kqunlock(kqwl);
2417
2418 turnstile_cleanup();
2419
2420 if (wr == THREAD_INTERRUPTED) {
2421 cont_args->kev.flags |= EV_ERROR;
2422 cont_args->kev.data = EINTR;
2423 } else if (wr != THREAD_AWAKENED) {
2424 panic("Unexpected wait result: %d", wr);
2425 }
2426
2427 kevent_register_wait_return(cont_args);
2428 }
2429
2430 /*
2431 * Called with the workloop mutex held. Most of the time this never returns,
2432 * as it calls filt_wlwait_continue through a continuation.
2433 */
2434 static void __dead2
2435 filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
2436 struct _kevent_register *cont_args)
2437 {
2438 struct kqworkloop *kqwl = cont_args->kqwl;
2439 workq_threadreq_t kqr = &kqwl->kqwl_request;
2440 struct turnstile *ts;
2441 bool workq_locked = false;
2442
2443 kqlock_held(kqwl);
2444
2445 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
2446 workq_kern_threadreq_lock(kqwl->kqwl_p);
2447 workq_locked = true;
2448 }
2449
2450 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
2451 TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
2452
2453 if (workq_locked) {
2454 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
2455 &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
2456 TURNSTILE_DELAYED_UPDATE);
2457 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
2458 /*
2459 * if the interlock is no longer the workqueue lock,
2460 * then we don't need to hold it anymore.
2461 */
2462 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2463 workq_locked = false;
2464 }
2465 }
2466 if (!workq_locked) {
2467 /*
2468 * If the interlock is the workloop's, then it's our responsibility to
2469 * call update_inheritor, so just do it.
2470 */
2471 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
2472 }
2473
2474 thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
2475 waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
2476 THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
2477
2478 if (workq_locked) {
2479 workq_kern_threadreq_unlock(kqwl->kqwl_p);
2480 }
2481
2482 thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
2483 if (thread) {
2484 thread_reference(thread);
2485 }
2486
2487 kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
2488 }
2489
2490 /* called in stackshot context to report the thread responsible for blocking this thread */
2491 void
2492 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2493 event64_t event, thread_waitinfo_t *waitinfo)
2494 {
2495 extern zone_t thread_zone;
2496 struct knote *kn = (struct knote *)event;
2497
2498 zone_require(knote_zone, kn);
2499
2500 assert(kn->kn_thread == thread);
2501
2502 struct kqueue *kq = knote_get_kq(kn);
2503
2504 zone_require(kqworkloop_zone, kq);
2505 assert(kq->kq_state & KQ_WORKLOOP);
2506
2507 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2508 workq_threadreq_t kqr = &kqwl->kqwl_request;
2509
2510 thread_t kqwl_owner = kqwl->kqwl_owner;
2511
2512 if (kqwl_owner != THREAD_NULL) {
2513 zone_require(thread_zone, kqwl_owner);
2514 waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2515 } else if (kqr_thread_requested_pending(kqr)) {
2516 waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2517 } else if (kqr->tr_state >= WORKQ_TR_STATE_BINDING) {
2518 zone_require(thread_zone, kqr->tr_thread);
2519 waitinfo->owner = thread_tid(kqr->tr_thread);
2520 } else {
2521 waitinfo->owner = 0;
2522 }
2523
2524 waitinfo->context = kqwl->kqwl_dynamicid;
2525 }
2526
2527 static void
2528 filt_wldetach(struct knote *kn)
2529 {
2530 if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
2531 filt_wldetach_sync_ipc(kn);
2532 } else if (kn->kn_thread) {
2533 kevent_register_wait_cleanup(kn);
2534 }
2535 }
2536
2537 static int
2538 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
2539 thread_qos_t *qos_index)
2540 {
2541 uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2542 uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2543
2544 if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
2545 return EINVAL;
2546 }
2547 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2548 if (kev->flags & EV_DELETE) {
2549 return EINVAL;
2550 }
2551 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2552 return EINVAL;
2553 }
2554 if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
2555 return ERANGE;
2556 }
2557 }
2558
2559 switch (new_commands) {
2560 case NOTE_WL_THREAD_REQUEST:
2561 /* thread requests can only update themselves */
2562 if (sav_commands != NOTE_WL_THREAD_REQUEST) {
2563 return EINVAL;
2564 }
2565 break;
2566
2567 case NOTE_WL_SYNC_WAIT:
2568 if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
2569 return EINVAL;
2570 }
2571 goto sync_checks;
2572
2573 case NOTE_WL_SYNC_WAKE:
2574 sync_checks:
2575 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
2576 return EINVAL;
2577 }
2578 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2579 return EINVAL;
2580 }
2581 break;
2582
2583 case NOTE_WL_SYNC_IPC:
2584 if (sav_commands != NOTE_WL_SYNC_IPC) {
2585 return EINVAL;
2586 }
2587 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
2588 return EINVAL;
2589 }
2590 break;
2591
2592 default:
2593 return EINVAL;
2594 }
2595 return 0;
2596 }
2597
2598 static int
2599 filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
2600 {
2601 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2602 thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
2603 int result = 0;
2604
2605 int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
2606 if (error) {
2607 goto out;
2608 }
2609
2610 uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2611 if (command == NOTE_WL_SYNC_IPC) {
2612 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
2613 } else {
2614 error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
2615 filt_wlremember_last_update(kn, kev, error);
2616 }
2617 if (error == EPREEMPTDISABLED) {
2618 error = 0;
2619 result = FILTER_THREADREQ_NODEFEER;
2620 }
2621
2622 out:
2623 if (error) {
2624 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2625 /* If userland wants ESTALE to be hidden, do not activate */
2626 return result;
2627 }
2628 kev->flags |= EV_ERROR;
2629 kev->data = error;
2630 return result;
2631 }
2632 if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
2633 return kevent_register_wait_prepare(kn, kev, result);
2634 }
2635 /* Just touching the thread request successfully will fire it */
2636 if (command == NOTE_WL_THREAD_REQUEST) {
2637 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2638 result |= FILTER_UPDATE_REQ_QOS;
2639 }
2640 result |= FILTER_ACTIVE;
2641 }
2642 return result;
2643 }
2644
2645 static bool
2646 filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
2647 {
2648 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2649
2650 int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
2651 if (error) {
2652 goto out;
2653 }
2654
2655 uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
2656 if (command == NOTE_WL_SYNC_IPC) {
2657 error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
2658 } else {
2659 error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
2660 filt_wlremember_last_update(kn, kev, error);
2661 }
2662 assert(error != EPREEMPTDISABLED);
2663
2664 out:
2665 if (error) {
2666 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2667 return false;
2668 }
2669 kev->flags |= EV_ERROR;
2670 kev->data = error;
2671 return false;
2672 }
2673 return true;
2674 }
2675
2676 static int
2677 filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
2678 {
2679 struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
2680 int rc = 0;
2681
2682 assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2683
2684 kqlock(kqwl);
2685
2686 if (kqwl->kqwl_owner) {
2687 /*
2688 * <rdar://problem/33584321> userspace can sometimes cause the thread
2689 * request knote to be processed when events are delivered without
2690 * triggering a drain session.
2691 *
2692 * When that happens, the automatic deactivation due to process
2693 * would swallow the event, so we have to activate the knote again.
2694 */
2695 knote_activate(kqwl, kn, FILTER_ACTIVE);
2696 } else {
2697 #if DEBUG || DEVELOPMENT
2698 if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
2699 /*
2700 * see src/queue_internal.h in libdispatch
2701 */
2702 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
2703 user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2704 task_t t = current_task();
2705 uint64_t val;
2706 if (addr && task_is_active(t) && !task_is_halting(t) &&
2707 copyin_atomic64(addr, &val) == 0 &&
2708 val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
2709 (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
2710 panic("kevent: workloop %#016llx is not enqueued "
2711 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2712 kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2713 }
2714 }
2715 #endif
2716 knote_fill_kevent(kn, kev, 0);
2717 kev->fflags = kn->kn_sfflags;
2718 rc |= FILTER_ACTIVE;
2719 }
2720
2721 kqunlock(kqwl);
2722
2723 if (rc & FILTER_ACTIVE) {
2724 workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
2725 }
2726 return rc;
2727 }
2728
2729 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
2730 .f_extended_codes = true,
2731 .f_attach = filt_wlattach,
2732 .f_detach = filt_wldetach,
2733 .f_event = filt_bad_event,
2734 .f_touch = filt_wltouch,
2735 .f_process = filt_wlprocess,
2736 .f_allow_drop = filt_wlallow_drop,
2737 .f_post_register_wait = filt_wlpost_register_wait,
2738 };
2739
2740 #pragma mark - kqueues allocation and deallocation
2741
2742 /*!
2743 * @enum kqworkloop_dealloc_flags_t
2744 *
2745 * @brief
2746 * Flags that alter kqworkloop_dealloc() behavior.
2747 *
2748 * @const KQWL_DEALLOC_NONE
2749 * Convenient name for "no flags".
2750 *
2751 * @const KQWL_DEALLOC_SKIP_HASH_REMOVE
2752 * Do not remove the workloop from the hash table.
2753 * This is used for process tear-down codepaths as the workloops have been
2754 * removed by the caller already.
2755 */
2756 OS_OPTIONS(kqworkloop_dealloc_flags, unsigned,
2757 KQWL_DEALLOC_NONE = 0x0000,
2758 KQWL_DEALLOC_SKIP_HASH_REMOVE = 0x0001,
2759 );
2760
2761 static void
2762 kqworkloop_dealloc(struct kqworkloop *, kqworkloop_dealloc_flags_t, uint32_t);
2763
2764 OS_NOINLINE OS_COLD OS_NORETURN
2765 static void
2766 kqworkloop_retain_panic(struct kqworkloop *kqwl, uint32_t previous)
2767 {
2768 if (previous == 0) {
2769 panic("kq(%p) resurrection", kqwl);
2770 } else {
2771 panic("kq(%p) retain overflow", kqwl);
2772 }
2773 }
2774
2775 OS_NOINLINE OS_COLD OS_NORETURN
2776 static void
2777 kqworkloop_release_panic(struct kqworkloop *kqwl)
2778 {
2779 panic("kq(%p) over-release", kqwl);
2780 }
2781
2782 OS_ALWAYS_INLINE
2783 static inline bool
2784 kqworkloop_try_retain(struct kqworkloop *kqwl)
2785 {
2786 uint32_t old_ref, new_ref;
2787 os_atomic_rmw_loop(&kqwl->kqwl_retains, old_ref, new_ref, relaxed, {
2788 if (__improbable(old_ref == 0)) {
2789 os_atomic_rmw_loop_give_up(return false);
2790 }
2791 if (__improbable(old_ref >= KQ_WORKLOOP_RETAINS_MAX)) {
2792 kqworkloop_retain_panic(kqwl, old_ref);
2793 }
2794 new_ref = old_ref + 1;
2795 });
2796 return true;
2797 }
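/*
 * Illustrative sketch (editorial addition, not part of this file): a plain
 * C11 rendering of the try-retain loop above.  The compare-exchange refuses
 * to resurrect an object whose refcount already reached zero, which is what
 * lets hash lookups race safely with the final release.  REF_MAX is a
 * hypothetical saturation bound standing in for KQ_WORKLOOP_RETAINS_MAX.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define REF_MAX 0x00ffffffu

static bool
try_retain(_Atomic uint32_t *refs)
{
	uint32_t old = atomic_load_explicit(refs, memory_order_relaxed);

	do {
		if (old == 0) {
			return false;		/* object is already being destroyed */
		}
		if (old >= REF_MAX) {
			abort();		/* refcount overflow / corruption */
		}
	} while (!atomic_compare_exchange_weak_explicit(refs, &old, old + 1,
	    memory_order_relaxed, memory_order_relaxed));
	return true;
}
#endif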
2798
2799 OS_ALWAYS_INLINE
2800 static inline void
2801 kqworkloop_retain(struct kqworkloop *kqwl)
2802 {
2803 uint32_t previous = os_atomic_inc_orig(&kqwl->kqwl_retains, relaxed);
2804 if (__improbable(previous == 0 || previous >= KQ_WORKLOOP_RETAINS_MAX)) {
2805 kqworkloop_retain_panic(kqwl, previous);
2806 }
2807 }
2808
2809 OS_ALWAYS_INLINE
2810 static inline void
2811 kqueue_retain(kqueue_t kqu)
2812 {
2813 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2814 kqworkloop_retain(kqu.kqwl);
2815 }
2816 }
2817
2818 OS_ALWAYS_INLINE
2819 static inline void
2820 kqworkloop_release_live(struct kqworkloop *kqwl)
2821 {
2822 uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
2823 if (__improbable(refs <= 1)) {
2824 kqworkloop_release_panic(kqwl);
2825 }
2826 }
2827
2828 OS_ALWAYS_INLINE
2829 static inline void
2830 kqueue_release_live(kqueue_t kqu)
2831 {
2832 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2833 kqworkloop_release_live(kqu.kqwl);
2834 }
2835 }
2836
2837 OS_ALWAYS_INLINE
2838 static inline void
2839 kqworkloop_release(struct kqworkloop *kqwl)
2840 {
2841 uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
2842
2843 if (__improbable(refs <= 1)) {
2844 kqworkloop_dealloc(kqwl, KQWL_DEALLOC_NONE, refs - 1);
2845 }
2846 }
2847
2848 OS_ALWAYS_INLINE
2849 static inline void
2850 kqueue_release(kqueue_t kqu)
2851 {
2852 if (kqu.kq->kq_state & KQ_DYNAMIC) {
2853 kqworkloop_release(kqu.kqwl);
2854 }
2855 }
2856
2857 /*!
2858 * @function kqueue_destroy
2859 *
2860 * @brief
2861 * Common part to all kqueue dealloc functions.
2862 */
2863 OS_NOINLINE
2864 static void
2865 kqueue_destroy(kqueue_t kqu, zone_t zone)
2866 {
2867 /*
2868 * waitq_set_deinit() removes the KQ's waitq set from
2869 * any select sets to which it may belong.
2870 *
2871 * The order of these deinits matters: before waitq_set_deinit() returns,
2872 * waitq_set__CALLING_PREPOST_HOOK__ may be called and it will take the
2873 * kq_lock.
2874 */
2875 waitq_set_deinit(&kqu.kq->kq_wqs);
2876 lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);
2877
2878 zfree(zone, kqu.kq);
2879 }
2880
2881 /*!
2882 * @function kqueue_init
2883 *
2884 * @brief
2885 * Common part to all kqueue alloc functions.
2886 */
2887 static kqueue_t
2888 kqueue_init(kqueue_t kqu, waitq_set_prepost_hook_t *hook, int policy)
2889 {
2890 waitq_set_init(&kqu.kq->kq_wqs, policy, NULL, hook);
2891 lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
2892 return kqu;
2893 }
2894
2895 #pragma mark kqfile allocation and deallocation
2896
2897 /*!
2898 * @function kqueue_dealloc
2899 *
2900 * @brief
2901 * Detach all knotes from a kqfile and free it.
2902 *
2903 * @discussion
2904 * We walk each list looking for knotes referencing
2905 * this kqueue. If we find one, we try to drop it. But
2906 * if we fail to get a drop reference, that will wait
2907 * until it is dropped. So, we can just restart,
2908 * safe in the assumption that the list will eventually
2909 * not contain any more references to this kqueue (either
2910 * we dropped them all, or someone else did).
2911 *
2912 * Assumes no new events are being added to the kqueue.
2913 * Nothing locked on entry or exit.
2914 */
2915 void
2916 kqueue_dealloc(struct kqueue *kq)
2917 {
2918 KNOTE_LOCK_CTX(knlc);
2919 struct proc *p = kq->kq_p;
2920 struct filedesc *fdp = p->p_fd;
2921 struct knote *kn;
2922
2923 assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);
2924
2925 proc_fdlock(p);
2926 for (int i = 0; i < fdp->fd_knlistsize; i++) {
2927 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2928 while (kn != NULL) {
2929 if (kq == knote_get_kq(kn)) {
2930 kqlock(kq);
2931 proc_fdunlock(p);
2932 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2933 knote_drop(kq, kn, &knlc);
2934 }
2935 proc_fdlock(p);
2936 /* start over at beginning of list */
2937 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
2938 continue;
2939 }
2940 kn = SLIST_NEXT(kn, kn_link);
2941 }
2942 }
2943
2944 knhash_lock(fdp);
2945 proc_fdunlock(p);
2946
2947 if (fdp->fd_knhashmask != 0) {
2948 for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
2949 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2950 while (kn != NULL) {
2951 if (kq == knote_get_kq(kn)) {
2952 kqlock(kq);
2953 knhash_unlock(fdp);
2954 if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
2955 knote_drop(kq, kn, &knlc);
2956 }
2957 knhash_lock(fdp);
2958 /* start over at beginning of list */
2959 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
2960 continue;
2961 }
2962 kn = SLIST_NEXT(kn, kn_link);
2963 }
2964 }
2965 }
2966 knhash_unlock(fdp);
2967
2968 kqueue_destroy(kq, kqfile_zone);
2969 }
2970
2971 /*!
2972 * @function kqueue_alloc
2973 *
2974 * @brief
2975 * Allocate a kqfile.
2976 */
2977 struct kqueue *
2978 kqueue_alloc(struct proc *p)
2979 {
2980 struct kqfile *kqf;
2981
2982 /*
2983 * kqfiles are created with kqueue() so we need to wait for
2984 * the first kevent syscall to know which bit among
2985 * KQ_KEV_{32,64,QOS} will be set in kqf_state
2986 */
2987 kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
2988 kqf->kqf_p = p;
2989 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
2990 TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);
2991
2992 return kqueue_init(kqf, NULL, SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST).kq;
2993 }
2994
2995 /*!
2996 * @function kqueue_internal
2997 *
2998 * @brief
2999 * Core implementation for kqueue() and guarded_kqueue_np()
3000 */
3001 int
3002 kqueue_internal(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
3003 {
3004 struct kqueue *kq;
3005 struct fileproc *fp;
3006 int fd, error;
3007
3008 error = falloc_withalloc(p, &fp, &fd, vfs_context_current(), fp_zalloc, cra);
3009 if (error) {
3010 return error;
3011 }
3012
3013 kq = kqueue_alloc(p);
3014 if (kq == NULL) {
3015 fp_free(p, fd, fp);
3016 return ENOMEM;
3017 }
3018
3019 fp->f_flag = FREAD | FWRITE;
3020 fp->f_ops = &kqueueops;
3021 fp->f_data = kq;
3022 fp->f_lflags |= FG_CONFINED;
3023
3024 proc_fdlock(p);
3025 *fdflags(p, fd) |= UF_EXCLOSE | UF_FORKCLOSE;
3026 procfdtbl_releasefd(p, fd, NULL);
3027 fp_drop(p, fd, fp, 1);
3028 proc_fdunlock(p);
3029
3030 *retval = fd;
3031 return error;
3032 }
3033
3034 /*!
3035 * @function kqueue
3036 *
3037 * @brief
3038 * The kqueue syscall.
3039 */
3040 int
3041 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3042 {
3043 return kqueue_internal(p, fileproc_alloc_init, NULL, retval);
3044 }
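/*
 * Illustrative userspace sketch (editorial addition, not part of this file):
 * minimal use of the kqueue() syscall implemented above.  The returned
 * descriptor is close-on-exec and not inherited across fork, reflecting the
 * UF_EXCLOSE | UF_FORKCLOSE flags set in kqueue_internal(), and is then used
 * to wait for readability on a pipe.
 */
#if 0
#include <sys/event.h>
#include <assert.h>
#include <unistd.h>

static void
kqueue_example(void)
{
	int fds[2], kq = kqueue();
	struct kevent kev;

	assert(kq >= 0 && pipe(fds) == 0);
	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* register interest */

	(void)write(fds[1], "x", 1);
	assert(kevent(kq, NULL, 0, &kev, 1, NULL) == 1);
	assert((int)kev.ident == fds[0] && kev.data == 1);	/* 1 byte readable */

	close(fds[0]);
	close(fds[1]);
	close(kq);
}
#endif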
3045
3046 #pragma mark kqworkq allocation and deallocation
3047
3048 /*!
3049 * @function kqworkq_dealloc
3050 *
3051 * @brief
3052 * Deallocates a workqueue kqueue.
3053 *
3054 * @discussion
3055 * This only happens at process death, or for races with concurrent
3056 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
3057 * this kqueue: either there are none, or someone else took care of them.
3058 */
3059 void
3060 kqworkq_dealloc(struct kqworkq *kqwq)
3061 {
3062 kqueue_destroy(kqwq, kqworkq_zone);
3063 }
3064
3065 /*!
3066 * @function kqworkq_alloc
3067 *
3068 * @brief
3069 * Allocates a workqueue kqueue.
3070 *
3071 * @discussion
3072 * This is the slow path of kevent_get_kqwq.
3073 * This takes care of making sure procs have a single workq kqueue.
3074 */
3075 OS_NOINLINE
3076 static struct kqworkq *
3077 kqworkq_alloc(struct proc *p, unsigned int flags)
3078 {
3079 struct kqworkq *kqwq, *tmp;
3080
3081 kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);
3082
3083 assert((flags & KEVENT_FLAG_LEGACY32) == 0);
3084 if (flags & KEVENT_FLAG_LEGACY64) {
3085 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
3086 } else {
3087 kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;
3088 }
3089 kqwq->kqwq_p = p;
3090
3091 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3092 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
3093 TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
3094 }
3095 for (int i = 0; i < KQWQ_NBUCKETS; i++) {
3096 /*
3097 * Because of how the bucketized system works, we mix overcommit
3098 * sources with non-overcommit ones: each time we move a knote from
3099 * one bucket to the next due to overrides, we would have to track
3100 * overcommitness, and that's really not worth it in a world where
3101 * workloops track this faithfully.
3102 *
3103 * Incidentally, this behaves like the original manager-based
3104 * kqwq where event delivery always happened (hence is
3105 * "overcommit").
3106 */
3107 kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
3108 kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
3109 if (i != KQWQ_QOS_MANAGER) {
3110 kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
3111 }
3112 kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i;
3113 }
3114
3115 kqueue_init(kqwq, &kqwq->kqwq_waitq_hook, SYNC_POLICY_FIFO);
3116
3117 if (!os_atomic_cmpxchgv(&p->p_fd->fd_wqkqueue, NULL, kqwq, &tmp, release)) {
3118 kqworkq_dealloc(kqwq);
3119 return tmp;
3120 }
3121
3122 return kqwq;
3123 }
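/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the single-winner publication at the end of kqworkq_alloc().  Racing
 * threads each allocate, one publishes its pointer with a compare-exchange
 * against NULL, and the loser frees its copy and adopts the winner's.
 * make_obj()/free_obj() are hypothetical stand-ins for zalloc/zfree.
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct obj;
extern struct obj *make_obj(void);
extern void free_obj(struct obj *);

static struct obj *
get_or_publish(_Atomic(struct obj *) *slot)
{
	struct obj *cur = atomic_load_explicit(slot, memory_order_acquire);
	if (cur != NULL) {
		return cur;			/* fast path: already published */
	}

	struct obj *mine = make_obj();
	struct obj *expected = NULL;
	if (atomic_compare_exchange_strong_explicit(slot, &expected, mine,
	    memory_order_acq_rel, memory_order_acquire)) {
		return mine;			/* we won the race */
	}
	free_obj(mine);				/* lost: adopt the winner's object */
	return expected;
}
#endif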
3124
3125 #pragma mark kqworkloop allocation and deallocation
3126
3127 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3128 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
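/*
 * Worked example (editorial addition, not part of this file): with a
 * 256-bucket hash, hashinit() hands back mask = 255, and KQ_HASH() folds the
 * second byte of the workloop id into the low byte before masking, so a
 * dynamic id of 0x12345 lands in bucket 0x66.
 */
#if 0
#include <assert.h>

static void
kq_hash_example(void)
{
	unsigned long id = 0x12345, mask = 255;

	/* (0x12345 ^ 0x123) & 0xff == 0x66 */
	assert(KQ_HASH(id, mask) == 0x66);
}
#endif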
3129
3130 OS_ALWAYS_INLINE
3131 static inline void
3132 kqhash_lock(struct filedesc *fdp)
3133 {
3134 lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);
3135 }
3136
3137 OS_ALWAYS_INLINE
3138 static inline void
3139 kqhash_unlock(struct filedesc *fdp)
3140 {
3141 lck_mtx_unlock(&fdp->fd_kqhashlock);
3142 }
3143
3144 OS_ALWAYS_INLINE
3145 static inline void
3146 kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
3147 struct kqworkloop *kqwl)
3148 {
3149 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3150 LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3151 }
3152
3153 OS_ALWAYS_INLINE
3154 static inline struct kqworkloop *
3155 kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
3156 {
3157 struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3158 struct kqworkloop *kqwl;
3159
3160 LIST_FOREACH(kqwl, list, kqwl_hashlink) {
3161 if (kqwl->kqwl_dynamicid == id) {
3162 return kqwl;
3163 }
3164 }
3165 return NULL;
3166 }
3167
3168 static struct kqworkloop *
3169 kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
3170 {
3171 struct kqworkloop *kqwl = NULL;
3172
3173 kqhash_lock(fdp);
3174 if (__probable(fdp->fd_kqhash)) {
3175 kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
3176 if (kqwl && !kqworkloop_try_retain(kqwl)) {
3177 kqwl = NULL;
3178 }
3179 }
3180 kqhash_unlock(fdp);
3181 return kqwl;
3182 }
3183
3184 OS_NOINLINE
3185 static void
3186 kqworkloop_hash_init(struct filedesc *fdp)
3187 {
3188 struct kqwllist *alloc_hash;
3189 u_long alloc_mask;
3190
3191 kqhash_unlock(fdp);
3192 alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3193 kqhash_lock(fdp);
3194
3195 /* See if we won the race */
3196 if (__probable(fdp->fd_kqhashmask == 0)) {
3197 fdp->fd_kqhash = alloc_hash;
3198 fdp->fd_kqhashmask = alloc_mask;
3199 } else {
3200 kqhash_unlock(fdp);
3201 hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
3202 kqhash_lock(fdp);
3203 }
3204 }
3205
3206 /*!
3207 * @function kqworkloop_dealloc
3208 *
3209 * @brief
3210 * Deallocates a workloop kqueue.
3211 *
3212 * @discussion
3213 * Knotes hold references on the workloop, so we can't really reach this
3214 * function unless all of these are already gone.
3215 *
3216 * Nothing locked on entry or exit.
3217 *
3218 * @param flags
3219 * Unless KQWL_DEALLOC_SKIP_HASH_REMOVE is set, the workloop is removed
3220 * from its hash table.
3221 *
3222 * @param current_ref
3223 * This function is also called to undo a kqworkloop_alloc in case of
3224 * allocation races; current_ref is the refcount that is expected
3225 * on the workloop object, usually 0, and 1 when a dealloc race is resolved.
3226 */
3227 static void
3228 kqworkloop_dealloc(struct kqworkloop *kqwl, kqworkloop_dealloc_flags_t flags,
3229 uint32_t current_ref)
3230 {
3231 thread_t cur_owner;
3232
3233 if (__improbable(current_ref > 1)) {
3234 kqworkloop_release_panic(kqwl);
3235 }
3236 assert(kqwl->kqwl_retains == current_ref);
3237
3238 /* pair with kqunlock() and other kq locks */
3239 os_atomic_thread_fence(acquire);
3240
3241 cur_owner = kqwl->kqwl_owner;
3242 if (cur_owner) {
3243 if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
3244 thread_drop_kevent_override(cur_owner);
3245 }
3246 thread_deallocate(cur_owner);
3247 kqwl->kqwl_owner = THREAD_NULL;
3248 }
3249
3250 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
3251 struct turnstile *ts;
3252 turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
3253 &ts, TURNSTILE_WORKLOOPS);
3254 turnstile_cleanup();
3255 turnstile_deallocate(ts);
3256 }
3257
3258 if ((flags & KQWL_DEALLOC_SKIP_HASH_REMOVE) == 0) {
3259 struct filedesc *fdp = kqwl->kqwl_p->p_fd;
3260
3261 kqhash_lock(fdp);
3262 LIST_REMOVE(kqwl, kqwl_hashlink);
3263 kqhash_unlock(fdp);
3264 }
3265
3266 assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
3267 assert(kqwl->kqwl_owner == THREAD_NULL);
3268 assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);
3269
3270 lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
3271 kqueue_destroy(kqwl, kqworkloop_zone);
3272 }
3273
3274 /*!
3275 * @function kqworkloop_init
3276 *
3277 * @brief
3278 * Initializes a freshly allocated workloop kqueue.
3279 */
3280 static void
3281 kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
3282 kqueue_id_t id, workq_threadreq_param_t *trp)
3283 {
3284 kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
3285 kqwl->kqwl_retains = 1; /* donate a retain to creator */
3286 kqwl->kqwl_dynamicid = id;
3287 kqwl->kqwl_p = p;
3288 if (trp) {
3289 kqwl->kqwl_params = trp->trp_value;
3290 }
3291
3292 workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
3293 if (trp) {
3294 if (trp->trp_flags & TRP_PRIORITY) {
3295 tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
3296 }
3297 if (trp->trp_flags) {
3298 tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
3299 }
3300 }
3301 kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
3302 kqwl->kqwl_request.tr_flags = tr_flags;
3303
3304 for (int i = 0; i < KQWL_NBUCKETS; i++) {
3305 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
3306 }
3307 TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);
3308
3309 lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);
3310
3311 kqueue_init(kqwl, &kqwl->kqwl_waitq_hook, SYNC_POLICY_FIFO);
3312 }
3313
3314 /*!
3315 * @function kqworkloop_get_or_create
3316 *
3317 * @brief
3318 * Wrapper around kqworkloop_alloc that handles the uniquing of workloops.
3319 *
3320 * @returns
3321 * 0: success
3322 * EINVAL: invalid parameters
3323 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
3324 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
3325 * ENOMEM: allocation failed
3326 */
3327 static int
3328 kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
3329 workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
3330 {
3331 struct filedesc *fdp = p->p_fd;
3332 struct kqworkloop *alloc_kqwl = NULL;
3333 struct kqworkloop *kqwl = NULL;
3334 int error = 0;
3335
3336 assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
3337
3338 if (id == 0 || id == (kqueue_id_t)-1) {
3339 return EINVAL;
3340 }
3341
3342 for (;;) {
3343 kqhash_lock(fdp);
3344 if (__improbable(fdp->fd_kqhash == NULL)) {
3345 kqworkloop_hash_init(fdp);
3346 }
3347
3348 kqwl = kqworkloop_hash_lookup_locked(fdp, id);
3349 if (kqwl) {
3350 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
3351 /*
3352 * If MUST_NOT_EXIST was passed, even if we would have failed
3353 * the try_retain, it could have gone the other way, and
3354 * userspace can't tell. Let'em fix their race.
3355 */
3356 error = EEXIST;
3357 break;
3358 }
3359
3360 if (__probable(kqworkloop_try_retain(kqwl))) {
3361 /*
3362 * This is a valid live workloop !
3363 */
3364 *kqwlp = kqwl;
3365 error = 0;
3366 break;
3367 }
3368 }
3369
3370 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {
3371 error = ENOENT;
3372 break;
3373 }
3374
3375 /*
3376 * We didn't find what we were looking for.
3377 *
3378 * If this is the second time we reach this point (alloc_kqwl != NULL),
3379 * then we're done.
3380 *
3381 * If this is the first time we reach this point (alloc_kqwl == NULL),
3382 * then try to allocate one without blocking.
3383 */
3384 if (__probable(alloc_kqwl == NULL)) {
3385 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
3386 }
3387 if (__probable(alloc_kqwl)) {
3388 kqworkloop_init(alloc_kqwl, p, id, trp);
3389 kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
3390 kqhash_unlock(fdp);
3391 *kqwlp = alloc_kqwl;
3392 return 0;
3393 }
3394
3395 /*
3396 * We have to block to allocate a workloop: drop the lock,
3397 * allocate one, but then retry the lookup as someone
3398 * else could race with us.
3399 */
3400 kqhash_unlock(fdp);
3401
3402 alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);
3403 }
3404
3405 kqhash_unlock(fdp);
3406
3407 if (__improbable(alloc_kqwl)) {
3408 zfree(kqworkloop_zone, alloc_kqwl);
3409 }
3410
3411 return error;
3412 }
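/*
 * Illustrative sketch (editorial addition, not part of this file): the shape
 * of the lookup-or-create loop in kqworkloop_get_or_create() above.  A
 * non-blocking allocation is attempted while the hash lock is held; if that
 * fails, the lock is dropped, a blocking allocation is performed, and the
 * lookup is retried because another thread may have inserted the entry in
 * the meantime.  The table/entry types and helpers are hypothetical.
 */
#if 0
#include <stdint.h>
#include <stddef.h>

struct table;
struct entry;
extern void table_lock(struct table *);
extern void table_unlock(struct table *);
extern struct entry *table_lookup(struct table *, uint64_t id);
extern void table_insert(struct table *, uint64_t id, struct entry *);
extern struct entry *entry_alloc_nowait(void);	/* may return NULL */
extern struct entry *entry_alloc_blocking(void);
extern void entry_free(struct entry *);

static struct entry *
get_or_create(struct table *t, uint64_t id)
{
	struct entry *e, *fresh = NULL;

	for (;;) {
		table_lock(t);
		e = table_lookup(t, id);
		if (e != NULL) {
			table_unlock(t);
			if (fresh != NULL) {
				entry_free(fresh);	/* someone else won the race */
			}
			return e;
		}
		if (fresh == NULL) {
			fresh = entry_alloc_nowait();	/* opportunistic, may fail */
		}
		if (fresh != NULL) {
			table_insert(t, id, fresh);
			table_unlock(t);
			return fresh;
		}
		table_unlock(t);
		fresh = entry_alloc_blocking();		/* then retry the lookup */
	}
}
#endif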
3413
3414 #pragma mark - knotes
3415
3416 static int
3417 filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
3418 {
3419 knote_set_error(kn, ENOTSUP);
3420 return 0;
3421 }
3422
3423 static void
3424 filt_no_detach(__unused struct knote *kn)
3425 {
3426 }
3427
3428 static int __dead2
3429 filt_bad_event(struct knote *kn, long hint)
3430 {
3431 panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
3432 }
3433
3434 static int __dead2
3435 filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
3436 {
3437 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3438 }
3439
3440 static int __dead2
3441 filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
3442 {
3443 panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
3444 }
3445
3446 /*
3447 * knotes_dealloc - detach all knotes for the process and drop them
3448 *
3449 * Called with proc_fdlock held.
3450 * Returns with it locked.
3451 * May drop it temporarily.
3452 * Process is in such a state that it will not try to allocate
3453 * any more knotes during this process (stopped for exit or exec).
3454 */
3455 void
3456 knotes_dealloc(proc_t p)
3457 {
3458 struct filedesc *fdp = p->p_fd;
3459 struct kqueue *kq;
3460 struct knote *kn;
3461 struct klist *kn_hash = NULL;
3462 u_long kn_hashmask;
3463 int i;
3464
3465 /* Close all the fd-indexed knotes up front */
3466 if (fdp->fd_knlistsize > 0) {
3467 for (i = 0; i < fdp->fd_knlistsize; i++) {
3468 while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
3469 kq = knote_get_kq(kn);
3470 kqlock(kq);
3471 proc_fdunlock(p);
3472 knote_drop(kq, kn, NULL);
3473 proc_fdlock(p);
3474 }
3475 }
3476 /* free the table */
3477 FREE(fdp->fd_knlist, M_KQUEUE);
3478 fdp->fd_knlist = NULL;
3479 }
3480 fdp->fd_knlistsize = 0;
3481
3482 knhash_lock(fdp);
3483 proc_fdunlock(p);
3484
3485 /* Clean out all the hashed knotes as well */
3486 if (fdp->fd_knhashmask != 0) {
3487 for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
3488 while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
3489 kq = knote_get_kq(kn);
3490 kqlock(kq);
3491 knhash_unlock(fdp);
3492 knote_drop(kq, kn, NULL);
3493 knhash_lock(fdp);
3494 }
3495 }
3496 kn_hash = fdp->fd_knhash;
3497 kn_hashmask = fdp->fd_knhashmask;
3498 fdp->fd_knhashmask = 0;
3499 fdp->fd_knhash = NULL;
3500 }
3501
3502 knhash_unlock(fdp);
3503
3504 if (kn_hash) {
3505 hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
3506 }
3507
3508 proc_fdlock(p);
3509 }
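/*
 * Illustrative sketch (editorial addition, not part of this file): the
 * restart-from-head shape of the teardown loops in kqueue_dealloc() and
 * knotes_dealloc() above.  Because the list lock must be dropped to drop
 * each element, the scan starts over at the head after every drop rather
 * than trusting a saved iterator.  (The real code additionally takes the
 * kqueue lock before dropping the fd lock so the knote stays pinned; that
 * detail is omitted here.)  The list/node types and helpers are hypothetical.
 */
#if 0
#include <stddef.h>

struct list;
struct node;
extern void list_lock(struct list *);
extern void list_unlock(struct list *);
extern struct node *list_first(struct list *);
extern void node_drop(struct node *);	/* unlinks and frees the node */

static void
drain_list(struct list *l)
{
	struct node *n;

	list_lock(l);
	while ((n = list_first(l)) != NULL) {
		list_unlock(l);		/* dropping may block or take other locks */
		node_drop(n);
		list_lock(l);		/* any saved iterator is stale: restart at the head */
	}
	list_unlock(l);
}
#endif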
3510
3511 /*
3512 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
3513 * scheduling parameters
3514 *
3515 * Called with proc_fdlock held.
3516 * Returns with it locked.
3517 * Process is in such a state that it will not try to allocate
3518 * any more knotes during this process (stopped for exit or exec).
3519 */
3520 void
3521 kqworkloops_dealloc(proc_t p)
3522 {
3523 struct filedesc *fdp = p->p_fd;
3524 struct kqworkloop *kqwl, *kqwln;
3525 struct kqwllist tofree;
3526
3527 if (!(fdp->fd_flags & FD_WORKLOOP)) {
3528 return;
3529 }
3530
3531 kqhash_lock(fdp);
3532
3533 if (fdp->fd_kqhashmask == 0) {
3534 kqhash_unlock(fdp);
3535 return;
3536 }
3537
3538 LIST_INIT(&tofree);
3539
3540 for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
3541 LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
3542 /*
3543 * kqworkloops that have scheduling parameters have an
3544 * implicit retain from kqueue_workloop_ctl that needs
3545 * to be balanced on process exit.
3546 */
3547 assert(kqwl->kqwl_params);
3548 LIST_REMOVE(kqwl, kqwl_hashlink);
3549 LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
3550 }
3551 }
3552
3553 kqhash_unlock(fdp);
3554
3555 LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
3556 kqworkloop_dealloc(kqwl, KQWL_DEALLOC_SKIP_HASH_REMOVE, 1);
3557 }
3558 }
3559
3560 static int
3561 kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
3562 struct kevent_qos_s *kev)
3563 {
3564 /* We don't care about the priority of a disabled or deleted knote */
3565 if (kev->flags & (EV_DISABLE | EV_DELETE)) {
3566 return 0;
3567 }
3568
3569 if (kq->kq_state & KQ_WORKLOOP) {
3570 /*
3571 * Workloops need valid priorities with a QOS (excluding manager) for
3572 * any enabled knote.
3573 *
3574 * When it is pre-existing, just make sure it has a valid QoS as
3575 * kevent_register() will not use the incoming priority (filters that do
3576 * use it are responsible for validating it again, see filt_wltouch).
3577 *
3578 * If the knote is being made, validate the incoming priority.
3579 */
3580 if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
3581 return ERANGE;
3582 }
3583 }
3584
3585 return 0;
3586 }
3587
3588 /*
3589 * Prepare a filter for waiting after register.
3590 *
3591 * The f_post_register_wait hook will be called later by kevent_register()
3592 * and should call kevent_register_wait_block()
3593 */
3594 static int
3595 kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
3596 {
3597 thread_t thread = current_thread();
3598
3599 assert(knote_fops(kn)->f_extended_codes);
3600
3601 if (kn->kn_thread == NULL) {
3602 thread_reference(thread);
3603 kn->kn_thread = thread;
3604 } else if (kn->kn_thread != thread) {
3605 /*
3606 * kn_thread may be set from a previous aborted wait.
3607 * However, it has to be from the same thread.
3608 */
3609 kev->flags |= EV_ERROR;
3610 kev->data = EXDEV;
3611 return 0;
3612 }
3613
3614 return FILTER_REGISTER_WAIT | rc;
3615 }
3616
3617 /*
3618 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
3619 * aborted instead of properly woken up with thread_wakeup_thread().
3620 */
3621 static void
3622 kevent_register_wait_cleanup(struct knote *kn)
3623 {
3624 thread_t thread = kn->kn_thread;
3625 kn->kn_thread = NULL;
3626 thread_deallocate(thread);
3627 }
3628
3629 /*
3630 * Must be called at the end of a f_post_register_wait call from a filter.
3631 */
3632 static void
3633 kevent_register_wait_block(struct turnstile *ts, thread_t thread,
3634 thread_continue_t cont, struct _kevent_register *cont_args)
3635 {
3636 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
3637 kqunlock(cont_args->kqwl);
3638 cont_args->handoff_thread = thread;
3639 thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);
3640 }
3641
3642 /*
3643 * Called by Filters using a f_post_register_wait to return from their wait.
3644 */
3645 static void
3646 kevent_register_wait_return(struct _kevent_register *cont_args)
3647 {
3648 struct kqworkloop *kqwl = cont_args->kqwl;
3649 struct kevent_qos_s *kev = &cont_args->kev;
3650 int error = 0;
3651
3652 if (cont_args->handoff_thread) {
3653 thread_deallocate(cont_args->handoff_thread);
3654 }
3655
3656 if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
3657 if ((kev->flags & EV_ERROR) == 0) {
3658 kev->flags |= EV_ERROR;
3659 kev->data = 0;
3660 }
3661 error = kevent_modern_copyout(kev, &cont_args->ueventlist);
3662 if (error == 0) {
3663 cont_args->eventout++;
3664 }
3665 }
3666
3667 kqworkloop_release(kqwl);
3668 if (error == 0) {
3669 *(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
3670 }
3671 unix_syscall_return(error);
3672 }
3673
3674 /*
3675 * kevent_register - add a new event to a kqueue
3676 *
3677 * Creates a mapping between the event source and
3678 * the kqueue via a knote data structure.
3679 *
3680 * Because many/most of the event sources are file
3681 * descriptor related, the knote is linked off
3682 * the filedescriptor table for quick access.
3683 *
3684 * called with nothing locked
3685 * caller holds a reference on the kqueue
3686 */
3687
3688 int
3689 kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
3690 struct knote **kn_out)
3691 {
3692 struct proc *p = kq->kq_p;
3693 const struct filterops *fops;
3694 struct knote *kn = NULL;
3695 int result = 0, error = 0;
3696 unsigned short kev_flags = kev->flags;
3697 KNOTE_LOCK_CTX(knlc);
3698
3699 if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
3700 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
3701 } else {
3702 error = EINVAL;
3703 goto out;
3704 }
3705
3706 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
3707 if (__improbable((kev->flags & EV_VANISHED) &&
3708 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {
3709 error = EINVAL;
3710 goto out;
3711 }
3712
3713 /* Simplify the flags - delete and disable overrule */
3714 if (kev->flags & EV_DELETE) {
3715 kev->flags &= ~EV_ADD;
3716 }
3717 if (kev->flags & EV_DISABLE) {
3718 kev->flags &= ~EV_ENABLE;
3719 }
3720
3721 if (kq->kq_state & KQ_WORKLOOP) {
3722 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
3723 ((struct kqworkloop *)kq)->kqwl_dynamicid,
3724 kev->udata, kev->flags, kev->filter);
3725 } else if (kq->kq_state & KQ_WORKQ) {
3726 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
3727 0, kev->udata, kev->flags, kev->filter);
3728 } else {
3729 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
3730 VM_KERNEL_UNSLIDE_OR_PERM(kq),
3731 kev->udata, kev->flags, kev->filter);
3732 }
3733
3734 restart:
3735 /* find the matching knote from the fd tables/hashes */
3736 kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
3737 error = kevent_register_validate_priority(kq, kn, kev);
3738 result = 0;
3739 if (error) {
3740 goto out;
3741 }
3742
3743 if (kn == NULL && (kev->flags & EV_ADD) == 0) {
3744 /*
3745 * No knote found, EV_ADD wasn't specified
3746 */
3747
3748 if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
3749 (kq->kq_state & KQ_WORKLOOP)) {
3750 /*
3751 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
3752 * that doesn't care about ENOENT, so just pretend the deletion
3753 * happened.
3754 */
3755 } else {
3756 error = ENOENT;
3757 }
3758 goto out;
3759 } else if (kn == NULL) {
3760 /*
3761 * No knote found, need to attach a new one (attach)
3762 */
3763
3764 struct fileproc *knote_fp = NULL;
3765
3766 /* grab a file reference for the new knote */
3767 if (fops->f_isfd) {
3768 if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {
3769 goto out;
3770 }
3771 }
3772
3773 kn = knote_alloc();
3774 if (kn == NULL) {
3775 error = ENOMEM;
3776 if (knote_fp != NULL) {
3777 fp_drop(p, (int)kev->ident, knote_fp, 0);
3778 }
3779 goto out;
3780 }
3781
3782 kn->kn_fp = knote_fp;
3783 kn->kn_is_fd = fops->f_isfd;
3784 kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);
3785 kn->kn_status = 0;
3786
3787 /* was vanish support requested */
3788 if (kev->flags & EV_VANISHED) {
3789 kev->flags &= ~EV_VANISHED;
3790 kn->kn_status |= KN_REQVANISH;
3791 }
3792
3793 /* snapshot matching/dispatching protocol flags into knote */
3794 if (kev->flags & EV_DISABLE) {
3795 kn->kn_status |= KN_DISABLED;
3796 }
3797
3798 /*
3799 * Copy the kevent state into the knote.
3800 * The protocol is that fflags and data
3801 * are saved off, and cleared before
3802 * calling the attach routine.
3803 *
3804 * - kn->kn_sfflags aliases with kev->xflags
3805 * - kn->kn_sdata aliases with kev->data
3806 * - kn->kn_filter is the top 8 bits of kev->filter
3807 */
3808 kn->kn_kevent = *(struct kevent_internal_s *)kev;
3809 kn->kn_sfflags = kev->fflags;
3810 kn->kn_filtid = (uint8_t)~kev->filter;
3811 kn->kn_fflags = 0;
3812 knote_reset_priority(kq, kn, kev->qos);
3813
3814 /* Add the knote for lookup thru the fd table */
3815 error = kq_add_knote(kq, kn, &knlc, p);
3816 if (error) {
3817 knote_free(kn);
3818 if (knote_fp != NULL) {
3819 fp_drop(p, (int)kev->ident, knote_fp, 0);
3820 }
3821
3822 if (error == ERESTART) {
3823 goto restart;
3824 }
3825 goto out;
3826 }
3827
3828 /* fp reference count now applies to knote */
3829
3830 /*
3831 * we can't use filter_call() because f_attach can change the filter ops
3832 * for a filter that supports f_extended_codes, so we need to reload
3833 * knote_fops() and not use `fops`.
3834 */
3835 result = fops->f_attach(kn, kev);
3836 if (result && !knote_fops(kn)->f_extended_codes) {
3837 result = FILTER_ACTIVE;
3838 }
3839
3840 kqlock(kq);
3841
3842 if (result & FILTER_THREADREQ_NODEFEER) {
3843 enable_preemption();
3844 }
3845
3846 if (kn->kn_flags & EV_ERROR) {
3847 /*
3848 * Failed to attach correctly, so drop.
3849 */
3850 kn->kn_filtid = EVFILTID_DETACHED;
3851 error = (int)kn->kn_sdata;
3852 knote_drop(kq, kn, &knlc);
3853 result = 0;
3854 goto out;
3855 }
3856
3857 /*
3858 * end "attaching" phase - now just attached
3859 *
3860 * Mark the thread request overcommit, if appropriate
3861 *
3862 * If the attach routine indicated that an
3863 * event is already fired, activate the knote.
3864 */
3865 if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
3866 (kq->kq_state & KQ_WORKLOOP)) {
3867 kqworkloop_set_overcommit((struct kqworkloop *)kq);
3868 }
3869 } else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
3870 /*
3871 * The knote was dropped while we were waiting for the lock,
3872 * we need to re-evaluate entirely
3873 */
3874
3875 goto restart;
3876 } else if (kev->flags & EV_DELETE) {
3877 /*
3878 * Deletion of a knote (drop)
3879 *
3880 * If the filter wants to filter drop events, let it do so.
3881 *
3882 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
3883 * we must wait for the knote to be re-enabled (unless it is being
3884 * re-enabled atomically here).
3885 */
3886
3887 if (knote_fops(kn)->f_allow_drop) {
3888 bool drop;
3889
3890 kqunlock(kq);
3891 drop = knote_fops(kn)->f_allow_drop(kn, kev);
3892 kqlock(kq);
3893
3894 if (!drop) {
3895 goto out_unlock;
3896 }
3897 }
3898
3899 if ((kev->flags & EV_ENABLE) == 0 &&
3900 (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
3901 (kn->kn_status & KN_DISABLED) != 0) {
3902 kn->kn_status |= KN_DEFERDELETE;
3903 error = EINPROGRESS;
3904 goto out_unlock;
3905 }
3906
3907 knote_drop(kq, kn, &knlc);
3908 goto out;
3909 } else {
3910 /*
3911 * Regular update of a knote (touch)
3912 *
3913 * Call touch routine to notify filter of changes in filter values
3914 * (and to re-determine if any events are fired).
3915 *
3916 * If the knote is in defer-delete, avoid calling the filter touch
3917 * routine (it has delivered its last event already).
3918 *
3919 * If the touch routine had no failure,
3920 * apply the requested side effects to the knote.
3921 */
3922
3923 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
3924 if (kev->flags & EV_ENABLE) {
3925 result = FILTER_ACTIVE;
3926 }
3927 } else {
3928 kqunlock(kq);
3929 result = filter_call(knote_fops(kn), f_touch(kn, kev));
3930 kqlock(kq);
3931 if (result & FILTER_THREADREQ_NODEFEER) {
3932 enable_preemption();
3933 }
3934 }
3935
3936 if (kev->flags & EV_ERROR) {
3937 result = 0;
3938 goto out_unlock;
3939 }
3940
3941 if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
3942 kn->kn_udata != kev->udata) {
3943 // this allows klist_copy_udata() not to take locks
3944 os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
3945 }
3946 if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
3947 kn->kn_status |= KN_DISABLED;
3948 knote_dequeue(kq, kn);
3949 }
3950 }
3951
3952 /* accept new kevent state */
3953 knote_apply_touch(kq, kn, kev, result);
3954
3955 out_unlock:
3956 /*
3957 * When the filter asked for a post-register wait,
3958 * we leave the kqueue locked for kevent_register()
3959 * to call the filter's f_post_register_wait hook.
3960 */
3961 if (result & FILTER_REGISTER_WAIT) {
3962 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
3963 *kn_out = kn;
3964 } else {
3965 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
3966 }
3967
3968 out:
3969 /* output local errors through the kevent */
3970 if (error) {
3971 kev->flags |= EV_ERROR;
3972 kev->data = error;
3973 }
3974 return result;
3975 }
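/*
 * Illustrative userspace sketch (not part of this file): how the
 * add/touch/delete protocol handled above is typically exercised.
 * `kq`, `sockfd` and `ctx` are hypothetical.
 *
 *	struct kevent kev;
 *
 *	// attach: EV_ADD creates the knote; EV_DISPATCH auto-disables it
 *	// after each delivery until re-enabled with EV_ENABLE.
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// touch: the same ident/filter with EV_ADD updates fflags/data/udata.
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// delete: for a disabled EV_DISPATCH2 knote this may be deferred,
 *	// surfacing as EV_ERROR with data == EINPROGRESS (see above).
 *	EV_SET(&kev, sockfd, EVFILT_READ, EV_DELETE, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */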
3976
3977 /*
3978 * knote_process - process a triggered event
3979 *
3980 * Validate that it is really still a triggered event
3981 * by calling the filter routines (if necessary). Hold
3982 * a use reference on the knote to avoid it being detached.
3983 *
3984 * If it is still considered triggered, we will have taken
3985 * a copy of the state under the filter lock. We use that
3986 * snapshot to dispatch the knote for future processing (or
3987 * not, if this was a lost event).
3988 *
3989 * Our caller assures us that nobody else can be processing
3990 * events from this knote during the whole operation. But
3991 * others can be touching or posting events to the knote
3992 * interspersed with our processing it.
3993 *
3994 * caller holds a reference on the kqueue.
3995 * kqueue locked on entry and exit - but may be dropped
3996 */
3997 static int
3998 knote_process(struct knote *kn, kevent_ctx_t kectx,
3999 kevent_callback_t callback)
4000 {
4001 struct kevent_qos_s kev;
4002 struct kqueue *kq = knote_get_kq(kn);
4003 KNOTE_LOCK_CTX(knlc);
4004 int result = FILTER_ACTIVE;
4005 int error = 0;
4006 bool drop = false;
4007
4008 /*
4009 * Must be active or stayactive
4010 * Must be queued and not disabled/suppressed or dropping
4011 */
4012 assert(kn->kn_status & KN_QUEUED);
4013 assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
4014 assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));
4015
4016 if (kq->kq_state & KQ_WORKLOOP) {
4017 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4018 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4019 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4020 kn->kn_filtid);
4021 } else if (kq->kq_state & KQ_WORKQ) {
4022 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4023 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4024 kn->kn_filtid);
4025 } else {
4026 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4027 VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4028 kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4029 }
4030
4031 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
4032 /*
4033 * When the knote is dropping or has dropped,
4034 * then there's nothing we want to process.
4035 */
4036 return EJUSTRETURN;
4037 }
4038
4039 /*
4040 * While waiting for the knote lock, we may have dropped the kq lock,
4041 * and a touch may have disabled and dequeued the knote.
4042 */
4043 if (!(kn->kn_status & KN_QUEUED)) {
4044 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4045 return EJUSTRETURN;
4046 }
4047
4048 /*
4049 * For deferred-drop or vanished events, we just create a fake
4050 * event to acknowledge end-of-life. Otherwise, we call the
4051 * filter's process routine to snapshot the kevent state under
4052 * the filter's locking protocol.
4053 *
4054 * suppress knotes to avoid returning the same event multiple times in
4055 * a single call.
4056 */
4057 knote_suppress(kq, kn);
4058
4059 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4060 uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
4061 if (kn->kn_status & KN_DEFERDELETE) {
4062 kev_flags |= EV_DELETE;
4063 } else {
4064 kev_flags |= EV_VANISHED;
4065 }
4066
4067 /* create fake event */
4068 kev = (struct kevent_qos_s){
4069 .filter = kn->kn_filter,
4070 .ident = kn->kn_id,
4071 .flags = kev_flags,
4072 .udata = kn->kn_udata,
4073 };
4074 } else {
4075 kqunlock(kq);
4076 kev = (struct kevent_qos_s) { };
4077 result = filter_call(knote_fops(kn), f_process(kn, &kev));
4078 kqlock(kq);
4079 }
4080
4081 /*
4082 * Determine how to dispatch the knote for future event handling.
4083 * not-fired: just return (do not callout, leave deactivated).
4084 * One-shot: If dispatch2, enter deferred-delete mode (unless this is
4085 * the deferred delete event delivery itself). Otherwise,
4086 * drop it.
4087 * Dispatch: don't clear state, just mark it disabled.
4088 * Cleared: just leave it deactivated.
4089 * Others: re-activate as there may be more events to handle.
4090 * This will not wake up more handlers right now, but
4091 * at the completion of handling events it may trigger
4092 * more handler threads (TODO: optimize based on more than
4093 * just this one event being detected by the filter).
4094 */
4095 if ((result & FILTER_ACTIVE) == 0) {
4096 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
4097 /*
4098 * Stay active knotes should not be unsuppressed or we'd create an
4099 * infinite loop.
4100 *
4101 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
4102 * within f_process() but that doesn't necessarily make them
4103 * ready to process, so we should leave them be.
4104 *
4105 * For other knotes, since we will not return an event,
4106 * there's no point keeping the knote suppressed.
4107 */
4108 knote_unsuppress(kq, kn);
4109 }
4110 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4111 return EJUSTRETURN;
4112 }
4113
4114 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
4115 knote_adjust_qos(kq, kn, result);
4116 }
4117 kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
4118
4119 if (kev.flags & EV_ONESHOT) {
4120 if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
4121 (kn->kn_status & KN_DEFERDELETE) == 0) {
4122 /* defer dropping non-delete oneshot dispatch2 events */
4123 kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
4124 } else {
4125 drop = true;
4126 }
4127 } else if (kn->kn_flags & EV_DISPATCH) {
4128 /* disable all dispatch knotes */
4129 kn->kn_status |= KN_DISABLED;
4130 } else if ((kn->kn_flags & EV_CLEAR) == 0) {
4131 /* re-activate in case there are more events */
4132 knote_activate(kq, kn, FILTER_ACTIVE);
4133 }
4134
4135 /*
4136 * callback to handle each event as we find it.
4137 * If we have to detach and drop the knote, do
4138 * it while we have the kq unlocked.
4139 */
4140 if (drop) {
4141 knote_drop(kq, kn, &knlc);
4142 } else {
4143 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
4144 }
4145
4146 if (kev.flags & EV_VANISHED) {
4147 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
4148 kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4149 kn->kn_filtid);
4150 }
4151
4152 error = (callback)(&kev, kectx);
4153 kqlock(kq);
4154 return error;
4155 }
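/*
 * Illustrative userspace sketch (not part of this file): the dispositions
 * chosen above as seen by a consumer. `kq`, `fd` and `ctx` are hypothetical.
 *
 *	// Without EV_CLEAR the knote is re-activated above, so a level-
 *	// triggered event keeps being returned until the condition clears.
 *	// With EV_DISPATCH the knote is disabled after delivery and must
 *	// be re-enabled explicitly:
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE | EV_DISPATCH, 0, 0, ctx);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// With EV_ONESHOT the knote is dropped (or defer-deleted for
 *	// EV_DISPATCH2) right after the single delivery.
 */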
4156
4157 /*
4158 * Returns -1 if the kqueue was unbound and processing should not happen
4159 */
4160 #define KQWQAE_BEGIN_PROCESSING 1
4161 #define KQWQAE_END_PROCESSING 2
4162 #define KQWQAE_UNBIND 3
4163 static int
4164 kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
4165 int kevent_flags, int kqwqae_op)
4166 {
4167 thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
4168 thread_t thread = kqr_thread_fast(kqr);
4169 struct knote *kn;
4170 int rc = 0;
4171 bool unbind;
4172 struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index];
4173
4174 kqlock_held(&kqwq->kqwq_kqueue);
4175
4176 if (!TAILQ_EMPTY(suppressq)) {
4177 /*
4178 * Return suppressed knotes to their original state.
4179 * For workq kqueues, suppressed ones that are still
4180 * truly active (not just forced into the queue) will
4181 * set flags we check below to see if anything got
4182 * woken up.
4183 */
4184 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4185 assert(kn->kn_status & KN_SUPPRESSED);
4186 knote_unsuppress(kqwq, kn);
4187 }
4188 }
4189
4190 #if DEBUG || DEVELOPMENT
4191 thread_t self = current_thread();
4192 struct uthread *ut = get_bsdthread_info(self);
4193
4194 assert(thread == self);
4195 assert(ut->uu_kqr_bound == kqr);
4196 #endif // DEBUG || DEVELOPMENT
4197
4198 if (kqwqae_op == KQWQAE_UNBIND) {
4199 unbind = true;
4200 } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
4201 unbind = false;
4202 } else {
4203 unbind = !kqr->tr_kq_wakeup;
4204 }
4205 if (unbind) {
4206 old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
4207 rc = -1;
4208 /*
4209 * request a new thread if we didn't process the whole queue or real events
4210 * have happened (not just putting stay-active events back).
4211 */
4212 if (kqr->tr_kq_wakeup) {
4213 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
4214 kqr->tr_kq_qos_index, 0);
4215 }
4216 }
4217
4218 if (rc == 0) {
4219 /*
4220 * Reset wakeup bit to notice events firing while we are processing,
4221 * as we cannot rely on the bucket queue emptiness because of stay
4222 * active knotes.
4223 */
4224 kqr->tr_kq_wakeup = false;
4225 }
4226
4227 if (old_override) {
4228 thread_drop_kevent_override(thread);
4229 }
4230
4231 return rc;
4232 }
4233
4234 /*
4235 * Return 0 to indicate that processing should proceed,
4236 * -1 if there is nothing to process.
4237 *
4238 * Called with kqueue locked and returns the same way,
4239 * but may drop lock temporarily.
4240 */
4241 static int
4242 kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4243 int kevent_flags)
4244 {
4245 int rc = 0;
4246
4247 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
4248 0, kqr->tr_kq_qos_index);
4249
4250 rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4251 KQWQAE_BEGIN_PROCESSING);
4252
4253 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
4254 thread_tid(kqr_thread(kqr)), kqr->tr_kq_wakeup);
4255
4256 return rc;
4257 }
4258
4259 static thread_qos_t
4260 kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
4261 {
4262 kq_index_t qos = THREAD_QOS_UNSPECIFIED;
4263 struct knote *kn, *tmp;
4264
4265 kqlock_held(kqwl);
4266
4267 TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
4268 /*
4269 * If a knote that can adjust QoS is disabled because of the automatic
4270 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
4271 * further overrides keep pushing.
4272 */
4273 if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
4274 (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
4275 (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
4276 qos = MAX(qos, kn->kn_qos_override);
4277 continue;
4278 }
4279 knote_unsuppress(kqwl, kn);
4280 }
4281
4282 return qos;
4283 }
4284
4285 static int
4286 kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
4287 {
4288 workq_threadreq_t kqr = &kqwl->kqwl_request;
4289 struct kqueue *kq = &kqwl->kqwl_kqueue;
4290 thread_qos_t qos_override;
4291 thread_t thread = kqr_thread_fast(kqr);
4292 int rc = 0, op = KQWL_UTQ_NONE;
4293
4294 kqlock_held(kq);
4295
4296 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
4297 kqwl->kqwl_dynamicid, 0, 0);
4298
4299 /* nobody else should still be processing */
4300 assert((kq->kq_state & KQ_PROCESSING) == 0);
4301
4302 kq->kq_state |= KQ_PROCESSING;
4303
4304 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
4305 op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
4306 }
4307
4308 if (kevent_flags & KEVENT_FLAG_PARKING) {
4309 /*
4310 * When "parking" we want to process events and, if no events are
4311 * found, unbind.
4312 *
4313 * However, non-overcommit threads sometimes park even when they have
4314 * more work so that the pool can narrow. For these, we need to unbind
4315 * early, so that calling kqworkloop_update_threads_qos() can ask the
4316 * workqueue subsystem whether the thread should park despite having
4317 * pending events.
4318 */
4319 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
4320 op = KQWL_UTQ_PARKING;
4321 } else {
4322 op = KQWL_UTQ_UNBINDING;
4323 }
4324 }
4325 if (op == KQWL_UTQ_NONE) {
4326 goto done;
4327 }
4328
4329 qos_override = kqworkloop_acknowledge_events(kqwl);
4330
4331 if (op == KQWL_UTQ_UNBINDING) {
4332 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_IMMEDIATELY);
4333 kqworkloop_release_live(kqwl);
4334 }
4335 kqworkloop_update_threads_qos(kqwl, op, qos_override);
4336 if (op == KQWL_UTQ_PARKING) {
4337 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
4338 /*
4339 * We cannot trust tr_kq_wakeup when looking at stay active knotes.
4340 * We need to process once, and kqworkloop_end_processing will
4341 * handle the unbind.
4342 */
4343 } else if (!kqr->tr_kq_wakeup || kqwl->kqwl_owner) {
4344 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
4345 kqworkloop_release_live(kqwl);
4346 rc = -1;
4347 }
4348 } else if (op == KQWL_UTQ_UNBINDING) {
4349 if (kqr_thread(kqr) == thread) {
4350 /*
4351 * The thread request fired again, passed the admission check and
4352 * got bound to the current thread again.
4353 */
4354 } else {
4355 rc = -1;
4356 }
4357 }
4358
4359 if (rc == 0) {
4360 /*
4361 * Reset wakeup bit to notice stay active events firing while we are
4362 * processing, as we cannot rely on the stayactive bucket emptiness.
4363 */
4364 kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
4365 } else {
4366 kq->kq_state &= ~KQ_PROCESSING;
4367 }
4368
4369 if (rc == -1) {
4370 kqworkloop_unbind_delayed_override_drop(thread);
4371 }
4372
4373 done:
4374 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
4375 kqwl->kqwl_dynamicid, 0, 0);
4376
4377 return rc;
4378 }
4379
4380 /*
4381 * Return 0 to indicate that processing should proceed,
4382 * -1 if there is nothing to process.
4383 * EBADF if the kqueue is draining
4384 *
4385 * Called with kqueue locked and returns the same way,
4386 * but may drop lock temporarily.
4387 * May block.
4388 */
4389 static int
4390 kqfile_begin_processing(struct kqfile *kq)
4391 {
4392 struct kqtailq *suppressq;
4393
4394 kqlock_held(kq);
4395
4396 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4397 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
4398 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4399
4400 /* wait to become the exclusive processing thread */
4401 for (;;) {
4402 if (kq->kqf_state & KQ_DRAIN) {
4403 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4404 VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
4405 return EBADF;
4406 }
4407
4408 if ((kq->kqf_state & KQ_PROCESSING) == 0) {
4409 break;
4410 }
4411
4412 /* if someone else is processing the queue, wait */
4413 kq->kqf_state |= KQ_PROCWAIT;
4414 suppressq = &kq->kqf_suppressed;
4415 waitq_assert_wait64((struct waitq *)&kq->kqf_wqs,
4416 CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
4417 TIMEOUT_WAIT_FOREVER);
4418
4419 kqunlock(kq);
4420 thread_block(THREAD_CONTINUE_NULL);
4421 kqlock(kq);
4422 }
4423
4424 /* Nobody else processing */
4425
4426 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
4427 waitq_set_clear_preposts(&kq->kqf_wqs);
4428 kq->kqf_state &= ~KQ_WAKEUP;
4429
4430 /* anything left to process? */
4431 if (TAILQ_EMPTY(&kq->kqf_queue)) {
4432 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4433 VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
4434 return -1;
4435 }
4436
4437 /* convert to processing mode */
4438 kq->kqf_state |= KQ_PROCESSING;
4439
4440 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
4441 VM_KERNEL_UNSLIDE_OR_PERM(kq));
4442
4443 return 0;
4444 }
4445
4446 /*
4447 * Try to end processing; only called when a workq thread is attempting to
4448 * park (KEVENT_FLAG_PARKING is set).
4449 *
4450 * When returning -1, the kqworkq is set up again so that it is ready to be
4451 * processed.
4452 */
4453 static int
4454 kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
4455 int kevent_flags)
4456 {
4457 if (!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index])) {
4458 /* remember we didn't process everything */
4459 kqr->tr_kq_wakeup = true;
4460 }
4461
4462 if (kevent_flags & KEVENT_FLAG_PARKING) {
4463 /*
4464 * If acknowledging events "succeeds" (returns 0), it means there are still events,
4465 * which is a failure condition for end_processing.
4466 */
4467 int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
4468 KQWQAE_END_PROCESSING);
4469 if (rc == 0) {
4470 return -1;
4471 }
4472 }
4473
4474 return 0;
4475 }
4476
4477 /*
4478 * Try to end processing; only called when a workq thread is attempting to
4479 * park (KEVENT_FLAG_PARKING is set).
4480 *
4481 * When returning -1, the kqworkloop is set up again so that it is ready to be
4482 * processed (as if kqworkloop_begin_processing had just been called).
4483 *
4484 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
4485 * the kqworkloop is unbound from its servicer as a side effect.
4486 */
4487 static int
4488 kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
4489 {
4490 struct kqueue *kq = &kqwl->kqwl_kqueue;
4491 workq_threadreq_t kqr = &kqwl->kqwl_request;
4492 thread_qos_t qos_override;
4493 thread_t thread = kqr_thread_fast(kqr);
4494 int rc = 0;
4495
4496 kqlock_held(kq);
4497
4498 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
4499 kqwl->kqwl_dynamicid, 0, 0);
4500
4501 if (flags & KQ_PROCESSING) {
4502 assert(kq->kq_state & KQ_PROCESSING);
4503
4504 /*
4505 * If we still have queued stayactive knotes, remember we didn't finish
4506 * processing all of them. This should be extremely rare and would
4507 * require having a lot of them registered and fired.
4508 */
4509 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
4510 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
4511 KQWL_BUCKET_STAYACTIVE);
4512 }
4513
4514 /*
4515 * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
4516 * still under the lock.
4517 *
4518 * So we do everything kqworkloop_unbind() would do, but because we're
4519 * inside kqueue_process(), if the workloop actually received events
4520 * while our locks were dropped, we have the opportunity to fail the end
4521 * processing and loop again.
4522 *
4523 * This avoids going through the process-wide workqueue lock, and hence
4524 * scales better.
4525 */
4526 if (kevent_flags & KEVENT_FLAG_PARKING) {
4527 qos_override = kqworkloop_acknowledge_events(kqwl);
4528 }
4529 }
4530
4531 if (kevent_flags & KEVENT_FLAG_PARKING) {
4532 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
4533 if (kqr->tr_kq_wakeup && !kqwl->kqwl_owner) {
4534 /*
4535 * Reset wakeup bit to notice stay active events firing while we are
4536 * processing, as we cannot rely on the stayactive bucket emptiness.
4537 */
4538 kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
4539 rc = -1;
4540 } else {
4541 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
4542 kqworkloop_release_live(kqwl);
4543 kq->kq_state &= ~flags;
4544 }
4545 } else {
4546 kq->kq_state &= ~flags;
4547 kq->kq_state |= KQ_R2K_ARMED;
4548 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
4549 }
4550
4551 if ((kevent_flags & KEVENT_FLAG_PARKING) && rc == 0) {
4552 kqworkloop_unbind_delayed_override_drop(thread);
4553 }
4554
4555 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
4556 kqwl->kqwl_dynamicid, 0, 0);
4557
4558 return rc;
4559 }
4560
4561 /*
4562 * Called with kqueue lock held.
4563 *
4564 * 0: no more events
4565 * -1: has more events
4566 * EBADF: kqueue is in draining mode
4567 */
4568 static int
4569 kqfile_end_processing(struct kqfile *kq)
4570 {
4571 struct kqtailq *suppressq = &kq->kqf_suppressed;
4572 struct knote *kn;
4573 int procwait;
4574
4575 kqlock_held(kq);
4576
4577 assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
4578
4579 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
4580 VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
4581
4582 /*
4583 * Return suppressed knotes to their original state.
4584 */
4585 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
4586 assert(kn->kn_status & KN_SUPPRESSED);
4587 knote_unsuppress(kq, kn);
4588 }
4589
4590 procwait = (kq->kqf_state & KQ_PROCWAIT);
4591 kq->kqf_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
4592
4593 if (procwait) {
4594 /* first wake up any thread already waiting to process */
4595 waitq_wakeup64_all((struct waitq *)&kq->kqf_wqs,
4596 CAST_EVENT64_T(suppressq), THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
4597 }
4598
4599 if (kq->kqf_state & KQ_DRAIN) {
4600 return EBADF;
4601 }
4602 return (kq->kqf_state & KQ_WAKEUP) ? -1 : 0;
4603 }
4604
4605 static int
4606 kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
4607 struct kqueue_workloop_params *params, int *retval)
4608 {
4609 int error = 0;
4610 struct kqworkloop *kqwl;
4611 struct filedesc *fdp = p->p_fd;
4612 workq_threadreq_param_t trp = { };
4613
4614 switch (cmd) {
4615 case KQ_WORKLOOP_CREATE:
4616 if (!params->kqwlp_flags) {
4617 error = EINVAL;
4618 break;
4619 }
4620
4621 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
4622 (params->kqwlp_sched_pri < 1 ||
4623 params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
4624 error = EINVAL;
4625 break;
4626 }
4627
4628 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
4629 invalid_policy(params->kqwlp_sched_pol)) {
4630 error = EINVAL;
4631 break;
4632 }
4633
4634 if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
4635 (params->kqwlp_cpu_percent <= 0 ||
4636 params->kqwlp_cpu_percent > 100 ||
4637 params->kqwlp_cpu_refillms <= 0 ||
4638 params->kqwlp_cpu_refillms > 0x00ffffff)) {
4639 error = EINVAL;
4640 break;
4641 }
4642
4643 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
4644 trp.trp_flags |= TRP_PRIORITY;
4645 trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
4646 }
4647 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
4648 trp.trp_flags |= TRP_POLICY;
4649 trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
4650 }
4651 if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
4652 trp.trp_flags |= TRP_CPUPERCENT;
4653 trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
4654 trp.trp_refillms = params->kqwlp_cpu_refillms;
4655 }
4656
4657 error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
4658 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4659 KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
4660 if (error) {
4661 break;
4662 }
4663
4664 if (!(fdp->fd_flags & FD_WORKLOOP)) {
4665 /* FD_WORKLOOP indicates we've created a workloop via this
4666 * syscall at some point; it's only ever added to a process, never
4667 * removed.
4668 */
4669 proc_fdlock(p);
4670 fdp->fd_flags |= FD_WORKLOOP;
4671 proc_fdunlock(p);
4672 }
4673 break;
4674 case KQ_WORKLOOP_DESTROY:
4675 error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
4676 KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
4677 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
4678 if (error) {
4679 break;
4680 }
4681 kqlock(kqwl);
4682 trp.trp_value = kqwl->kqwl_params;
4683 if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
4684 trp.trp_flags |= TRP_RELEASED;
4685 kqwl->kqwl_params = trp.trp_value;
4686 kqworkloop_release_live(kqwl);
4687 } else {
4688 error = EINVAL;
4689 }
4690 kqunlock(kqwl);
4691 kqworkloop_release(kqwl);
4692 break;
4693 }
4694 *retval = 0;
4695 return error;
4696 }
4697
4698 int
4699 kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
4700 {
4701 struct kqueue_workloop_params params = {
4702 .kqwlp_id = 0,
4703 };
4704 if (uap->sz < sizeof(params.kqwlp_version)) {
4705 return EINVAL;
4706 }
4707
4708 size_t copyin_sz = MIN(sizeof(params), uap->sz);
4709 int rv = copyin(uap->addr, &params, copyin_sz);
4710 if (rv) {
4711 return rv;
4712 }
4713
4714 if (params.kqwlp_version != (int)uap->sz) {
4715 return EINVAL;
4716 }
4717
4718 return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
4719 retval);
4720 }
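/*
 * Illustrative sketch (not part of this file, and normally issued by
 * libdispatch/libpthread rather than directly): the parameter block the
 * copyin/validation above expects for KQ_WORKLOOP_CREATE. `my_id` is a
 * hypothetical 64-bit workloop identifier.
 *
 *	struct kqueue_workloop_params p = {
 *		.kqwlp_version   = sizeof(p),	// must equal the passed-in sz
 *		.kqwlp_id        = my_id,
 *		.kqwlp_flags     = KQ_WORKLOOP_CREATE_SCHED_PRI,
 *		.kqwlp_sched_pri = 31,		// checked above: 1..63
 *	};
 *	// cmd = KQ_WORKLOOP_CREATE, options = 0, addr = &p, sz = sizeof(p)
 */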
4721
4722 /*ARGSUSED*/
4723 static int
4724 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
4725 __unused vfs_context_t ctx)
4726 {
4727 struct kqfile *kq = (struct kqfile *)fp->f_data;
4728 struct kqtailq *suppressq = &kq->kqf_suppressed;
4729 struct kqtailq *queue = &kq->kqf_queue;
4730 struct knote *kn;
4731 int retnum = 0;
4732
4733 if (which != FREAD) {
4734 return 0;
4735 }
4736
4737 kqlock(kq);
4738
4739 assert((kq->kqf_state & KQ_WORKQ) == 0);
4740
4741 /*
4742 * If this is the first pass, link the wait queue associated with the
4743 * kqueue onto the wait queue set for the select(). Normally we
4744 * use selrecord() for this, but it uses the wait queue within the
4745 * selinfo structure and we need to use the main one for the kqueue to
4746 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
4747 * (The select() call will unlink them when it ends).
4748 */
4749 if (wq_link_id != NULL) {
4750 thread_t cur_act = current_thread();
4751 struct uthread * ut = get_bsdthread_info(cur_act);
4752
4753 kq->kqf_state |= KQ_SEL;
4754 waitq_link((struct waitq *)&kq->kqf_wqs, ut->uu_wqset,
4755 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
4756
4757 /* always consume the reserved link object */
4758 waitq_link_release(*(uint64_t *)wq_link_id);
4759 *(uint64_t *)wq_link_id = 0;
4760
4761 /*
4762 * selprocess() is expecting that we send it back the waitq
4763 * that was just added to the thread's waitq set. In order
4764 * to not change the selrecord() API (which is exported to
4765 * kexts), we pass this value back through the
4766 * void *wq_link_id pointer we were passed. We need to use
4767 * memcpy here because the pointer may not be properly aligned
4768 * on 32-bit systems.
4769 */
4770 void *wqptr = &kq->kqf_wqs;
4771 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
4772 }
4773
4774 if (kqfile_begin_processing(kq) == -1) {
4775 kqunlock(kq);
4776 return 0;
4777 }
4778
4779 if (!TAILQ_EMPTY(queue)) {
4780 /*
4781 * there is something queued - but it might be a
4782 * KN_STAYACTIVE knote, which may or may not have
4783 * any events pending. Otherwise, we have to walk
4784 * the list of knotes to see, and peek at the
4785 * (non-vanished) stay-active ones to be really sure.
4786 */
4787 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
4788 if (kn->kn_status & KN_ACTIVE) {
4789 retnum = 1;
4790 goto out;
4791 }
4792 assert(kn->kn_status & KN_STAYACTIVE);
4793 knote_suppress(kq, kn);
4794 }
4795
4796 /*
4797 * There were no regular events on the queue, so take
4798 * a deeper look at the stay-queued ones we suppressed.
4799 */
4800 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
4801 KNOTE_LOCK_CTX(knlc);
4802 int result = 0;
4803
4804 /* If it didn't vanish while suppressed, peek at it */
4805 if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
4806 KNOTE_KQ_LOCK_ON_FAILURE)) {
4807 continue;
4808 }
4809
4810 result = filter_call(knote_fops(kn), f_peek(kn));
4811
4812 kqlock(kq);
4813 knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
4814
4815 /* unsuppress it */
4816 knote_unsuppress(kq, kn);
4817
4818 /* has data or it has to report a vanish */
4819 if (result & FILTER_ACTIVE) {
4820 retnum = 1;
4821 goto out;
4822 }
4823 }
4824 }
4825
4826 out:
4827 kqfile_end_processing(kq);
4828 kqunlock(kq);
4829 return retnum;
4830 }
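/*
 * Illustrative userspace sketch (not part of this file): the path above is
 * what makes a plain kqueue file descriptor selectable. `kqfd` is a
 * hypothetical descriptor returned by kqueue().
 *
 *	fd_set rfds;
 *	FD_ZERO(&rfds);
 *	FD_SET(kqfd, &rfds);
 *	if (select(kqfd + 1, &rfds, NULL, NULL, NULL) > 0 &&
 *	    FD_ISSET(kqfd, &rfds)) {
 *		// at least one event appears pending; a following
 *		// kevent() call with a zero timeout should return it
 *	}
 */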
4831
4832 /*
4833 * kqueue_close -
4834 */
4835 /*ARGSUSED*/
4836 static int
4837 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
4838 {
4839 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
4840
4841 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4842 kqueue_dealloc(&kqf->kqf_kqueue);
4843 fg->fg_data = NULL;
4844 return 0;
4845 }
4846
4847 /*
4848 * Max depth of the nested kq path that can be created.
4849 * Note that this has to be less than the size of kq_level
4850 * to avoid wrapping around and mislabeling the level.
4851 */
4852 #define MAX_NESTED_KQ 1000
4853
4854 /*ARGSUSED*/
4855 /*
4856 * The caller has taken a use-count reference on this kqueue and will donate it
4857 * to the kqueue we are being added to. This keeps the kqueue from closing until
4858 * that relationship is torn down.
4859 */
4860 static int
4861 kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
4862 __unused struct kevent_qos_s *kev)
4863 {
4864 struct kqfile *kqf = (struct kqfile *)fp->f_data;
4865 struct kqueue *kq = &kqf->kqf_kqueue;
4866 struct kqueue *parentkq = knote_get_kq(kn);
4867
4868 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4869
4870 if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
4871 knote_set_error(kn, EINVAL);
4872 return 0;
4873 }
4874
4875 /*
4876 * We have to avoid creating a cycle when nesting kqueues
4877 * inside another. Rather than trying to walk the whole
4878 * potential DAG of nested kqueues, we just use a simple
4879 * ceiling protocol. When a kqueue is inserted into another,
4880 * we check that the (future) parent is not already nested
4881 * into another kqueue at a lower level than the potential
4882 * child (because it could indicate a cycle). If that test
4883 * passes, we just mark the nesting levels accordingly.
4884 *
4885 * Only up to MAX_NESTED_KQ can be nested.
4886 *
4887 * Note: kqworkq and kqworkloop cannot be nested and have reused their
4888 * kq_level field, so ignore these as parents.
4889 */
4890
4891 kqlock(parentkq);
4892
4893 if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
4894 if (parentkq->kq_level > 0 &&
4895 parentkq->kq_level < kq->kq_level) {
4896 kqunlock(parentkq);
4897 knote_set_error(kn, EINVAL);
4898 return 0;
4899 }
4900
4901 /* set parent level appropriately */
4902 uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
4903 if (plevel < kq->kq_level + 1) {
4904 if (kq->kq_level + 1 > MAX_NESTED_KQ) {
4905 kqunlock(parentkq);
4906 knote_set_error(kn, EINVAL);
4907 return 0;
4908 }
4909 plevel = kq->kq_level + 1;
4910 }
4911
4912 parentkq->kq_level = plevel;
4913 }
4914
4915 kqunlock(parentkq);
4916
4917 kn->kn_filtid = EVFILTID_KQREAD;
4918 kqlock(kq);
4919 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
4920 /* indicate nesting in child, if needed */
4921 if (kq->kq_level == 0) {
4922 kq->kq_level = 1;
4923 }
4924
4925 int count = kq->kq_count;
4926 kqunlock(kq);
4927 return count > 0;
4928 }
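/*
 * Illustrative userspace sketch (not part of this file): nesting one kqueue
 * inside another, which is what drives the level-ceiling checks above
 * (at most MAX_NESTED_KQ levels deep).
 *
 *	int inner = kqueue();
 *	int outer = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(outer, &kev, 1, NULL, 0, NULL);
 *	// `outer` now reports EVFILT_READ whenever `inner` has pending
 *	// events, with data reflecting how many are queued.
 */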
4929
4930 /*
4931 * kqueue_drain - called when kq is closed
4932 */
4933 /*ARGSUSED*/
4934 static int
4935 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
4936 {
4937 struct kqfile *kqf = (struct kqfile *)fp->fp_glob->fg_data;
4938
4939 assert((kqf->kqf_state & KQ_WORKQ) == 0);
4940
4941 kqlock(kqf);
4942 kqf->kqf_state |= KQ_DRAIN;
4943
4944 /* wakeup sleeping threads */
4945 if ((kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) != 0) {
4946 kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
4947 (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
4948 KQ_EVENT,
4949 THREAD_RESTART,
4950 WAITQ_ALL_PRIORITIES);
4951 }
4952
4953 /* wakeup threads waiting their turn to process */
4954 if (kqf->kqf_state & KQ_PROCWAIT) {
4955 assert(kqf->kqf_state & KQ_PROCESSING);
4956
4957 kqf->kqf_state &= ~KQ_PROCWAIT;
4958 (void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
4959 CAST_EVENT64_T(&kqf->kqf_suppressed),
4960 THREAD_RESTART, WAITQ_ALL_PRIORITIES);
4961 }
4962
4963 kqunlock(kqf);
4964 return 0;
4965 }
4966
4967 /*ARGSUSED*/
4968 int
4969 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
4970 {
4971 assert((kq->kq_state & KQ_WORKQ) == 0);
4972
4973 kqlock(kq);
4974 if (isstat64 != 0) {
4975 struct stat64 *sb64 = (struct stat64 *)ub;
4976
4977 bzero((void *)sb64, sizeof(*sb64));
4978 sb64->st_size = kq->kq_count;
4979 if (kq->kq_state & KQ_KEV_QOS) {
4980 sb64->st_blksize = sizeof(struct kevent_qos_s);
4981 } else if (kq->kq_state & KQ_KEV64) {
4982 sb64->st_blksize = sizeof(struct kevent64_s);
4983 } else if (IS_64BIT_PROCESS(p)) {
4984 sb64->st_blksize = sizeof(struct user64_kevent);
4985 } else {
4986 sb64->st_blksize = sizeof(struct user32_kevent);
4987 }
4988 sb64->st_mode = S_IFIFO;
4989 } else {
4990 struct stat *sb = (struct stat *)ub;
4991
4992 bzero((void *)sb, sizeof(*sb));
4993 sb->st_size = kq->kq_count;
4994 if (kq->kq_state & KQ_KEV_QOS) {
4995 sb->st_blksize = sizeof(struct kevent_qos_s);
4996 } else if (kq->kq_state & KQ_KEV64) {
4997 sb->st_blksize = sizeof(struct kevent64_s);
4998 } else if (IS_64BIT_PROCESS(p)) {
4999 sb->st_blksize = sizeof(struct user64_kevent);
5000 } else {
5001 sb->st_blksize = sizeof(struct user32_kevent);
5002 }
5003 sb->st_mode = S_IFIFO;
5004 }
5005 kqunlock(kq);
5006 return 0;
5007 }
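/*
 * Illustrative userspace sketch (not part of this file): what the stat
 * fields filled in above look like to a caller. `kqfd` is hypothetical.
 *
 *	struct stat sb;
 *	if (fstat(kqfd, &sb) == 0) {
 *		// S_ISFIFO(sb.st_mode) is true for a kqueue;
 *		// sb.st_size is the number of pending events, and
 *		// sb.st_blksize reflects the kevent structure size in use.
 *	}
 */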
5008
5009 static inline bool
5010 kqueue_threadreq_can_use_ast(struct kqueue *kq)
5011 {
5012 if (current_proc() == kq->kq_p) {
5013 /*
5014 * Setting an AST from a non-BSD syscall is unsafe: mach_msg_trap() can
5015 * do combined send/receive and, in the case of self-IPC, the AST may be
5016 * set on a thread that will not return to userspace and needs the
5017 * thread the AST would create to unblock itself.
5018 *
5019 * At this time, we really want to target:
5020 *
5021 * - kevent variants that can cause thread creations, and dispatch
5022 * really only uses kevent_qos and kevent_id,
5023 *
5024 * - workq_kernreturn (directly about thread creations)
5025 *
5026 * - bsdthread_ctl which is used for qos changes and has direct impact
5027 * on the creator thread scheduling decisions.
5028 */
5029 switch (current_uthread()->syscall_code) {
5030 case SYS_kevent_qos:
5031 case SYS_kevent_id:
5032 case SYS_workq_kernreturn:
5033 case SYS_bsdthread_ctl:
5034 return true;
5035 }
5036 }
5037 return false;
5038 }
5039
5040 /*
5041 * Interact with the pthread kext to request a servicing there at a specific QoS
5042 * level.
5043 *
5044 * - Caller holds the workq request lock
5045 *
5046 * - May be called with the kqueue's wait queue set locked,
5047 * so cannot do anything that could recurse on that.
5048 */
5049 static void
5050 kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t kqr,
5051 kq_index_t qos, int flags)
5052 {
5053 assert(kqr->tr_kq_wakeup);
5054 assert(kqr_thread(kqr) == THREAD_NULL);
5055 assert(!kqr_thread_requested(kqr));
5056 struct turnstile *ts = TURNSTILE_NULL;
5057
5058 if (workq_is_exiting(kq->kq_p)) {
5059 return;
5060 }
5061
5062 kqlock_held(kq);
5063
5064 if (kq->kq_state & KQ_WORKLOOP) {
5065 __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;
5066
5067 assert(kqwl->kqwl_owner == THREAD_NULL);
5068 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
5069 kqwl->kqwl_dynamicid, 0, qos, kqr->tr_kq_wakeup);
5070 ts = kqwl->kqwl_turnstile;
5071 /* Add a thread request reference on the kqueue. */
5072 kqworkloop_retain(kqwl);
5073 } else {
5074 assert(kq->kq_state & KQ_WORKQ);
5075 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
5076 -1, 0, qos, kqr->tr_kq_wakeup);
5077 }
5078
5079 /*
5080 * New-style thread request supported.
5081 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5082 * its use until a corresponding kqueue_threadreq_bind callback.
5083 */
5084 if (kqueue_threadreq_can_use_ast(kq)) {
5085 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5086 }
5087 if (qos == KQWQ_QOS_MANAGER) {
5088 qos = WORKQ_THREAD_QOS_MANAGER;
5089 }
5090 if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
5091 /*
5092 * Process is shutting down or exec'ing.
5093 * All the kqueues are going to be cleaned up
5094 * soon. Forget we even asked for a thread -
5095 * and make sure we don't ask for more.
5096 */
5097 kq->kq_state &= ~KQ_R2K_ARMED;
5098 kqueue_release_live(kq);
5099 }
5100 }
5101
5102 /*
5103 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
5104 *
5105 * This is used when kqueue_threadreq_bind may cause a lock inversion.
5106 */
5107 __attribute__((always_inline))
5108 void
5109 kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
5110 struct uthread *ut)
5111 {
5112 ut->uu_kqr_bound = kqr;
5113 kqr->tr_thread = ut->uu_thread;
5114 kqr->tr_state = WORKQ_TR_STATE_BINDING;
5115 }
5116
5117 /*
5118 * kqueue_threadreq_bind_commit - commit a bind prepost
5119 *
5120 * The workq code has to commit any binding prepost before the thread has
5121 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
5122 */
5123 void
5124 kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
5125 {
5126 struct uthread *ut = get_bsdthread_info(thread);
5127 workq_threadreq_t kqr = ut->uu_kqr_bound;
5128 kqueue_t kqu = kqr_kqueue(p, kqr);
5129
5130 kqlock(kqu);
5131 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5132 kqueue_threadreq_bind(p, kqr, thread, 0);
5133 }
5134 kqunlock(kqu);
5135 }
5136
5137 static void
5138 kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
5139 workq_kern_threadreq_flags_t flags)
5140 {
5141 assert(kqr_thread_requested_pending(kqr));
5142
5143 kqlock_held(kqu);
5144
5145 if (kqueue_threadreq_can_use_ast(kqu.kq)) {
5146 flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
5147 }
5148 workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
5149 }
5150
5151 /*
5152 * kqueue_threadreq_bind - bind thread to processing kqrequest
5153 *
5154 * The provided thread will be responsible for delivering events
5155 * associated with the given kqrequest. Bind it and get ready for
5156 * the thread to eventually arrive.
5157 */
5158 void
5159 kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
5160 unsigned int flags)
5161 {
5162 kqueue_t kqu = kqr_kqueue(p, kqr);
5163 struct uthread *ut = get_bsdthread_info(thread);
5164
5165 kqlock_held(kqu);
5166
5167 assert(ut->uu_kqueue_override == 0);
5168
5169 if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
5170 assert(ut->uu_kqr_bound == kqr);
5171 assert(kqr->tr_thread == thread);
5172 } else {
5173 assert(kqr_thread_requested_pending(kqr));
5174 assert(kqr->tr_thread == THREAD_NULL);
5175 assert(ut->uu_kqr_bound == NULL);
5176 ut->uu_kqr_bound = kqr;
5177 kqr->tr_thread = thread;
5178 }
5179
5180 kqr->tr_state = WORKQ_TR_STATE_BOUND;
5181
5182 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5183 struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
5184
5185 if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
5186 /*
5187 * <rdar://problem/38626999> shows that asserting here is not ok.
5188 *
5189 * This is not supposed to happen for correct use of the interface,
5190 * but it is sadly possible for userspace (with the help of memory
5191 * corruption, such as over-release of a dispatch queue) to make
5192 * the creator thread the "owner" of a workloop.
5193 *
5194 * Once that happens, and that creator thread picks up the same
5195 * workloop as a servicer, we trip this codepath. We need to fix up
5196 * the state to forget about this thread being the owner, as the
5197 * entire workloop state machine expects servicers to never be
5198 * owners and everything would basically go downhill from here.
5199 */
5200 kqu.kqwl->kqwl_owner = THREAD_NULL;
5201 if (kqworkloop_override(kqu.kqwl)) {
5202 thread_drop_kevent_override(thread);
5203 }
5204 }
5205
5206 if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
5207 /*
5208 * Past this point, the interlock is the kq req lock again,
5209 * so we can fix the inheritor for good.
5210 */
5211 filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5212 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
5213 }
5214
5215 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
5216 thread_tid(thread), kqr->tr_kq_qos_index,
5217 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5218
5219 ut->uu_kqueue_override = kqr->tr_kq_override_index;
5220 if (kqr->tr_kq_override_index) {
5221 thread_add_servicer_override(thread, kqr->tr_kq_override_index);
5222 }
5223 } else {
5224 assert(kqr->tr_kq_override_index == 0);
5225
5226 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
5227 thread_tid(thread), kqr->tr_kq_qos_index,
5228 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5229 }
5230 }
5231
5232 /*
5233 * kqueue_threadreq_cancel - abort a pending thread request
5234 *
5235 * Called when exiting/exec'ing. Forget our pending request.
5236 */
5237 void
5238 kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
5239 {
5240 kqueue_release(kqr_kqueue(p, kqr));
5241 }
5242
5243 workq_threadreq_param_t
5244 kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
5245 {
5246 struct kqworkloop *kqwl;
5247 workq_threadreq_param_t trp;
5248
5249 assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
5250 kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
5251 trp.trp_value = kqwl->kqwl_params;
5252 return trp;
5253 }
5254
5255 /*
5256 * kqueue_threadreq_unbind - unbind thread from processing kqueue
5257 *
5258 * End processing the per-QoS bucket of events and allow other threads
5259 * to be requested for future servicing.
5260 *
5261 * caller holds a reference on the kqueue.
5262 */
5263 void
5264 kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
5265 {
5266 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
5267 kqworkloop_unbind(kqr_kqworkloop(kqr));
5268 } else {
5269 kqworkq_unbind(p, kqr);
5270 }
5271 }
5272
5273 /*
5274 * If we aren't already busy processing events [for this QoS],
5275 * request workq thread support as appropriate.
5276 *
5277 * TBD - for now, we don't segregate out processing by QoS.
5278 *
5279 * - May be called with the kqueue's wait queue set locked,
5280 * so cannot do anything that could recurse on that.
5281 */
5282 static void
5283 kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
5284 {
5285 workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);
5286
5287 /* convert to thread qos value */
5288 assert(qos_index < KQWQ_NBUCKETS);
5289
5290 if (!kqr->tr_kq_wakeup) {
5291 kqr->tr_kq_wakeup = true;
5292 if (!kqr_thread_requested(kqr)) {
5293 kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
5294 }
5295 }
5296 }
5297
5298 /*
5299 * This represents the asynchronous QoS a given workloop contributes,
5300 * hence is the max of the current active knotes (override index)
5301 * and the workloop max qos (userspace async qos).
5302 */
5303 static kq_index_t
5304 kqworkloop_override(struct kqworkloop *kqwl)
5305 {
5306 workq_threadreq_t kqr = &kqwl->kqwl_request;
5307 return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
5308 }
5309
5310 static inline void
5311 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
5312 {
5313 workq_threadreq_t kqr = &kqwl->kqwl_request;
5314
5315 kqlock_held(kqwl);
5316
5317 if (kqwl->kqwl_state & KQ_R2K_ARMED) {
5318 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5319 act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
5320 }
5321 }
5322
5323 static void
5324 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
5325 {
5326 workq_threadreq_t kqr = &kqwl->kqwl_request;
5327 struct kqueue *kq = &kqwl->kqwl_kqueue;
5328 kq_index_t old_override = kqworkloop_override(kqwl);
5329 kq_index_t i;
5330
5331 kqlock_held(kqwl);
5332
5333 switch (op) {
5334 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
5335 if (qos == KQWL_BUCKET_STAYACTIVE) {
5336 /*
5337 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket; we only remember
5338 * a high watermark (kqwl_stayactive_qos) of any stay active knote
5339 * that was ever registered with this workloop.
5340 *
5341 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
5342 * knote, we use this high-watermark as a wakeup-index, and also set
5343 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
5344 * that at least one stay active knote has fired until the next full
5345 * processing of this bucket.
5346 */
5347 kqwl->kqwl_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
5348 qos = kqwl->kqwl_stayactive_qos;
5349 assert(qos);
5350 }
5351 if (kqwl->kqwl_wakeup_indexes & (1 << qos)) {
5352 assert(kqr->tr_kq_wakeup);
5353 break;
5354 }
5355
5356 kqwl->kqwl_wakeup_indexes |= (1 << qos);
5357 kqr->tr_kq_wakeup = true;
5358 kqworkloop_request_fire_r2k_notification(kqwl);
5359 goto recompute;
5360
5361 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
5362 assert(qos);
5363 if (kqwl->kqwl_stayactive_qos < qos) {
5364 kqwl->kqwl_stayactive_qos = qos;
5365 if (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
5366 assert(kqr->tr_kq_wakeup);
5367 kqwl->kqwl_wakeup_indexes |= (1 << qos);
5368 goto recompute;
5369 }
5370 }
5371 break;
5372
5373 case KQWL_UTQ_PARKING:
5374 case KQWL_UTQ_UNBINDING:
5375 kqr->tr_kq_override_index = qos;
5376 OS_FALLTHROUGH;
5377 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
5378 if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
5379 assert(qos == THREAD_QOS_UNSPECIFIED);
5380 }
5381 i = KQWL_BUCKET_STAYACTIVE;
5382 if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5383 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5384 }
5385 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
5386 (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
5387 /*
5388 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
5389 * knote may have fired, so we need to merge in kqwl_stayactive_qos.
5390 *
5391 * Unlike other buckets, this one is never empty but could be idle.
5392 */
5393 kqwl->kqwl_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
5394 kqwl->kqwl_wakeup_indexes |= (1 << kqwl->kqwl_stayactive_qos);
5395 } else {
5396 kqwl->kqwl_wakeup_indexes = 0;
5397 }
5398 for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
5399 if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
5400 kqwl->kqwl_wakeup_indexes |= (1 << i);
5401 }
5402 }
5403 if (kqwl->kqwl_wakeup_indexes) {
5404 kqr->tr_kq_wakeup = true;
5405 kqworkloop_request_fire_r2k_notification(kqwl);
5406 } else {
5407 kqr->tr_kq_wakeup = false;
5408 }
5409 goto recompute;
5410
5411 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
5412 kqr->tr_kq_override_index = qos;
5413 goto recompute;
5414
5415 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
5416 recompute:
5417 /*
5418 * When modifying the wakeup QoS or the override QoS, we always need to
5419 * maintain our invariant that kqr_override_index is at least as large
5420 * as the highest QoS for which an event is fired.
5421 *
5422 * However, this override index can be larger when there is an overridden
5423 * suppressed knote pushing on the kqueue.
5424 */
5425 if (kqwl->kqwl_wakeup_indexes > (1 << qos)) {
5426 qos = (uint8_t)(fls(kqwl->kqwl_wakeup_indexes) - 1); /* fls is 1-based */
5427 }
5428 if (kqr->tr_kq_override_index < qos) {
5429 kqr->tr_kq_override_index = qos;
5430 }
5431 break;
5432
5433 case KQWL_UTQ_REDRIVE_EVENTS:
5434 break;
5435
5436 case KQWL_UTQ_SET_QOS_INDEX:
5437 kqr->tr_kq_qos_index = qos;
5438 break;
5439
5440 default:
5441 panic("unknown kqwl thread qos update operation: %d", op);
5442 }
5443
5444 thread_t kqwl_owner = kqwl->kqwl_owner;
5445 thread_t servicer = kqr_thread(kqr);
5446 boolean_t qos_changed = FALSE;
5447 kq_index_t new_override = kqworkloop_override(kqwl);
5448
5449 /*
5450 * Apply the diffs to the owner if applicable
5451 */
5452 if (kqwl_owner) {
5453 #if 0
5454 /* JMM - need new trace hooks for owner overrides */
5455 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
5456 kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
5457 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5458 #endif
5459 if (new_override == old_override) {
5460 // nothing to do
5461 } else if (old_override == THREAD_QOS_UNSPECIFIED) {
5462 thread_add_kevent_override(kqwl_owner, new_override);
5463 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5464 thread_drop_kevent_override(kqwl_owner);
5465 } else { /* old_override != new_override */
5466 thread_update_kevent_override(kqwl_owner, new_override);
5467 }
5468 }
5469
5470 /*
5471 * apply the diffs to the servicer
5472 */
5473 if (!kqr_thread_requested(kqr)) {
5474 /*
5475 * No servicer, nor thread-request
5476 *
5477 * Make a new thread request, unless there is an owner (or the workloop
5478 * is suspended in userland) or there is no asynchronous work in the
5479 * first place.
5480 */
5481
5482 if (kqwl_owner == NULL && kqr->tr_kq_wakeup) {
5483 int initiate_flags = 0;
5484 if (op == KQWL_UTQ_UNBINDING) {
5485 initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
5486 }
5487 kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
5488 }
5489 } else if (servicer) {
5490 /*
5491 * Servicer in flight
5492 *
5493 * Just apply the diff to the servicer
5494 */
5495 struct uthread *ut = get_bsdthread_info(servicer);
5496 if (ut->uu_kqueue_override != new_override) {
5497 if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
5498 thread_add_servicer_override(servicer, new_override);
5499 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5500 thread_drop_servicer_override(servicer);
5501 } else { /* ut->uu_kqueue_override != new_override */
5502 thread_update_servicer_override(servicer, new_override);
5503 }
5504 ut->uu_kqueue_override = new_override;
5505 qos_changed = TRUE;
5506 }
5507 } else if (new_override == THREAD_QOS_UNSPECIFIED) {
5508 /*
5509 * No events to deliver anymore.
5510 *
5511 * However, canceling with turnstiles is challenging, so the fact that
5512 * the request isn't useful will be discovered by the servicer itself
5513 * later on.
5514 */
5515 } else if (old_override != new_override) {
5516 /*
5517 * Request is in flight
5518 *
5519 * Apply the diff to the thread request
5520 */
5521 kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
5522 qos_changed = TRUE;
5523 }
5524
5525 if (qos_changed) {
5526 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
5527 thread_tid(servicer), kqr->tr_kq_qos_index,
5528 (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
5529 }
5530 }
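/*
 * Worked example for the wakeup-index bookkeeping above (numbers are
 * illustrative only): with events queued at QoS indexes 3 and 5,
 * kqwl_wakeup_indexes == (1 << 3) | (1 << 5) == 0x28. In the recompute
 * path, fls(0x28) - 1 == 5, so tr_kq_override_index is raised to at
 * least 5, which kqworkloop_override() then combines with
 * tr_kq_qos_index via MAX().
 */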
5531
5532 static void
5533 kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
5534 {
5535 if ((kqwl->kqwl_state & KQ_PROCESSING) &&
5536 kqr_thread(&kqwl->kqwl_request) == current_thread()) {
5537 /*
5538 * kqworkloop_end_processing() will perform the required QoS
5539 * computations when it unsets the processing mode.
5540 */
5541 return;
5542 }
5543
5544 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
5545 }
5546
5547 static struct kqtailq *
5548 kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
5549 {
5550 if (kq.kq->kq_state & KQ_WORKLOOP) {
5551 return &kq.kqwl->kqwl_suppressed;
5552 } else if (kq.kq->kq_state & KQ_WORKQ) {
5553 return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index];
5554 } else {
5555 return &kq.kqf->kqf_suppressed;
5556 }
5557 }
5558
5559 struct turnstile *
5560 kqueue_alloc_turnstile(kqueue_t kqu)
5561 {
5562 struct kqworkloop *kqwl = kqu.kqwl;
5563 kq_state_t kq_state;
5564
5565 kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
5566 if (kq_state & KQ_HAS_TURNSTILE) {
5567 /* force a dependency to pair with the release-ordered atomic OR below */
5568 return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
5569 (uintptr_t)kq_state);
5570 }
5571
5572 if (!(kq_state & KQ_WORKLOOP)) {
5573 return TURNSTILE_NULL;
5574 }
5575
5576 struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
5577 bool workq_locked = false;
5578
5579 kqlock(kqu);
5580
5581 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5582 workq_locked = true;
5583 workq_kern_threadreq_lock(kqwl->kqwl_p);
5584 }
5585
5586 if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
5587 free_ts = ts;
5588 ts = kqwl->kqwl_turnstile;
5589 } else {
5590 ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
5591 ts, TURNSTILE_WORKLOOPS);
5592
5593 /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
5594 os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);
5595
5596 if (filt_wlturnstile_interlock_is_workq(kqwl)) {
5597 workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
5598 &kqwl->kqwl_request, kqwl->kqwl_owner,
5599 ts, TURNSTILE_IMMEDIATE_UPDATE);
5600 /*
5601 * The workq may no longer be the interlock after this.
5602 * In which case the inheritor wasn't updated.
5603 */
5604 }
5605 if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
5606 filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
5607 }
5608 }
5609
5610 if (workq_locked) {
5611 workq_kern_threadreq_unlock(kqwl->kqwl_p);
5612 }
5613
5614 kqunlock(kqu);
5615
5616 if (free_ts) {
5617 turnstile_deallocate(free_ts);
5618 } else {
5619 turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
5620 }
5621 return ts;
5622 }
5623
5624 __attribute__((always_inline))
5625 struct turnstile *
5626 kqueue_turnstile(kqueue_t kqu)
5627 {
5628 kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
5629 if (kq_state & KQ_WORKLOOP) {
5630 return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
5631 }
5632 return TURNSTILE_NULL;
5633 }
5634
5635 __attribute__((always_inline))
5636 struct turnstile *
5637 kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
5638 {
5639 struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
5640 if (kqwl) {
5641 return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
5642 }
5643 return TURNSTILE_NULL;
5644 }
5645
5646 static void
5647 kqworkloop_set_overcommit(struct kqworkloop *kqwl)
5648 {
5649 workq_threadreq_t kqr = &kqwl->kqwl_request;
5650
5651 /*
5652 * This test is racy, but since we never remove this bit,
5653 * it allows us to avoid taking a lock.
5654 */
5655 if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
5656 return;
5657 }
5658
5659 kqlock_held(kqwl);
5660
5661 if (kqr_thread_requested_pending(kqr)) {
5662 kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
5663 WORKQ_THREADREQ_MAKE_OVERCOMMIT);
5664 } else {
5665 kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
5666 }
5667 }
5668
5669 static void
5670 kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
5671 kq_index_t override_index)
5672 {
5673 workq_threadreq_t kqr;
5674 kq_index_t old_override_index;
5675 kq_index_t queue_index = kn->kn_qos_index;
5676
5677 if (override_index <= queue_index) {
5678 return;
5679 }
5680
5681 kqr = kqworkq_get_request(kqwq, queue_index);
5682
5683 kqlock_held(kqwq);
5684
5685 old_override_index = kqr->tr_kq_override_index;
5686 if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
5687 thread_t servicer = kqr_thread(kqr);
5688 kqr->tr_kq_override_index = override_index;
5689
5690 /* apply the override to the (possibly incoming) servicing thread */
5691 if (servicer) {
5692 if (old_override_index) {
5693 thread_update_kevent_override(servicer, override_index);
5694 } else {
5695 thread_add_kevent_override(servicer, override_index);
5696 }
5697 }
5698 }
5699 }
5700
5701 static void
5702 kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
5703 {
5704 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5705 kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
5706 qos);
5707 } else {
5708 kqworkq_update_override(kqu.kqwq, kn, qos);
5709 }
5710 }
5711
5712 static void
5713 kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
5714 enum kqwl_unbind_locked_mode how)
5715 {
5716 struct uthread *ut = get_bsdthread_info(thread);
5717 workq_threadreq_t kqr = &kqwl->kqwl_request;
5718
5719 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
5720 thread_tid(thread), 0, 0);
5721
5722 kqlock_held(kqwl);
5723
5724 assert(ut->uu_kqr_bound == kqr);
5725 ut->uu_kqr_bound = NULL;
5726 if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
5727 ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5728 thread_drop_servicer_override(thread);
5729 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5730 }
5731
5732 if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
5733 turnstile_update_inheritor(kqwl->kqwl_turnstile,
5734 TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
5735 turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
5736 TURNSTILE_INTERLOCK_HELD);
5737 }
5738
5739 kqr->tr_thread = THREAD_NULL;
5740 kqr->tr_state = WORKQ_TR_STATE_IDLE;
5741 kqwl->kqwl_state &= ~KQ_R2K_ARMED;
5742 }
5743
5744 static void
5745 kqworkloop_unbind_delayed_override_drop(thread_t thread)
5746 {
5747 struct uthread *ut = get_bsdthread_info(thread);
5748 assert(ut->uu_kqr_bound == NULL);
5749 if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
5750 thread_drop_servicer_override(thread);
5751 ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
5752 }
5753 }
5754
5755 /*
5756 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
5757 *
5758 * It will acknowledge events, and possibly request a new thread if:
5759 * - there were active events left
5760 * - we pended waitq hook callouts during processing
5761 * - we pended wakeups while processing (or unsuppressing)
5762 *
5763 * Called with kqueue lock held.
5764 */
5765 static void
5766 kqworkloop_unbind(struct kqworkloop *kqwl)
5767 {
5768 struct kqueue *kq = &kqwl->kqwl_kqueue;
5769 workq_threadreq_t kqr = &kqwl->kqwl_request;
5770 thread_t thread = kqr_thread_fast(kqr);
5771 int op = KQWL_UTQ_PARKING;
5772 kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;
5773
5774 assert(thread == current_thread());
5775
5776 kqlock(kqwl);
5777
5778 /*
5779 * Forcing the KQ_PROCESSING flag prevents the QoS updates caused by
5780 * unsuppressing knotes from being applied until the eventual call to
5781 * kqworkloop_update_threads_qos() below.
5782 */
5783 assert((kq->kq_state & KQ_PROCESSING) == 0);
5784 if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
5785 kq->kq_state |= KQ_PROCESSING;
5786 qos_override = kqworkloop_acknowledge_events(kqwl);
5787 kq->kq_state &= ~KQ_PROCESSING;
5788 }
5789
5790 kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
5791 kqworkloop_update_threads_qos(kqwl, op, qos_override);
5792
5793 kqunlock(kqwl);
5794
5795 /*
5796 * Drop the override on the current thread last, after the call to
5797 * kqworkloop_update_threads_qos above.
5798 */
5799 kqworkloop_unbind_delayed_override_drop(thread);
5800
5801 /* If last reference, dealloc the workloop kq */
5802 kqworkloop_release(kqwl);
5803 }
5804
5805 static thread_qos_t
5806 kqworkq_unbind_locked(struct kqworkq *kqwq,
5807 workq_threadreq_t kqr, thread_t thread)
5808 {
5809 struct uthread *ut = get_bsdthread_info(thread);
5810 kq_index_t old_override = kqr->tr_kq_override_index;
5811
5812 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
5813 thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);
5814
5815 kqlock_held(kqwq);
5816
5817 assert(ut->uu_kqr_bound == kqr);
5818 ut->uu_kqr_bound = NULL;
5819 kqr->tr_thread = THREAD_NULL;
5820 kqr->tr_state = WORKQ_TR_STATE_IDLE;
5821 kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
5822 kqwq->kqwq_state &= ~KQ_R2K_ARMED;
5823
5824 return old_override;
5825 }
5826
5827 /*
5828 * kqworkq_unbind - unbind of a workq kqueue from a thread
5829 *
5830 * We may have to request new threads.
5831 * This can happen when there are no waiting processing threads and:
5832 * - there were active events we never got to (count > 0)
5833 * - we pended waitq hook callouts during processing
5834 * - we pended wakeups while processing (or unsuppressing)
5835 */
5836 static void
5837 kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
5838 {
5839 struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
5840 __assert_only int rc;
5841
5842 kqlock(kqwq);
5843 rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
5844 assert(rc == -1);
5845 kqunlock(kqwq);
5846 }
5847
5848 workq_threadreq_t
5849 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
5850 {
5851 assert(qos_index < KQWQ_NBUCKETS);
5852 return &kqwq->kqwq_request[qos_index];
5853 }
5854
5855 static void
5856 knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
5857 {
5858 kq_index_t qos = _pthread_priority_thread_qos(pp);
5859
5860 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5861 assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
5862 pp = _pthread_priority_normalize(pp);
5863 } else if (kqu.kq->kq_state & KQ_WORKQ) {
5864 if (qos == THREAD_QOS_UNSPECIFIED) {
5865 /* On workqueues, outside of QoS means MANAGER */
5866 qos = KQWQ_QOS_MANAGER;
5867 pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
5868 } else {
5869 pp = _pthread_priority_normalize(pp);
5870 }
5871 } else {
5872 pp = _pthread_unspecified_priority();
5873 qos = THREAD_QOS_UNSPECIFIED;
5874 }
5875
5876 kn->kn_qos = (int32_t)pp;
5877
5878 if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
5879 /* Never lower QoS when in "Merge" mode */
5880 kn->kn_qos_override = qos;
5881 }
5882
5883 /* only adjust in-use qos index when not suppressed */
5884 if (kn->kn_status & KN_SUPPRESSED) {
5885 kqueue_update_override(kqu, kn, qos);
5886 } else if (kn->kn_qos_index != qos) {
5887 knote_dequeue(kqu, kn);
5888 kn->kn_qos_index = qos;
5889 }
5890 }
5891
5892 static void
5893 knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
5894 {
5895 thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
5896
5897 kqlock_held(kq);
5898
5899 assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
5900 assert(qos_index < THREAD_QOS_LAST);
5901
5902 /*
5903 * Early exit for knotes that should not change QoS
5904 */
5905 if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
5906 panic("filter %d cannot change QoS", kn->kn_filtid);
5907 } else if (__improbable(!knote_has_qos(kn))) {
5908 return;
5909 }
5910
5911 /*
5912 * knotes with the FALLBACK flag will only use their registration QoS if the
5913 * incoming event has no QoS; otherwise, the registration QoS acts as a floor.
5914 */
5915 thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
5916 if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
5917 if (qos_index == THREAD_QOS_UNSPECIFIED) {
5918 qos_index = req_qos;
5919 }
5920 } else {
5921 if (qos_index < req_qos) {
5922 qos_index = req_qos;
5923 }
5924 }
5925 if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
5926 /* Never lower QoS when in "Merge" mode */
5927 return;
5928 }
5929
5930 if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
5931 /*
5932 * When we're trying to update the QoS override and both an
5933 * f_event() and other f_* calls are running concurrently, any of these
5934 * in-flight calls may want to perform overrides that aren't properly
5935 * serialized with each other.
5936 *
5937 * The first update that observes this racy situation enters a "Merge"
5938 * mode which causes subsequent override requests to saturate the
5939 * override instead of replacing its value.
5940 *
5941 * This mode is left when knote_unlock() or knote_post()
5942 * observe that no other f_* routine is in flight.
5943 */
5944 kn->kn_status |= KN_MERGE_QOS;
5945 }
5946
5947 /*
5948 * Now apply the override if it changed.
5949 */
5950
5951 if (kn->kn_qos_override == qos_index) {
5952 return;
5953 }
5954
5955 kn->kn_qos_override = qos_index;
5956
5957 if (kn->kn_status & KN_SUPPRESSED) {
5958 /*
5959 * For suppressed events, the kn_qos_index field cannot be touched as it
5960 * lets us know which suppress queue the knote is on for a kqworkq.
5961 *
5962 * Also, there's no natural push applied on the kqueues when this field
5963 * changes anyway. We hence need to apply manual overrides in this case,
5964 * which will be cleared when the events are later acknowledged.
5965 */
5966 kqueue_update_override(kq, kn, qos_index);
5967 } else if (kn->kn_qos_index != qos_index) {
5968 knote_dequeue(kq, kn);
5969 kn->kn_qos_index = qos_index;
5970 }
5971 }
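/*
 * The FALLBACK-vs-floor resolution above boils down to a small pure rule;
 * a minimal sketch, using the hypothetical helper name resolve_event_qos():
 *
 *	static thread_qos_t
 *	resolve_event_qos(thread_qos_t event_qos, thread_qos_t req_qos,
 *	    bool fallback)
 *	{
 *		if (fallback) {
 *			// registration QoS is used only when the event has none
 *			return (event_qos != THREAD_QOS_UNSPECIFIED) ?
 *			    event_qos : req_qos;
 *		}
 *		// otherwise the registration QoS acts as a floor
 *		return MAX(event_qos, req_qos);
 *	}
 */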
5972
5973 /*
5974 * Called back from waitq code when no threads waiting and the hook was set.
5975 *
5976 * Preemption is disabled - minimal work can be done in this context!!!
5977 */
5978 void
5979 waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook)
5980 {
5981 kqueue_t kqu;
5982
5983 kqu.kq = __container_of(kq_hook, struct kqueue, kq_waitq_hook);
5984 assert(kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
5985
5986 kqlock(kqu);
5987
5988 if (kqu.kq->kq_count > 0) {
5989 if (kqu.kq->kq_state & KQ_WORKLOOP) {
5990 kqworkloop_wakeup(kqu.kqwl, KQWL_BUCKET_STAYACTIVE);
5991 } else {
5992 kqworkq_wakeup(kqu.kqwq, KQWQ_QOS_MANAGER);
5993 }
5994 }
5995
5996 kqunlock(kqu);
5997 }
5998
5999 void
6000 klist_init(struct klist *list)
6001 {
6002 SLIST_INIT(list);
6003 }
6004
6005
6006 /*
6007 * Query/Post each knote in the object's list
6008 *
6009 * The object lock protects the list. It is assumed
6010 * that the filter/event routine for the object can
6011 * determine that the object is already locked (via
6012 * the hint) and not deadlock itself.
6013 *
6014 * The object lock should also hold off pending
6015 * detach/drop operations.
6016 */
6017 void
6018 knote(struct klist *list, long hint)
6019 {
6020 struct knote *kn;
6021
6022 SLIST_FOREACH(kn, list, kn_selnext) {
6023 knote_post(kn, hint);
6024 }
6025 }
6026
6027 /*
6028 * attach a knote to the specified list. Return true if this is the first entry.
6029 * The list is protected by whatever lock the object it is associated with uses.
6030 */
6031 int
6032 knote_attach(struct klist *list, struct knote *kn)
6033 {
6034 int ret = SLIST_EMPTY(list);
6035 SLIST_INSERT_HEAD(list, kn, kn_selnext);
6036 return ret;
6037 }
6038
6039 /*
6040 * detach a knote from the specified list. Return true if that was the last entry.
6041 * The list is protected by whatever lock the object it is associated with uses.
6042 */
6043 int
6044 knote_detach(struct klist *list, struct knote *kn)
6045 {
6046 SLIST_REMOVE(list, kn, knote, kn_selnext);
6047 return SLIST_EMPTY(list);
6048 }
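/*
 * A typical in-kernel event source strings these primitives together as
 * follows (minimal sketch; `struct mydev`, its list field and its lock are
 * hypothetical):
 *
 *	klist_init(&dev->dev_note);		// once, at device init
 *
 *	// filter f_attach, called with the device lock held:
 *	first = knote_attach(&dev->dev_note, kn);
 *
 *	// when the device becomes ready, still under the device lock:
 *	KNOTE(&dev->dev_note, hint);		// posts to every attached knote
 *
 *	// filter f_detach, again under the device lock:
 *	last = knote_detach(&dev->dev_note, kn);
 */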
6049
6050 /*
6051 * knote_vanish - Indicate that the source has vanished
6052 *
6053 * If the knote has requested EV_VANISHED delivery,
6054 * arrange for that. Otherwise, deliver a NOTE_REVOKE
6055 * event for backward compatibility.
6056 *
6057 * The knote is marked as having vanished, but is not
6058 * actually detached from the source in this instance.
6059 * The actual detach is deferred until the knote drop.
6060 *
6061 * Our caller already has the object lock held. Calling
6062 * the detach routine would try to take that lock
6063 * recursively - which likely is not supported.
6064 */
6065 void
6066 knote_vanish(struct klist *list, bool make_active)
6067 {
6068 struct knote *kn;
6069 struct knote *kn_next;
6070
6071 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
6072 struct kqueue *kq = knote_get_kq(kn);
6073
6074 kqlock(kq);
6075 if (__probable(kn->kn_status & KN_REQVANISH)) {
6076 /*
6077 * If EV_VANISHED delivery was requested - prepare to deliver one
6078 */
6079 kn->kn_status |= KN_VANISHED;
6080 } else {
6081 /*
6082 * Handle the legacy way to indicate that the port/portset was
6083 * deallocated or left the current Mach portspace (modern technique
6084 * is with an EV_VANISHED protocol).
6085 *
6086 * Deliver an EV_EOF event for these changes (hopefully it will get
6087 * delivered before the port name recycles to the same generation
6088 * count and someone tries to re-register a kevent for it or the
6089 * events are udata-specific - avoiding a conflict).
6090 */
6091 kn->kn_flags |= EV_EOF | EV_ONESHOT;
6092 }
6093 if (make_active) {
6094 knote_activate(kq, kn, FILTER_ACTIVE);
6095 }
6096 kqunlock(kq);
6097 }
6098 }
6099
6100 /*
6101 * Force a lazy allocation of the waitqset link
6102 * of the kq_wqs associated with the kn
6103 * if it wasn't already allocated.
6104 *
6105 * This allows knote_link_waitq to never block
6106 * if reserved_link is not NULL.
6107 */
6108 void
6109 knote_link_waitqset_lazy_alloc(struct knote *kn)
6110 {
6111 struct kqueue *kq = knote_get_kq(kn);
6112 waitq_set_lazy_init_link(&kq->kq_wqs);
6113 }
6114
6115 /*
6116 * Check if a lazy allocation for the waitqset link
6117 * of the kq_wqs is needed.
6118 */
6119 boolean_t
6120 knote_link_waitqset_should_lazy_alloc(struct knote *kn)
6121 {
6122 struct kqueue *kq = knote_get_kq(kn);
6123 return waitq_set_should_lazy_init_link(&kq->kq_wqs);
6124 }
6125
6126 /*
6127 * For a given knote, link a provided wait queue directly with the kqueue.
6128 * Wakeups will happen via recursive wait queue support. But nothing will move
6129 * the knote to the active list at wakeup (nothing calls knote()). Instead,
6130 * we permanently enqueue them here.
6131 *
6132 * kqueue and knote references are held by caller.
6133 * waitq locked by caller.
6134 *
6135 * caller provides the wait queue link structure and ensures that the kq->kq_wqs
6136 * is linked by previously calling knote_link_waitqset_lazy_alloc.
6137 */
6138 int
6139 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
6140 {
6141 struct kqueue *kq = knote_get_kq(kn);
6142 kern_return_t kr;
6143
6144 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
6145 if (kr == KERN_SUCCESS) {
6146 knote_markstayactive(kn);
6147 return 0;
6148 } else {
6149 return EINVAL;
6150 }
6151 }
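/*
 * Callers split the work into two phases so that the link itself never
 * blocks while the waitq is locked. A hedged sketch of the expected
 * sequence; the reservation and locking helpers shown are assumptions about
 * the waitq API, not taken from this file:
 *
 *	knote_link_waitqset_lazy_alloc(kn);		// may block, no locks held
 *	reserved = waitq_link_reserve(wq);		// reserve the link memory
 *	waitq_lock(wq);
 *	error = knote_link_waitq(kn, wq, &reserved);	// never blocks here
 *	waitq_unlock(wq);
 */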
6152
6153 /*
6154 * Unlink the provided wait queue from the kqueue associated with a knote.
6155 * Also remove it from the magic list of directly attached knotes.
6156 *
6157 * Note that the unlink may have already happened from the other side, so
6158 * ignore any failures to unlink and just remove it from the kqueue list.
6159 *
6160 * On success, caller is responsible for the link structure
6161 */
6162 int
6163 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
6164 {
6165 struct kqueue *kq = knote_get_kq(kn);
6166 kern_return_t kr;
6167
6168 kr = waitq_unlink(wq, &kq->kq_wqs);
6169 knote_clearstayactive(kn);
6170 return (kr != KERN_SUCCESS) ? EINVAL : 0;
6171 }
6172
6173 /*
6174 * remove all knotes referencing a specified fd
6175 *
6176 * Entered with the proc_fd lock already held.
6177 * It returns the same way, but may drop it temporarily.
6178 */
6179 void
6180 knote_fdclose(struct proc *p, int fd)
6181 {
6182 struct klist *list;
6183 struct knote *kn;
6184 KNOTE_LOCK_CTX(knlc);
6185
6186 restart:
6187 list = &p->p_fd->fd_knlist[fd];
6188 SLIST_FOREACH(kn, list, kn_link) {
6189 struct kqueue *kq = knote_get_kq(kn);
6190
6191 kqlock(kq);
6192
6193 if (kq->kq_p != p) {
6194 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
6195 __func__, kq->kq_p, p);
6196 }
6197
6198 /*
6199 * If the knote supports EV_VANISHED delivery,
6200 * transition it to vanished mode (or skip over
6201 * it if already vanished).
6202 */
6203 if (kn->kn_status & KN_VANISHED) {
6204 kqunlock(kq);
6205 continue;
6206 }
6207
6208 proc_fdunlock(p);
6209 if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
6210 /* the knote was dropped by someone, nothing to do */
6211 } else if (kn->kn_status & KN_REQVANISH) {
6212 kn->kn_status |= KN_VANISHED;
6213
6214 kqunlock(kq);
6215 knote_fops(kn)->f_detach(kn);
6216 if (kn->kn_is_fd) {
6217 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6218 }
6219 kn->kn_filtid = EVFILTID_DETACHED;
6220 kqlock(kq);
6221
6222 knote_activate(kq, kn, FILTER_ACTIVE);
6223 knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
6224 } else {
6225 knote_drop(kq, kn, &knlc);
6226 }
6227
6228 proc_fdlock(p);
6229 goto restart;
6230 }
6231 }
6232
6233 /*
6234 * knote_fdfind - lookup a knote in the fd table for process
6235 *
6236 * If the filter is file-based, lookup based on fd index.
6237 * Otherwise use a hash based on the ident.
6238 *
6239 * Matching is based on kq, filter, and ident. Optionally,
6240 * it may also be based on the udata field in the kevent -
6241 * allowing multiple event registrations for the file object
6242 * per kqueue.
6243 *
6244 * fd_knhashlock or fdlock held on entry (and exit)
6245 */
6246 static struct knote *
6247 knote_fdfind(struct kqueue *kq,
6248 const struct kevent_internal_s *kev,
6249 bool is_fd,
6250 struct proc *p)
6251 {
6252 struct filedesc *fdp = p->p_fd;
6253 struct klist *list = NULL;
6254 struct knote *kn = NULL;
6255
6256 /*
6257 * determine where to look for the knote
6258 */
6259 if (is_fd) {
6260 /* fd-based knotes are linked off the fd table */
6261 if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
6262 list = &fdp->fd_knlist[kev->kei_ident];
6263 }
6264 } else if (fdp->fd_knhashmask != 0) {
6265 /* hash non-fd knotes here too */
6266 list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
6267 }
6268
6269 /*
6270 * scan the selected list looking for a match
6271 */
6272 if (list != NULL) {
6273 SLIST_FOREACH(kn, list, kn_link) {
6274 if (kq == knote_get_kq(kn) &&
6275 kev->kei_ident == kn->kn_id &&
6276 kev->kei_filter == kn->kn_filter) {
6277 if (kev->kei_flags & EV_UDATA_SPECIFIC) {
6278 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
6279 kev->kei_udata == kn->kn_udata) {
6280 break; /* matching udata-specific knote */
6281 }
6282 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
6283 break; /* matching non-udata-specific knote */
6284 }
6285 }
6286 }
6287 }
6288 return kn;
6289 }
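/*
 * From userspace, this udata-specific matching is what lets two otherwise
 * identical registrations coexist on one kqueue. Minimal sketch (the fd and
 * cookie values are hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent kev[2];
 *	EV_SET(&kev[0], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x1);
 *	EV_SET(&kev[1], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x2);
 *	kevent(kq, kev, 2, NULL, 0, NULL);
 *
 *	// deleting must pass the matching udata or the lookup won't find it
 *	EV_SET(&kev[0], fd, EVFILT_READ, EV_DELETE | EV_UDATA_SPECIFIC,
 *	    0, 0, (void *)0x1);
 *	kevent(kq, kev, 1, NULL, 0, NULL);
 */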
6290
6291 /*
6292 * kq_add_knote - Add knote to the fd table for process
6293 * while checking for duplicates.
6294 *
6295 * All file-based filters associate a list of knotes by file
6296 * descriptor index. All other filters hash the knote by ident.
6297 *
6298 * May have to grow the table of knote lists to cover the
6299 * file descriptor index presented.
6300 *
6301 * fd_knhashlock and fdlock unheld on entry (and exit).
6302 *
6303 * Takes a rwlock boost if inserting the knote is successful.
6304 */
6305 static int
6306 kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
6307 struct proc *p)
6308 {
6309 struct filedesc *fdp = p->p_fd;
6310 struct klist *list = NULL;
6311 int ret = 0;
6312 bool is_fd = kn->kn_is_fd;
6313 uint64_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE);
6314
6315 if (is_fd) {
6316 proc_fdlock(p);
6317 } else {
6318 knhash_lock(fdp);
6319 }
6320
6321 if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
6322 /* found an existing knote: we can't add this one */
6323 ret = ERESTART;
6324 goto out_locked;
6325 }
6326
6327 /* knote was not found: add it now */
6328 if (!is_fd) {
6329 if (fdp->fd_knhashmask == 0) {
6330 u_long size = 0;
6331
6332 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
6333 if (list == NULL) {
6334 ret = ENOMEM;
6335 goto out_locked;
6336 }
6337
6338 fdp->fd_knhash = list;
6339 fdp->fd_knhashmask = size;
6340 }
6341
6342 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6343 SLIST_INSERT_HEAD(list, kn, kn_link);
6344 ret = 0;
6345 goto out_locked;
6346 } else {
6347 /* knote is fd based */
6348
6349 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
6350 u_int size = 0;
6351
6352 /* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
6353 if (kn->kn_id >= (uint64_t) nofile
6354 || kn->kn_id >= (uint64_t)maxfilesperproc) {
6355 ret = EINVAL;
6356 goto out_locked;
6357 }
6358 /* have to grow the fd_knlist */
6359 size = fdp->fd_knlistsize;
6360 while (size <= kn->kn_id) {
6361 size += KQEXTENT;
6362 }
6363
6364 if (size >= (UINT_MAX / sizeof(struct klist *))) {
6365 ret = EINVAL;
6366 goto out_locked;
6367 }
6368
6369 MALLOC(list, struct klist *,
6370 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
6371 if (list == NULL) {
6372 ret = ENOMEM;
6373 goto out_locked;
6374 }
6375
6376 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
6377 fdp->fd_knlistsize * sizeof(struct klist *));
6378 bzero((caddr_t)list +
6379 fdp->fd_knlistsize * sizeof(struct klist *),
6380 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
6381 FREE(fdp->fd_knlist, M_KQUEUE);
6382 fdp->fd_knlist = list;
6383 fdp->fd_knlistsize = size;
6384 }
6385
6386 list = &fdp->fd_knlist[kn->kn_id];
6387 SLIST_INSERT_HEAD(list, kn, kn_link);
6388 ret = 0;
6389 goto out_locked;
6390 }
6391
6392 out_locked:
6393 if (ret == 0) {
6394 kqlock(kq);
6395 assert((kn->kn_status & KN_LOCKED) == 0);
6396 (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
6397 kqueue_retain(kq); /* retain a kq ref */
6398 }
6399 if (is_fd) {
6400 proc_fdunlock(p);
6401 } else {
6402 knhash_unlock(fdp);
6403 }
6404
6405 return ret;
6406 }
6407
6408 /*
6409 * kq_remove_knote - remove a knote from the fd table for process
6410 *
6411 * If the filter is file-based, remove based on fd index.
6412 * Otherwise remove from the hash based on the ident.
6413 *
6414 * fd_knhashlock and fdlock unheld on entry (and exit).
6415 */
6416 static void
6417 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
6418 struct knote_lock_ctx *knlc)
6419 {
6420 struct filedesc *fdp = p->p_fd;
6421 struct klist *list = NULL;
6422 uint16_t kq_state;
6423 bool is_fd = kn->kn_is_fd;
6424
6425 if (is_fd) {
6426 proc_fdlock(p);
6427 } else {
6428 knhash_lock(fdp);
6429 }
6430
6431 if (is_fd) {
6432 assert((u_int)fdp->fd_knlistsize > kn->kn_id);
6433 list = &fdp->fd_knlist[kn->kn_id];
6434 } else {
6435 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
6436 }
6437 SLIST_REMOVE(list, kn, knote, kn_link);
6438
6439 kqlock(kq);
6440 kq_state = kq->kq_state;
6441 if (knlc) {
6442 knote_unlock_cancel(kq, kn, knlc);
6443 } else {
6444 kqunlock(kq);
6445 }
6446 if (is_fd) {
6447 proc_fdunlock(p);
6448 } else {
6449 knhash_unlock(fdp);
6450 }
6451
6452 if (kq_state & KQ_DYNAMIC) {
6453 kqworkloop_release((struct kqworkloop *)kq);
6454 }
6455 }
6456
6457 /*
6458 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
6459 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
6460 *
6461 * fd_knhashlock or fdlock unheld on entry (and exit)
6462 */
6463
6464 static struct knote *
6465 kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
6466 bool is_fd, struct proc *p)
6467 {
6468 struct filedesc *fdp = p->p_fd;
6469 struct knote *kn;
6470
6471 if (is_fd) {
6472 proc_fdlock(p);
6473 } else {
6474 knhash_lock(fdp);
6475 }
6476
6477 /*
6478 * Temporary horrible hack:
6479 * this cast is gross and will go away in a future change.
6480 * It is OK to do because we don't look at xflags/s_fflags,
6481 * and when we cast down the kev this way,
6482 * the truncated filter field still works.
6483 */
6484 kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);
6485
6486 if (kn) {
6487 kqlock(kq);
6488 assert(knote_get_kq(kn) == kq);
6489 }
6490
6491 if (is_fd) {
6492 proc_fdunlock(p);
6493 } else {
6494 knhash_unlock(fdp);
6495 }
6496
6497 return kn;
6498 }
6499
6500 __attribute__((noinline))
6501 static void
6502 kqfile_wakeup(struct kqfile *kqf, __unused kq_index_t qos)
6503 {
6504 /* flag wakeups during processing */
6505 if (kqf->kqf_state & KQ_PROCESSING) {
6506 kqf->kqf_state |= KQ_WAKEUP;
6507 }
6508
6509 /* wakeup a thread waiting on this queue */
6510 if (kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) {
6511 kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
6512 waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, KQ_EVENT,
6513 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
6514 }
6515
6516 /* wakeup other kqueues/select sets we're inside */
6517 KNOTE(&kqf->kqf_sel.si_note, 0);
6518 }
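/*
 * The KNOTE() on kqf_sel.si_note above is what allows a kqueue file
 * descriptor to be monitored by select()/poll() or nested inside another
 * kqueue. Minimal userspace sketch (parent_kq and inner_kq are hypothetical):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, inner_kq, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(parent_kq, &kev, 1, NULL, 0, NULL);
 *	// parent_kq now reports inner_kq readable when it has pending events
 */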
6519
6520 static struct kqtailq *
6521 knote_get_tailq(kqueue_t kqu, struct knote *kn)
6522 {
6523 kq_index_t qos_index = kn->kn_qos_index;
6524
6525 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6526 assert(qos_index < KQWL_NBUCKETS);
6527 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6528 assert(qos_index < KQWQ_NBUCKETS);
6529 } else {
6530 assert(qos_index == QOS_INDEX_KQFILE);
6531 }
6532 static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
6533 "struct kqueue::kq_queue must be exactly at the end");
6534 return &kqu.kq->kq_queue[qos_index];
6535 }
6536
6537 static void
6538 knote_enqueue(kqueue_t kqu, struct knote *kn, kn_status_t wakeup_mask)
6539 {
6540 kqlock_held(kqu);
6541
6542 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
6543 return;
6544 }
6545
6546 if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)) {
6547 return;
6548 }
6549
6550 if ((kn->kn_status & KN_QUEUED) == 0) {
6551 struct kqtailq *queue = knote_get_tailq(kqu, kn);
6552
6553 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
6554 kn->kn_status |= KN_QUEUED;
6555 kqu.kq->kq_count++;
6556 } else if ((kn->kn_status & KN_STAYACTIVE) == 0) {
6557 return;
6558 }
6559
6560 if (kn->kn_status & wakeup_mask) {
6561 if (kqu.kq->kq_state & KQ_WORKLOOP) {
6562 kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
6563 } else if (kqu.kq->kq_state & KQ_WORKQ) {
6564 kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
6565 } else {
6566 kqfile_wakeup(kqu.kqf, kn->kn_qos_index);
6567 }
6568 }
6569 }
6570
6571 __attribute__((always_inline))
6572 static inline void
6573 knote_dequeue(kqueue_t kqu, struct knote *kn)
6574 {
6575 if (kn->kn_status & KN_QUEUED) {
6576 struct kqtailq *queue = knote_get_tailq(kqu, kn);
6577
6578 // attaching the knote calls knote_reset_priority() without the
6579 // kqlock held, which is fine, so we can only assert kqlock_held()
6580 // when the knote is queued.
6581 kqlock_held(kqu);
6582
6583 TAILQ_REMOVE(queue, kn, kn_tqe);
6584 kn->kn_status &= ~KN_QUEUED;
6585 kqu.kq->kq_count--;
6586 }
6587 }
6588
6589 /* called with kqueue lock held */
6590 static void
6591 knote_suppress(kqueue_t kqu, struct knote *kn)
6592 {
6593 struct kqtailq *suppressq;
6594
6595 kqlock_held(kqu);
6596
6597 assert((kn->kn_status & KN_SUPPRESSED) == 0);
6598 assert(kn->kn_status & KN_QUEUED);
6599
6600 knote_dequeue(kqu, kn);
6601 /* deactivate - so new activations indicate a wakeup */
6602 kn->kn_status &= ~KN_ACTIVE;
6603 kn->kn_status |= KN_SUPPRESSED;
6604 suppressq = kqueue_get_suppressed_queue(kqu, kn);
6605 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
6606 }
6607
6608 __attribute__((always_inline))
6609 static inline void
6610 knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
6611 {
6612 struct kqtailq *suppressq;
6613
6614 kqlock_held(kqu);
6615
6616 assert(kn->kn_status & KN_SUPPRESSED);
6617
6618 kn->kn_status &= ~KN_SUPPRESSED;
6619 suppressq = kqueue_get_suppressed_queue(kqu, kn);
6620 TAILQ_REMOVE(suppressq, kn, kn_tqe);
6621
6622 /*
6623 * If the knote is no longer active, reset its push,
6624 * and resynchronize kn_qos_index with kn_qos_override
6625 * for knotes with a real qos.
6626 */
6627 if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
6628 kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
6629 }
6630 kn->kn_qos_index = kn->kn_qos_override;
6631 }
6632
6633 /* called with kqueue lock held */
6634 static void
6635 knote_unsuppress(kqueue_t kqu, struct knote *kn)
6636 {
6637 if (kn->kn_status & KN_SUPPRESSED) {
6638 knote_unsuppress_noqueue(kqu, kn);
6639
6640 /* don't wakeup if unsuppressing just a stay-active knote */
6641 knote_enqueue(kqu, kn, KN_ACTIVE);
6642 }
6643 }
6644
6645 __attribute__((always_inline))
6646 static inline void
6647 knote_mark_active(struct knote *kn)
6648 {
6649 if ((kn->kn_status & KN_ACTIVE) == 0) {
6650 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
6651 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
6652 kn->kn_filtid);
6653 }
6654
6655 kn->kn_status |= KN_ACTIVE;
6656 }
6657
6658 /* called with kqueue lock held */
6659 static void
6660 knote_activate(kqueue_t kqu, struct knote *kn, int result)
6661 {
6662 assert(result & FILTER_ACTIVE);
6663 if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
6664 // may dequeue the knote
6665 knote_adjust_qos(kqu.kq, kn, result);
6666 }
6667 knote_mark_active(kn);
6668 knote_enqueue(kqu, kn, KN_ACTIVE | KN_STAYACTIVE);
6669 }
6670
6671 /*
6672 * This function applies changes requested by f_attach or f_touch for
6673 * a given filter. It proceeds in a carefully chosen order to help
6674 * every single transition do the minimal amount of work possible.
6675 */
6676 static void
6677 knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
6678 int result)
6679 {
6680 kn_status_t wakeup_mask = KN_ACTIVE;
6681
6682 if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
6683 /*
6684 * When a stayactive knote is reenabled, we may have missed wakeups
6685 * while it was disabled, so we need to poll it. To do so, ask
6686 * knote_enqueue() below to reenqueue it.
6687 */
6688 wakeup_mask |= KN_STAYACTIVE;
6689 kn->kn_status &= ~KN_DISABLED;
6690
6691 /*
6692 * it is possible for userland to have knotes registered for a given
6693 * workloop `wl_orig` but really handled on another workloop `wl_new`.
6694 *
6695 * In that case, rearming will happen from the servicer thread of
6696 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
6697 * this knote to stay suppressed forever if we only relied on
6698 * kqworkloop_acknowledge_events to be called by `wl_orig`.
6699 *
6700 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
6701 * unsuppress because that would mess with the processing phase of
6702 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
6703 * will be called.
6704 */
6705 if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
6706 if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
6707 knote_unsuppress_noqueue(kqu, kn);
6708 }
6709 }
6710 }
6711
6712 if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
6713 // may dequeue the knote
6714 knote_reset_priority(kqu, kn, kev->qos);
6715 }
6716
6717 /*
6718 * When we unsuppress above, or because of knote_reset_priority(),
6719 * the knote may have been dequeued; we need to restore the invariant
6720 * that an active knote is queued, now that we're done applying
6721 * changes.
6722 */
6723 if (result & FILTER_ACTIVE) {
6724 knote_activate(kqu, kn, result);
6725 } else {
6726 knote_enqueue(kqu, kn, wakeup_mask);
6727 }
6728
6729 if ((result & FILTER_THREADREQ_NODEFEER) &&
6730 act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
6731 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
6732 }
6733 }
6734
6735 /*
6736 * knote_drop - disconnect and drop the knote
6737 *
6738 * Called with the kqueue locked, returns with the kqueue unlocked.
6739 *
6740 * If a knote locking context is passed, it is canceled.
6741 *
6742 * The knote may have already been detached from
6743 * (or not yet attached to) its source object.
6744 */
6745 static void
6746 knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
6747 {
6748 struct proc *p = kq->kq_p;
6749
6750 kqlock_held(kq);
6751
6752 assert((kn->kn_status & KN_DROPPING) == 0);
6753 if (knlc == NULL) {
6754 assert((kn->kn_status & KN_LOCKED) == 0);
6755 }
6756 kn->kn_status |= KN_DROPPING;
6757
6758 if (kn->kn_status & KN_SUPPRESSED) {
6759 knote_unsuppress_noqueue(kq, kn);
6760 } else {
6761 knote_dequeue(kq, kn);
6762 }
6763 knote_wait_for_post(kq, kn);
6764
6765 knote_fops(kn)->f_detach(kn);
6766
6767 /* kq may be freed when kq_remove_knote() returns */
6768 kq_remove_knote(kq, kn, p, knlc);
6769 if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
6770 fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
6771 }
6772
6773 knote_free(kn);
6774 }
6775
6776 void
6777 knote_init(void)
6778 {
6779 #if CONFIG_MEMORYSTATUS
6780 /* Initialize the memorystatus list lock */
6781 memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
6782 #endif
6783 }
6784 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
6785
6786 const struct filterops *
6787 knote_fops(struct knote *kn)
6788 {
6789 return sysfilt_ops[kn->kn_filtid];
6790 }
6791
6792 static struct knote *
6793 knote_alloc(void)
6794 {
6795 return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO);
6796 }
6797
6798 static void
6799 knote_free(struct knote *kn)
6800 {
6801 assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
6802 zfree(knote_zone, kn);
6803 }
6804
6805 #pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id
6806
6807 kevent_ctx_t
6808 kevent_get_context(thread_t thread)
6809 {
6810 uthread_t ut = get_bsdthread_info(thread);
6811 return &ut->uu_save.uus_kevent;
6812 }
6813
6814 static inline bool
6815 kevent_args_requesting_events(unsigned int flags, int nevents)
6816 {
6817 return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
6818 }
6819
6820 static inline int
6821 kevent_adjust_flags_for_proc(proc_t p, int flags)
6822 {
6823 __builtin_assume(p);
6824 return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
6825 }
6826
6827 /*!
6828 * @function kevent_get_kqfile
6829 *
6830 * @brief
6831 * Lookup a kqfile by fd.
6832 *
6833 * @discussion
6834 * Callers: kevent, kevent64, kevent_qos
6835 *
6836 * This is not assumed to be a fastpath (kqfile interfaces are legacy)
6837 */
6838 OS_NOINLINE
6839 static int
6840 kevent_get_kqfile(struct proc *p, int fd, int flags,
6841 struct fileproc **fpp, struct kqueue **kqp)
6842 {
6843 int error = 0;
6844 struct kqueue *kq;
6845
6846 error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
6847 if (__improbable(error)) {
6848 return error;
6849 }
6850 kq = (struct kqueue *)(*fpp)->f_data;
6851
6852 uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
6853 if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
6854 kqlock(kq);
6855 kq_state = kq->kq_state;
6856 if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
6857 if (flags & KEVENT_FLAG_LEGACY32) {
6858 kq_state |= KQ_KEV32;
6859 } else if (flags & KEVENT_FLAG_LEGACY64) {
6860 kq_state |= KQ_KEV64;
6861 } else {
6862 kq_state |= KQ_KEV_QOS;
6863 }
6864 kq->kq_state = kq_state;
6865 }
6866 kqunlock(kq);
6867 }
6868
6869 /*
6870 * kqfiles can't be used through the legacy kevent()
6871 * and other interfaces at the same time.
6872 */
6873 if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
6874 (bool)(kq_state & KQ_KEV32))) {
6875 fp_drop(p, fd, *fpp, 0);
6876 return EINVAL;
6877 }
6878
6879 *kqp = kq;
6880 return 0;
6881 }
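/*
 * The legacy flow served by this lookup is the plain BSD one; minimal
 * userspace sketch (fd is hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &kev, 1, &out, 1, &ts);	// register, then wait up to 1s
 */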
6882
6883 /*!
6884 * @function kevent_get_kqwq
6885 *
6886 * @brief
6887 * Lookup or create the process kqwq (fastpath).
6888 *
6889 * @discussion
6890 * Callers: kevent64, kevent_qos
6891 */
6892 OS_ALWAYS_INLINE
6893 static int
6894 kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
6895 {
6896 struct kqworkq *kqwq = p->p_fd->fd_wqkqueue;
6897
6898 if (__improbable(kevent_args_requesting_events(flags, nevents))) {
6899 return EINVAL;
6900 }
6901 if (__improbable(kqwq == NULL)) {
6902 kqwq = kqworkq_alloc(p, flags);
6903 if (__improbable(kqwq == NULL)) {
6904 return ENOMEM;
6905 }
6906 }
6907
6908 *kqp = &kqwq->kqwq_kqueue;
6909 return 0;
6910 }
6911
6912 #pragma mark kevent copyio
6913
6914 /*!
6915 * @function kevent_get_data_size
6916 *
6917 * @brief
6918 * Copies in the extra data size from user-space.
6919 */
6920 static int
6921 kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
6922 kevent_ctx_t kectx)
6923 {
6924 if (!data_avail || !data_out) {
6925 kectx->kec_data_size = 0;
6926 kectx->kec_data_resid = 0;
6927 } else if (flags & KEVENT_FLAG_PROC64) {
6928 user64_size_t usize = 0;
6929 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6930 if (__improbable(error)) {
6931 return error;
6932 }
6933 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6934 } else {
6935 user32_size_t usize = 0;
6936 int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
6937 if (__improbable(error)) {
6938 return error;
6939 }
6940 kectx->kec_data_avail = data_avail;
6941 kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
6942 }
6943 kectx->kec_data_out = data_out;
6944 kectx->kec_data_avail = data_avail;
6945 return 0;
6946 }
6947
6948 /*!
6949 * @function kevent_put_data_size
6950 *
6951 * @brief
6952 * Copies out the residual data size to user-space if any has been used.
6953 */
6954 static int
6955 kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
6956 {
6957 if (kectx->kec_data_resid == kectx->kec_data_size) {
6958 return 0;
6959 }
6960 if (flags & KEVENT_FLAG_KERNEL) {
6961 *(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
6962 return 0;
6963 }
6964 if (flags & KEVENT_FLAG_PROC64) {
6965 user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
6966 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6967 } else {
6968 user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
6969 return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
6970 }
6971 }
6972
6973 /*!
6974 * @function kevent_legacy_copyin
6975 *
6976 * @brief
6977 * Handles the copyin of a kevent/kevent64 event.
6978 */
6979 static int
6980 kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
6981 {
6982 int error;
6983
6984 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
6985
6986 if (flags & KEVENT_FLAG_LEGACY64) {
6987 struct kevent64_s kev64;
6988
6989 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
6990 if (__improbable(error)) {
6991 return error;
6992 }
6993 *addrp += sizeof(kev64);
6994 *kevp = (struct kevent_qos_s){
6995 .ident = kev64.ident,
6996 .filter = kev64.filter,
6997 /* Make sure user doesn't pass in any system flags */
6998 .flags = kev64.flags & ~EV_SYSFLAGS,
6999 .udata = kev64.udata,
7000 .fflags = kev64.fflags,
7001 .data = kev64.data,
7002 .ext[0] = kev64.ext[0],
7003 .ext[1] = kev64.ext[1],
7004 };
7005 } else if (flags & KEVENT_FLAG_PROC64) {
7006 struct user64_kevent kev64;
7007
7008 error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
7009 if (__improbable(error)) {
7010 return error;
7011 }
7012 *addrp += sizeof(kev64);
7013 *kevp = (struct kevent_qos_s){
7014 .ident = kev64.ident,
7015 .filter = kev64.filter,
7016 /* Make sure user doesn't pass in any system flags */
7017 .flags = kev64.flags & ~EV_SYSFLAGS,
7018 .udata = kev64.udata,
7019 .fflags = kev64.fflags,
7020 .data = kev64.data,
7021 };
7022 } else {
7023 struct user32_kevent kev32;
7024
7025 error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
7026 if (__improbable(error)) {
7027 return error;
7028 }
7029 *addrp += sizeof(kev32);
7030 *kevp = (struct kevent_qos_s){
7031 .ident = (uintptr_t)kev32.ident,
7032 .filter = kev32.filter,
7033 /* Make sure user doesn't pass in any system flags */
7034 .flags = kev32.flags & ~EV_SYSFLAGS,
7035 .udata = CAST_USER_ADDR_T(kev32.udata),
7036 .fflags = kev32.fflags,
7037 .data = (intptr_t)kev32.data,
7038 };
7039 }
7040
7041 return 0;
7042 }
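/*
 * The KEVENT_FLAG_LEGACY64 branch above corresponds to the kevent64() entry
 * point, whose events carry the two ext[] words. Minimal userspace sketch
 * (fd is hypothetical):
 *
 *	int kq = kqueue();
 *	struct kevent64_s kev, out;
 *	EV_SET64(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
 *	int n = kevent64(kq, &kev, 1, &out, 1, 0, NULL);
 */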
7043
7044 /*!
7045 * @function kevent_modern_copyin
7046 *
7047 * @brief
7048 * Handles the copyin of a kevent_qos/kevent_id event.
7049 */
7050 static int
7051 kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
7052 {
7053 int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
7054 if (__probable(!error)) {
7055 /* Make sure user doesn't pass in any system flags */
7056 *addrp += sizeof(struct kevent_qos_s);
7057 kevp->flags &= ~EV_SYSFLAGS;
7058 }
7059 return error;
7060 }
7061
7062 /*!
7063 * @function kevent_legacy_copyout
7064 *
7065 * @brief
7066 * Handles the copyout of a kevent/kevent64 event.
7067 */
7068 static int
7069 kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
7070 {
7071 int advance;
7072 int error;
7073
7074 assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);
7075
7076 /*
7077 * fully initialize the different output event structure
7078 * types from the internal kevent (and some universal
7079 * defaults for fields not represented in the internal
7080 * form).
7081 *
7082 * Note: these structures have no padding hence the C99
7083 * initializers below do not leak kernel info.
7084 */
7085 if (flags & KEVENT_FLAG_LEGACY64) {
7086 struct kevent64_s kev64 = {
7087 .ident = kevp->ident,
7088 .filter = kevp->filter,
7089 .flags = kevp->flags,
7090 .fflags = kevp->fflags,
7091 .data = (int64_t)kevp->data,
7092 .udata = kevp->udata,
7093 .ext[0] = kevp->ext[0],
7094 .ext[1] = kevp->ext[1],
7095 };
7096 advance = sizeof(struct kevent64_s);
7097 error = copyout((caddr_t)&kev64, *addrp, advance);
7098 } else if (flags & KEVENT_FLAG_PROC64) {
7099 /*
7100 * deal with the special case of a user-supplied
7101 * value of (uintptr_t)-1.
7102 */
7103 uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
7104 (uint64_t)-1LL : (uint64_t)kevp->ident;
7105 struct user64_kevent kev64 = {
7106 .ident = ident,
7107 .filter = kevp->filter,
7108 .flags = kevp->flags,
7109 .fflags = kevp->fflags,
7110 .data = (int64_t) kevp->data,
7111 .udata = (user_addr_t) kevp->udata,
7112 };
7113 advance = sizeof(kev64);
7114 error = copyout((caddr_t)&kev64, *addrp, advance);
7115 } else {
7116 struct user32_kevent kev32 = {
7117 .ident = (uint32_t)kevp->ident,
7118 .filter = kevp->filter,
7119 .flags = kevp->flags,
7120 .fflags = kevp->fflags,
7121 .data = (int32_t)kevp->data,
7122 .udata = (uint32_t)kevp->udata,
7123 };
7124 advance = sizeof(kev32);
7125 error = copyout((caddr_t)&kev32, *addrp, advance);
7126 }
7127 if (__probable(!error)) {
7128 *addrp += advance;
7129 }
7130 return error;
7131 }
7132
7133 /*!
7134 * @function kevent_modern_copyout
7135 *
7136 * @brief
7137 * Handles the copyout of a kevent_qos/kevent_id event.
7138 */
7139 OS_ALWAYS_INLINE
7140 static inline int
7141 kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
7142 {
7143 int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
7144 if (__probable(!error)) {
7145 *addrp += sizeof(struct kevent_qos_s);
7146 }
7147 return error;
7148 }
7149
7150 #pragma mark kevent core implementation
7151
7152 /*!
7153 * @function kevent_callback_inline
7154 *
7155 * @brief
7156 * Callback for each individual event
7157 *
7158 * @discussion
7159 * This is meant to be inlined in kevent_modern_callback and
7160 * kevent_legacy_callback.
7161 */
7162 OS_ALWAYS_INLINE
7163 static inline int
7164 kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
7165 {
7166 int error;
7167
7168 assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);
7169
7170 /*
7171 * Copy out the appropriate amount of event data for this user.
7172 */
7173 if (legacy) {
7174 error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
7175 kectx->kec_process_flags);
7176 } else {
7177 error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
7178 }
7179
7180 /*
7181 * If there isn't space for additional events, return
7182 * a harmless error to stop the processing here
7183 */
7184 if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
7185 error = EWOULDBLOCK;
7186 }
7187 return error;
7188 }
7189
7190 /*!
7191 * @function kevent_modern_callback
7192 *
7193 * @brief
7194 * Callback for each individual modern event.
7195 *
7196 * @discussion
7197 * This callback handles kevent_qos/kevent_id events.
7198 */
7199 static int
7200 kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7201 {
7202 return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
7203 }
7204
7205 /*!
7206 * @function kevent_legacy_callback
7207 *
7208 * @brief
7209 * Callback for each individual legacy event.
7210 *
7211 * @discussion
7212 * This callback handles kevent/kevent64 events.
7213 */
7214 static int
7215 kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
7216 {
7217 return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
7218 }
7219
7220 /*!
7221 * @function kevent_cleanup
7222 *
7223 * @brief
7224 * Handles the cleanup returning from a kevent call.
7225 *
7226 * @discussion
7227 * kevent entry points will take a reference on workloops,
7228 * and a usecount on the fileglob of kqfiles.
7229 *
7230 * This function undoes this on the exit paths of kevents.
7231 *
7232 * @returns
7233 * The error to return to userspace.
7234 */
7235 static int
7236 kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
7237 {
7238 // poll should not call any codepath leading to this
7239 assert((flags & KEVENT_FLAG_POLL) == 0);
7240
7241 if (flags & KEVENT_FLAG_WORKLOOP) {
7242 kqworkloop_release(kqu.kqwl);
7243 } else if (flags & KEVENT_FLAG_WORKQ) {
7244 /* nothing held */
7245 } else {
7246 fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
7247 }
7248
7249 /* don't restart after signals... */
7250 if (error == ERESTART) {
7251 error = EINTR;
7252 } else if (error == 0) {
7253 /* don't abandon other output just because of residual copyout failures */
7254 (void)kevent_put_data_size(flags, kectx);
7255 }
7256
7257 if (flags & KEVENT_FLAG_PARKING) {
7258 thread_t th = current_thread();
7259 struct uthread *uth = get_bsdthread_info(th);
7260 if (uth->uu_kqr_bound) {
7261 thread_unfreeze_base_pri(th);
7262 }
7263 }
7264 return error;
7265 }
7266
7267 /*!
7268 * @function kqueue_process
7269 *
7270 * @brief
7271 * Process the triggered events in a kqueue.
7272 *
7273 * @discussion
7274 * Walk the queued knotes and validate that they are really still triggered
7275 * events by calling the filter routines (if necessary).
7276 *
7277 * For each event that is still considered triggered, invoke the callback
7278 * routine provided.
7279 *
7280 * caller holds a reference on the kqueue.
7281 * kqueue locked on entry and exit - but may be dropped
7282 * kqueue list locked (held for duration of call)
7283 *
7284 * This is only called by kqueue_scan() so that the compiler can inline it.
7285 *
7286 * @returns
7287 * - 0: no event was returned, no other error occurred
7288 * - EBADF: the kqueue is being destroyed (KQ_DRAIN is set)
7289 * - EWOULDBLOCK: (not an error) events have been found and we should return
7290 * - EFAULT: copyout failed
7291 * - filter specific errors
7292 */
7293 static int
7294 kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
7295 kevent_callback_t callback)
7296 {
7297 workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
7298 struct knote *kn;
7299 int error = 0, rc = 0;
7300 struct kqtailq *base_queue, *queue;
7301 #if DEBUG || DEVELOPMENT
7302 int retries = 64;
7303 #endif
7304 uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
7305
7306 if (kq_type & KQ_WORKQ) {
7307 rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
7308 } else if (kq_type & KQ_WORKLOOP) {
7309 rc = kqworkloop_begin_processing(kqu.kqwl, flags);
7310 } else {
7311 kqfile_retry:
7312 rc = kqfile_begin_processing(kqu.kqf);
7313 if (rc == EBADF) {
7314 return EBADF;
7315 }
7316 }
7317
7318 if (rc == -1) {
7319 /* Nothing to process */
7320 return 0;
7321 }
7322
7323 /*
7324 * loop through the enqueued knotes associated with this request,
7325 * processing each one. Each request may have several queues
7326 * of knotes to process (depending on the type of kqueue) so we
7327 * have to loop through all the queues as long as we have additional
7328 * space.
7329 */
7330
7331 process_again:
7332 if (kq_type & KQ_WORKQ) {
7333 base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index];
7334 } else if (kq_type & KQ_WORKLOOP) {
7335 base_queue = &kqu.kqwl->kqwl_queue[0];
7336 queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
7337 } else {
7338 base_queue = queue = &kqu.kqf->kqf_queue;
7339 }
7340
7341 do {
7342 while ((kn = TAILQ_FIRST(queue)) != NULL) {
7343 error = knote_process(kn, kectx, callback);
7344 if (error == EJUSTRETURN) {
7345 error = 0;
7346 } else if (__improbable(error)) {
7347 /* error is EWOULDBLOCK when the out event array is full */
7348 goto stop_processing;
7349 }
7350 }
7351 } while (queue-- > base_queue);
7352
7353 if (kectx->kec_process_noutputs) {
7354 /* callers will transform this into no error */
7355 error = EWOULDBLOCK;
7356 }
7357
7358 stop_processing:
7359 /*
7360 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
7361 * we want to unbind the kqrequest from the thread.
7362 *
7363 * However, because the kq locks are dropped several times during processing,
7364 * new knotes may have fired again, in which case we want to fail the end
7365 * processing and process again, until it converges.
7366 *
7367 * If we have an error or returned events, end processing never fails.
7368 */
7369 if (error) {
7370 flags &= ~KEVENT_FLAG_PARKING;
7371 }
7372 if (kq_type & KQ_WORKQ) {
7373 rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
7374 } else if (kq_type & KQ_WORKLOOP) {
7375 rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
7376 } else {
7377 rc = kqfile_end_processing(kqu.kqf);
7378 }
7379
7380 if (__probable(error)) {
7381 return error;
7382 }
7383
7384 if (__probable(rc >= 0)) {
7385 assert(rc == 0 || rc == EBADF);
7386 return rc;
7387 }
7388
7389 #if DEBUG || DEVELOPMENT
7390 if (retries-- == 0) {
7391 panic("kevent: way too many knote_process retries, kq: %p (0x%04x)",
7392 kqu.kq, kqu.kq->kq_state);
7393 }
7394 #endif
7395 if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
7396 assert(flags & KEVENT_FLAG_PARKING);
7397 goto process_again;
7398 } else {
7399 goto kqfile_retry;
7400 }
7401 }
7402
7403 /*!
7404 * @function kqueue_scan_continue
7405 *
7406 * @brief
7407 * The continuation used by kqueue_scan for kevent entry points.
7408 *
7409 * @discussion
7410 * Assumes we inherit a use/ref count on the kq or its fileglob.
7411 *
7412 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
7413 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
7414 */
7415 OS_NORETURN OS_NOINLINE
7416 static void
7417 kqueue_scan_continue(void *data, wait_result_t wait_result)
7418 {
7419 uthread_t ut = current_uthread();
7420 kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
7421 int error = 0, flags = kectx->kec_process_flags;
7422 struct kqueue *kq = data;
7423
7424 /*
7425 * only kevent variants call in here, so we know the callback is
7426 * kevent_legacy_callback or kevent_modern_callback.
7427 */
7428 assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);
7429
7430 switch (wait_result) {
7431 case THREAD_AWAKENED:
7432 if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
7433 error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
7434 } else {
7435 error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
7436 }
7437 break;
7438 case THREAD_TIMED_OUT:
7439 error = 0;
7440 break;
7441 case THREAD_INTERRUPTED:
7442 error = EINTR;
7443 break;
7444 case THREAD_RESTART:
7445 error = EBADF;
7446 break;
7447 default:
7448 panic("%s: - invalid wait_result (%d)", __func__, wait_result);
7449 }
7450
7451
7452 error = kevent_cleanup(kq, flags, error, kectx);
7453 *(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
7454 unix_syscall_return(error);
7455 }
7456
7457 /*!
7458 * @function kqueue_scan
7459 *
7460 * @brief
7461 * Scan and wait for events in a kqueue (used by poll & kevent).
7462 *
7463 * @discussion
7464 * Process the triggered events in a kqueue.
7465 *
7466 * If there are no events triggered arrange to wait for them:
7467 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
7468 * - possibly until kectx->kec_deadline expires
7469 *
7470 * When it has to wait and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
7471 * is set, it will wait in the kqueue_scan_continue continuation.
7472 *
7473 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
7474 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
7475 *
7476 * @param kq
7477 * The kqueue being scanned.
7478 *
7479 * @param flags
7480 * The KEVENT_FLAG_* flags for this call.
7481 *
7482 * @param kectx
7483 * The context used for this scan.
7484 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
7485 *
7486 * @param callback
7487 * The callback to be called on events successfully processed.
7488 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
7489 */
7490 int
7491 kqueue_scan(struct kqueue *kq, int flags, kevent_ctx_t kectx,
7492 kevent_callback_t callback)
7493 {
7494 int error;
7495
7496 for (;;) {
7497 kqlock(kq);
7498 error = kqueue_process(kq, flags, kectx, callback);
7499
7500 /*
7501 * If we got an error, events returned (EWOULDBLOCK)
7502 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
7503 * just return.
7504 */
7505 if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
7506 kqunlock(kq);
7507 return error == EWOULDBLOCK ? 0 : error;
7508 }
7509
7510 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
7511 KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
7512 kectx->kec_deadline, TIMEOUT_NO_LEEWAY);
7513 kq->kq_state |= KQ_SLEEP;
7514
7515 kqunlock(kq);
7516
7517 if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
7518 thread_block_parameter(kqueue_scan_continue, kq);
7519 __builtin_unreachable();
7520 }
7521
7522 wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
7523 switch (wr) {
7524 case THREAD_AWAKENED:
7525 break;
7526 case THREAD_TIMED_OUT:
7527 return 0;
7528 case THREAD_INTERRUPTED:
7529 return EINTR;
7530 case THREAD_RESTART:
7531 return EBADF;
7532 default:
7533 panic("%s: - bad wait_result (%d)", __func__, wr);
7534 }
7535 }
7536 }
7537
7538 /*!
7539 * @function kevent_internal
7540 *
7541 * @brief
7542 * Common kevent code.
7543 *
7544 * @discussion
7545 * Needs to be inlined to specialize for legacy or modern and
7546 * eliminate dead code.
7547 *
7548 * This is the core logic of kevent entry points, that will:
7549 * - register kevents
7550 * - optionally scan the kqueue for events
7551 *
7552 * The caller is giving kevent_internal a reference on the kqueue
7553 * or its fileproc that needs to be cleaned up by kevent_cleanup().
7554 */
7555 OS_ALWAYS_INLINE
7556 static inline int
7557 kevent_internal(kqueue_t kqu,
7558 user_addr_t changelist, int nchanges,
7559 user_addr_t ueventlist, int nevents,
7560 int flags, kevent_ctx_t kectx, int32_t *retval,
7561 bool legacy)
7562 {
7563 int error = 0, noutputs = 0, register_rc;
7564
7565 /* only bound threads can receive events on workloops */
7566 if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
7567 #if CONFIG_WORKLOOP_DEBUG
7568 UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
7569 .uu_kqid = kqu.kqwl->kqwl_dynamicid,
7570 .uu_kq = error ? NULL : kqu.kq,
7571 .uu_error = error,
7572 .uu_nchanges = nchanges,
7573 .uu_nevents = nevents,
7574 .uu_flags = flags,
7575 });
7576 #endif // CONFIG_WORKLOOP_DEBUG
7577
7578 if (flags & KEVENT_FLAG_KERNEL) {
7579 /* see kevent_workq_internal */
7580 error = copyout(&kqu.kqwl->kqwl_dynamicid,
7581 ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
7582 kectx->kec_data_resid -= sizeof(kqueue_id_t);
7583 if (__improbable(error)) {
7584 goto out;
7585 }
7586 }
7587
7588 if (kevent_args_requesting_events(flags, nevents)) {
7589 /*
7590 * Disable the R2K notification while doing a register: if the
7591 * caller wants events too, we don't want the AST to be set if we
7592 * will process these events soon.
7593 */
7594 kqlock(kqu);
7595 kqu.kq->kq_state &= ~KQ_R2K_ARMED;
7596 kqunlock(kqu);
7597 flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
7598 }
7599 }
7600
7601 /* register all the change requests the user provided... */
7602 while (nchanges > 0 && error == 0) {
7603 struct kevent_qos_s kev;
7604 struct knote *kn = NULL;
7605
7606 if (legacy) {
7607 error = kevent_legacy_copyin(&changelist, &kev, flags);
7608 } else {
7609 error = kevent_modern_copyin(&changelist, &kev);
7610 }
7611 if (error) {
7612 break;
7613 }
7614
7615 register_rc = kevent_register(kqu.kq, &kev, &kn);
7616 if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
7617 thread_t thread = current_thread();
7618
7619 kqlock_held(kqu);
7620
7621 if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
7622 workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
7623 }
7624
7625 // f_post_register_wait is meant to call a continuation and not to
7626 // return, which is why we don't support FILTER_REGISTER_WAIT if
7627 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
7628 // waits isn't the last.
7629 //
7630 // It is implementable, but not used by any userspace code at the
7631 // moment, so for now return ENOTSUP if someone tries to do it.
7632 if (nchanges == 1 && noutputs < nevents &&
7633 (flags & KEVENT_FLAG_KERNEL) == 0 &&
7634 (flags & KEVENT_FLAG_PARKING) == 0 &&
7635 (flags & KEVENT_FLAG_ERROR_EVENTS) &&
7636 (flags & KEVENT_FLAG_WORKLOOP)) {
7637 uthread_t ut = get_bsdthread_info(thread);
7638
7639 /*
7640 * store the continuation/completion data in the uthread
7641 *
7642 * Note: the kectx aliases with this,
7643 * and is destroyed in the process.
7644 */
7645 ut->uu_save.uus_kevent_register = (struct _kevent_register){
7646 .kev = kev,
7647 .kqwl = kqu.kqwl,
7648 .eventout = noutputs,
7649 .ueventlist = ueventlist,
7650 };
7651 knote_fops(kn)->f_post_register_wait(ut, kn,
7652 &ut->uu_save.uus_kevent_register);
7653 __builtin_unreachable();
7654 }
7655 kqunlock(kqu);
7656
7657 kev.flags |= EV_ERROR;
7658 kev.data = ENOTSUP;
7659 } else {
7660 assert((register_rc & FILTER_REGISTER_WAIT) == 0);
7661 }
7662
7663 // keep in sync with kevent_register_wait_return()
7664 if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
7665 if ((kev.flags & EV_ERROR) == 0) {
7666 kev.flags |= EV_ERROR;
7667 kev.data = 0;
7668 }
7669 if (legacy) {
7670 error = kevent_legacy_copyout(&kev, &ueventlist, flags);
7671 } else {
7672 error = kevent_modern_copyout(&kev, &ueventlist);
7673 }
7674 if (error == 0) {
7675 noutputs++;
7676 }
7677 } else if (kev.flags & EV_ERROR) {
7678 error = (int)kev.data;
7679 }
7680 nchanges--;
7681 }
7682
7683 if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
7684 nevents > 0 && noutputs == 0 && error == 0) {
7685 kectx->kec_process_flags = flags;
7686 kectx->kec_process_nevents = nevents;
7687 kectx->kec_process_noutputs = 0;
7688 kectx->kec_process_eventlist = ueventlist;
7689
7690 if (legacy) {
7691 error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
7692 } else {
7693 error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
7694 }
7695
7696 noutputs = kectx->kec_process_noutputs;
7697 } else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
7698 /*
7699 * If we didn't go through kqworkloop_end_processing(),
7700 * we need to do it here.
7701 *
7702 * kqueue_scan will call kqworkloop_end_processing(),
7703 * so we only need to do it if we didn't scan.
7704 */
7705 kqlock(kqu);
7706 kqworkloop_end_processing(kqu.kqwl, 0, 0);
7707 kqunlock(kqu);
7708 }
7709
7710 *retval = noutputs;
7711 out:
7712 return kevent_cleanup(kqu.kq, flags, error, kectx);
7713 }
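/*
 * Editor's note: a small userspace sketch (not part of this file) of the
 * "register then scan" flow implemented by kevent_internal() above. Passing
 * EV_RECEIPT with a change forces a per-change receipt to be copied out with
 * EV_ERROR set and data holding 0 or the registration errno, instead of the
 * error being folded into the syscall return value. Identifiers are
 * illustrative only.
 */
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int kq = kqueue();
	int fds[2];
	struct kevent change, receipt;

	pipe(fds);
	EV_SET(&change, fds[0], EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);

	/* nevents > 0, so the receipt is reported as an EV_ERROR event and
	 * the call returns immediately instead of scanning for events */
	int n = kevent(kq, &change, 1, &receipt, 1, NULL);
	if (n == 1 && (receipt.flags & EV_ERROR)) {
		/* data == 0 means the registration succeeded */
		printf("registration status: %ld\n", (long)receipt.data);
	}
	close(fds[0]);
	close(fds[1]);
	close(kq);
	return 0;
}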
7714
7715 #pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal
7716
7717 /*!
7718 * @function kevent_modern_internal
7719 *
7720 * @brief
7721 * The backend of the kevent_id and kevent_workq_internal entry points.
7722 *
7723 * @discussion
7724 * Needs to be noinline due to the number of arguments.
7725 */
7726 OS_NOINLINE
7727 static int
7728 kevent_modern_internal(kqueue_t kqu,
7729 user_addr_t changelist, int nchanges,
7730 user_addr_t ueventlist, int nevents,
7731 int flags, kevent_ctx_t kectx, int32_t *retval)
7732 {
7733 return kevent_internal(kqu.kq, changelist, nchanges,
7734 ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
7735 }
7736
7737 /*!
7738 * @function kevent_id
7739 *
7740 * @brief
7741 * The kevent_id() syscall.
7742 */
7743 int
7744 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
7745 {
7746 int error, flags = uap->flags & KEVENT_FLAG_USER;
7747 uthread_t uth = current_uthread();
7748 workq_threadreq_t kqr = uth->uu_kqr_bound;
7749 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7750 kqueue_t kqu;
7751
7752 flags = kevent_adjust_flags_for_proc(p, flags);
7753 flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;
7754
7755 if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
7756 KEVENT_FLAG_WORKLOOP)) {
7757 return EINVAL;
7758 }
7759
7760 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
7761 if (__improbable(error)) {
7762 return error;
7763 }
7764
7765 kectx->kec_deadline = 0;
7766 kectx->kec_fp = NULL;
7767 kectx->kec_fd = -1;
7768 /* the kec_process_* fields are only filled if kqueue_scan is called */
7769
7770 /*
7771 * Get the kq we are going to be working on
7772 * As a fastpath, look at the currently bound workloop.
7773 */
7774 kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
7775 if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
7776 if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
7777 return EEXIST;
7778 }
7779 kqworkloop_retain(kqu.kqwl);
7780 } else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
7781 return EXDEV;
7782 } else {
7783 error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
7784 if (__improbable(error)) {
7785 return error;
7786 }
7787 }
7788
7789 return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
7790 uap->eventlist, uap->nevents, flags, kectx, retval);
7791 }
7792
7793 /*!
7794 * @function kevent_workq_internal
7795 *
7796 * @discussion
7797 * This function is exported for the sake of the workqueue subsystem.
7798 *
7799 * It is called in two ways:
7800 * - when a thread is about to go to userspace to ask for pending events
7801 * - when a thread is returning from userspace, handing events back
7802 *
7803 * the workqueue subsystem will only use the following flags:
7804 * - KEVENT_FLAG_STACK_DATA (always)
7805 * - KEVENT_FLAG_IMMEDIATE (always)
7806 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
7807 * userspace).
7808 *
7809 * It implicitly acts on the bound kqueue, and for the case of workloops
7810 * will copyout the kqueue ID before anything else.
7811 *
7812 *
7813 * Pthread will have set up the various arguments to fit this stack layout:
7814 *
7815 * +-------....----+--------------+-----------+--------------------+
7816 * | user stack | data avail | nevents | pthread_self() |
7817 * +-------....----+--------------+-----------+--------------------+
7818 * ^ ^
7819 * data_out eventlist
7820 *
7821 * When a workloop is used, the workloop ID is copied out right before
7822 * the eventlist and is taken from the data buffer.
7823 *
7824 * @warning
7825 * This function is carefully tailored to not make any call except the final tail
7826 * call into kevent_modern_internal. (LTO inlines current_uthread()).
7827 *
7828 * This function is performance sensitive due to the workq subsystem.
7829 */
7830 int
7831 kevent_workq_internal(struct proc *p,
7832 user_addr_t changelist, int nchanges,
7833 user_addr_t eventlist, int nevents,
7834 user_addr_t data_out, user_size_t *data_available,
7835 unsigned int flags, int32_t *retval)
7836 {
7837 uthread_t uth = current_uthread();
7838 workq_threadreq_t kqr = uth->uu_kqr_bound;
7839 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7840 kqueue_t kqu;
7841
7842 assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
7843 flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));
7844
7845 kectx->kec_data_out = data_out;
7846 kectx->kec_data_avail = (uint64_t)data_available;
7847 kectx->kec_data_size = *data_available;
7848 kectx->kec_data_resid = *data_available;
7849 kectx->kec_deadline = 0;
7850 kectx->kec_fp = NULL;
7851 kectx->kec_fd = -1;
7852 /* the kec_process_* fields are only filled if kqueue_scan is called */
7853
7854 flags = kevent_adjust_flags_for_proc(p, flags);
7855
7856 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
7857 kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
7858 kqworkloop_retain(kqu.kqwl);
7859
7860 flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
7861 KEVENT_FLAG_KERNEL;
7862 } else {
7863 kqu.kqwq = p->p_fd->fd_wqkqueue;
7864
7865 flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
7866 }
7867
7868 return kevent_modern_internal(kqu, changelist, nchanges,
7869 eventlist, nevents, flags, kectx, retval);
7870 }
7871
7872 /*!
7873 * @function kevent_qos
7874 *
7875 * @brief
7876 * The kevent_qos() syscall.
7877 */
7878 int
7879 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
7880 {
7881 uthread_t uth = current_uthread();
7882 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7883 int error, flags = uap->flags & KEVENT_FLAG_USER;
7884 struct kqueue *kq;
7885
7886 if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
7887 return EINVAL;
7888 }
7889
7890 flags = kevent_adjust_flags_for_proc(p, flags);
7891
7892 error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
7893 if (__improbable(error)) {
7894 return error;
7895 }
7896
7897 kectx->kec_deadline = 0;
7898 kectx->kec_fp = NULL;
7899 kectx->kec_fd = uap->fd;
7900 /* the kec_process_* fields are only filled if kqueue_scan is called */
7901
7902 /* get the kq we are going to be working on */
7903 if (__probable(flags & KEVENT_FLAG_WORKQ)) {
7904 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
7905 } else {
7906 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
7907 }
7908 if (__improbable(error)) {
7909 return error;
7910 }
7911
7912 return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
7913 uap->eventlist, uap->nevents, flags, kectx, retval);
7914 }
7915
7916 #pragma mark legacy syscalls: kevent, kevent64
7917
7918 /*!
7919 * @function kevent_legacy_get_deadline
7920 *
7921 * @brief
7922 * Compute the deadline for the legacy kevent syscalls.
7923 *
7924 * @discussion
7925 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
7926 * as this takes precedence over the deadline.
7927 *
7928 * This function will fail if utimeout is USER_ADDR_NULL
7929 * (the caller should check).
7930 */
7931 static int
7932 kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
7933 {
7934 struct timespec ts;
7935
7936 if (flags & KEVENT_FLAG_PROC64) {
7937 struct user64_timespec ts64;
7938 int error = copyin(utimeout, &ts64, sizeof(ts64));
7939 if (__improbable(error)) {
7940 return error;
7941 }
7942 ts.tv_sec = (unsigned long)ts64.tv_sec;
7943 ts.tv_nsec = (long)ts64.tv_nsec;
7944 } else {
7945 struct user32_timespec ts32;
7946 int error = copyin(utimeout, &ts32, sizeof(ts32));
7947 if (__improbable(error)) {
7948 return error;
7949 }
7950 ts.tv_sec = ts32.tv_sec;
7951 ts.tv_nsec = ts32.tv_nsec;
7952 }
7953 if (!timespec_is_valid(&ts)) {
7954 return EINVAL;
7955 }
7956
7957 clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
7958 return 0;
7959 }
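/*
 * Editor's note: an illustrative userspace snippet (not part of this file).
 * The timespec passed to the legacy kevent()/kevent64() syscalls is copied in
 * and validated here; a malformed value (negative tv_sec, or tv_nsec outside
 * [0, 1e9)) makes the syscall fail with EINVAL before any event processing.
 */
#include <sys/event.h>
#include <assert.h>
#include <errno.h>

static void
bad_timeout_example(int kq)
{
	struct kevent ev;
	struct timespec bad = { .tv_sec = 0, .tv_nsec = 1000000000L }; /* must be < 1e9 */

	int n = kevent(kq, NULL, 0, &ev, 1, &bad);
	assert(n == -1 && errno == EINVAL);
}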
7960
7961 /*!
7962 * @function kevent_legacy_internal
7963 *
7964 * @brief
7965 * The core implementation for kevent and kevent64
7966 */
7967 OS_NOINLINE
7968 static int
7969 kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
7970 int32_t *retval, int flags)
7971 {
7972 uthread_t uth = current_uthread();
7973 kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
7974 struct kqueue *kq;
7975 int error;
7976
7977 if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
7978 return EINVAL;
7979 }
7980
7981 flags = kevent_adjust_flags_for_proc(p, flags);
7982
7983 kectx->kec_data_out = 0;
7984 kectx->kec_data_avail = 0;
7985 kectx->kec_data_size = 0;
7986 kectx->kec_data_resid = 0;
7987 kectx->kec_deadline = 0;
7988 kectx->kec_fp = NULL;
7989 kectx->kec_fd = uap->fd;
7990 /* the kec_process_* fields are only filled if kqueue_scan is called */
7991
7992 /* convert timeout to absolute - if we have one (and not immediate) */
7993 if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
7994 error = kevent_legacy_get_deadline(flags, uap->timeout,
7995 &kectx->kec_deadline);
7996 if (__improbable(error)) {
7997 return error;
7998 }
7999 }
8000
8001 /* get the kq we are going to be working on */
8002 if (flags & KEVENT_FLAG_WORKQ) {
8003 error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
8004 } else {
8005 error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
8006 }
8007 if (__improbable(error)) {
8008 return error;
8009 }
8010
8011 return kevent_internal(kq, uap->changelist, uap->nchanges,
8012 uap->eventlist, uap->nevents, flags, kectx, retval,
8013 /*legacy*/ true);
8014 }
8015
8016 /*!
8017 * @function kevent
8018 *
8019 * @brief
8020 * The legacy kevent() syscall.
8021 */
8022 int
8023 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
8024 {
8025 struct kevent64_args args = {
8026 .fd = uap->fd,
8027 .changelist = uap->changelist,
8028 .nchanges = uap->nchanges,
8029 .eventlist = uap->eventlist,
8030 .nevents = uap->nevents,
8031 .timeout = uap->timeout,
8032 };
8033
8034 return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
8035 }
8036
8037 /*!
8038 * @function kevent64
8039 *
8040 * @brief
8041 * The legacy kevent64() syscall.
8042 */
8043 int
8044 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
8045 {
8046 int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
8047 return kevent_legacy_internal(p, uap, retval, flags);
8048 }
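/*
 * Editor's note: a brief userspace sketch (not part of this file) of the
 * kevent64(2) variant handled above. It behaves like kevent(2) but uses
 * struct kevent64_s, whose ident/udata fields are always 64-bit and which
 * carries two extra ext[] words; public flags such as KEVENT_FLAG_IMMEDIATE
 * can be passed directly. Identifiers are illustrative only.
 */
#include <sys/event.h>

static int
poll_fd_once_64(int kq, int fd)
{
	struct kevent64_s change, event;

	EV_SET64(&change, fd, EVFILT_READ, EV_ADD, 0, 0, /*udata*/ 0, 0, 0);

	/* register and immediately poll, without blocking */
	return kevent64(kq, &change, 1, &event, 1, KEVENT_FLAG_IMMEDIATE, NULL);
}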
8049
8050 #pragma mark - socket interface
8051
8052 #if SOCKETS
8053 #include <sys/param.h>
8054 #include <sys/socket.h>
8055 #include <sys/protosw.h>
8056 #include <sys/domain.h>
8057 #include <sys/mbuf.h>
8058 #include <sys/kern_event.h>
8059 #include <sys/malloc.h>
8060 #include <sys/sys_domain.h>
8061 #include <sys/syslog.h>
8062
8063 #ifndef ROUNDUP64
8064 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8065 #endif
8066
8067 #ifndef ADVANCE64
8068 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8069 #endif
8070
8071 static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
8072 static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);
8073
8074 static int kev_attach(struct socket *so, int proto, struct proc *p);
8075 static int kev_detach(struct socket *so);
8076 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8077 struct ifnet *ifp, struct proc *p);
8078 static lck_mtx_t * event_getlock(struct socket *, int);
8079 static int event_lock(struct socket *, int, void *);
8080 static int event_unlock(struct socket *, int, void *);
8081
8082 static int event_sofreelastref(struct socket *);
8083 static void kev_delete(struct kern_event_pcb *);
8084
8085 static struct pr_usrreqs event_usrreqs = {
8086 .pru_attach = kev_attach,
8087 .pru_control = kev_control,
8088 .pru_detach = kev_detach,
8089 .pru_soreceive = soreceive,
8090 };
8091
8092 static struct protosw eventsw[] = {
8093 {
8094 .pr_type = SOCK_RAW,
8095 .pr_protocol = SYSPROTO_EVENT,
8096 .pr_flags = PR_ATOMIC,
8097 .pr_usrreqs = &event_usrreqs,
8098 .pr_lock = event_lock,
8099 .pr_unlock = event_unlock,
8100 .pr_getlock = event_getlock,
8101 }
8102 };
8103
8104 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8105 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8106
8107 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8108 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");
8109
8110 struct kevtstat kevtstat;
8111 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8112 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8113 kevt_getstat, "S,kevtstat", "");
8114
8115 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8116 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8117 kevt_pcblist, "S,xkevtpcb", "");
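/*
 * Editor's note: an illustrative userspace sketch (not part of this file) for
 * reading the statistics exported by the SYSCTL_PROC declarations above. The
 * MIB name "net.systm.kevt.stats" is inferred from the _net_systm_kevt node;
 * since struct kevtstat lives in a private header, the payload is read here
 * into a raw byte buffer.
 */
#include <sys/sysctl.h>
#include <stdio.h>

static void
dump_kevt_stats(void)
{
	unsigned char buf[256];                 /* large enough for struct kevtstat */
	size_t len = sizeof(buf);

	if (sysctlbyname("net.systm.kevt.stats", buf, &len, NULL, 0) == 0) {
		printf("kevtstat: %zu bytes\n", len);
	} else {
		perror("sysctlbyname(net.systm.kevt.stats)");
	}
}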
8118
8119 static lck_mtx_t *
8120 event_getlock(struct socket *so, int flags)
8121 {
8122 #pragma unused(flags)
8123 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8124
8125 if (so->so_pcb != NULL) {
8126 if (so->so_usecount < 0) {
8127 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8128 so, so->so_usecount, solockhistory_nr(so));
8129 /* NOTREACHED */
8130 }
8131 } else {
8132 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
8133 so, solockhistory_nr(so));
8134 /* NOTREACHED */
8135 }
8136 return &ev_pcb->evp_mtx;
8137 }
8138
8139 static int
8140 event_lock(struct socket *so, int refcount, void *lr)
8141 {
8142 void *lr_saved;
8143
8144 if (lr == NULL) {
8145 lr_saved = __builtin_return_address(0);
8146 } else {
8147 lr_saved = lr;
8148 }
8149
8150 if (so->so_pcb != NULL) {
8151 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8152 } else {
8153 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
8154 so, lr_saved, solockhistory_nr(so));
8155 /* NOTREACHED */
8156 }
8157
8158 if (so->so_usecount < 0) {
8159 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
8160 so, so->so_pcb, lr_saved, so->so_usecount,
8161 solockhistory_nr(so));
8162 /* NOTREACHED */
8163 }
8164
8165 if (refcount) {
8166 so->so_usecount++;
8167 }
8168
8169 so->lock_lr[so->next_lock_lr] = lr_saved;
8170 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
8171 return 0;
8172 }
8173
8174 static int
8175 event_unlock(struct socket *so, int refcount, void *lr)
8176 {
8177 void *lr_saved;
8178 lck_mtx_t *mutex_held;
8179
8180 if (lr == NULL) {
8181 lr_saved = __builtin_return_address(0);
8182 } else {
8183 lr_saved = lr;
8184 }
8185
8186 if (refcount) {
8187 so->so_usecount--;
8188 }
8189 if (so->so_usecount < 0) {
8190 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8191 so, so->so_usecount, solockhistory_nr(so));
8192 /* NOTREACHED */
8193 }
8194 if (so->so_pcb == NULL) {
8195 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
8196 so, so->so_usecount, (void *)lr_saved,
8197 solockhistory_nr(so));
8198 /* NOTREACHED */
8199 }
8200 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8201
8202 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8203 so->unlock_lr[so->next_unlock_lr] = lr_saved;
8204 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
8205
8206 if (so->so_usecount == 0) {
8207 VERIFY(so->so_flags & SOF_PCBCLEARING);
8208 event_sofreelastref(so);
8209 } else {
8210 lck_mtx_unlock(mutex_held);
8211 }
8212
8213 return 0;
8214 }
8215
8216 static int
8217 event_sofreelastref(struct socket *so)
8218 {
8219 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8220
8221 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8222
8223 so->so_pcb = NULL;
8224
8225 /*
8226 * Disable upcall in the event another thread is in kev_post_msg()
8227 * appending a record to the receive socket buffer, since sbwakeup()
8228 * may release the socket lock otherwise.
8229 */
8230 so->so_rcv.sb_flags &= ~SB_UPCALL;
8231 so->so_snd.sb_flags &= ~SB_UPCALL;
8232 so->so_event = sonullevent;
8233 lck_mtx_unlock(&(ev_pcb->evp_mtx));
8234
8235 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8236 lck_rw_lock_exclusive(&kev_rwlock);
8237 LIST_REMOVE(ev_pcb, evp_link);
8238 kevtstat.kes_pcbcount--;
8239 kevtstat.kes_gencnt++;
8240 lck_rw_done(&kev_rwlock);
8241 kev_delete(ev_pcb);
8242
8243 sofreelastref(so, 1);
8244 return 0;
8245 }
8246
8247 static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));
8248
8249 static
8250 struct kern_event_head kern_event_head;
8251
8252 static u_int32_t static_event_id = 0;
8253
8254 static ZONE_DECLARE(ev_pcb_zone, "kerneventpcb",
8255 sizeof(struct kern_event_pcb), ZC_ZFREE_CLEARMEM);
8256
8257 /*
8258 * Install the protosw entries for the NKE manager. Invoked at extension load time.
8259 */
8260 void
8261 kern_event_init(struct domain *dp)
8262 {
8263 struct protosw *pr;
8264 int i;
8265
8266 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8267 VERIFY(dp == systemdomain);
8268
8269 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
8270 net_add_proto(pr, dp, 1);
8271 }
8272 }
8273
8274 static int
8275 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8276 {
8277 int error = 0;
8278 struct kern_event_pcb *ev_pcb;
8279
8280 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8281 if (error != 0) {
8282 return error;
8283 }
8284
8285 ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
8286 lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);
8287
8288 ev_pcb->evp_socket = so;
8289 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8290
8291 so->so_pcb = (caddr_t) ev_pcb;
8292 lck_rw_lock_exclusive(&kev_rwlock);
8293 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8294 kevtstat.kes_pcbcount++;
8295 kevtstat.kes_gencnt++;
8296 lck_rw_done(&kev_rwlock);
8297
8298 return error;
8299 }
8300
8301 static void
8302 kev_delete(struct kern_event_pcb *ev_pcb)
8303 {
8304 VERIFY(ev_pcb != NULL);
8305 lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
8306 zfree(ev_pcb_zone, ev_pcb);
8307 }
8308
8309 static int
8310 kev_detach(struct socket *so)
8311 {
8312 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8313
8314 if (ev_pcb != NULL) {
8315 soisdisconnected(so);
8316 so->so_flags |= SOF_PCBCLEARING;
8317 }
8318
8319 return 0;
8320 }
8321
8322 /*
8323 * For now, kev_vendor_code and mbuf_tags use the same
8324 * mechanism.
8325 */
8326 errno_t
8327 kev_vendor_code_find(
8328 const char *string,
8329 u_int32_t *out_vendor_code)
8330 {
8331 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8332 return EINVAL;
8333 }
8334 return net_str_id_find_internal(string, out_vendor_code,
8335 NSI_VENDOR_CODE, 1);
8336 }
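/*
 * Editor's note: a kernel-extension-side sketch (not part of this file)
 * showing the intended use of the kev_vendor_code_find() and kev_msg_post()
 * KPIs defined here. The vendor string, class/subclass numbers and payload
 * are illustrative; the dv[] vector is terminated by a zero data_length,
 * matching the loop in kev_post_msg() below.
 */
#include <sys/types.h>
#include <sys/kern_event.h>

static errno_t
post_example_event(void)
{
	u_int32_t vendor_code;
	u_int32_t payload = 42;                 /* arbitrary event payload */
	struct kev_msg msg = { 0 };
	errno_t err;

	err = kev_vendor_code_find("com.example.driver", &vendor_code);
	if (err != 0) {
		return err;
	}

	msg.vendor_code = vendor_code;
	msg.kev_class = 1;                      /* illustrative class/subclass numbers */
	msg.kev_subclass = 1;
	msg.event_code = 1;
	msg.dv[0].data_length = sizeof(payload);
	msg.dv[0].data_ptr = &payload;
	msg.dv[1].data_length = 0;              /* terminates the vector */

	return kev_msg_post(&msg);
}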
8337
8338 errno_t
8339 kev_msg_post(struct kev_msg *event_msg)
8340 {
8341 mbuf_tag_id_t min_vendor, max_vendor;
8342
8343 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8344
8345 if (event_msg == NULL) {
8346 return EINVAL;
8347 }
8348
8349 /*
8350 * Limit third parties to posting events for registered vendor codes
8351 * only
8352 */
8353 if (event_msg->vendor_code < min_vendor ||
8354 event_msg->vendor_code > max_vendor) {
8355 os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
8356 return EINVAL;
8357 }
8358 return kev_post_msg(event_msg);
8359 }
8360
8361 int
8362 kev_post_msg(struct kev_msg *event_msg)
8363 {
8364 struct mbuf *m, *m2;
8365 struct kern_event_pcb *ev_pcb;
8366 struct kern_event_msg *ev;
8367 char *tmp;
8368 u_int32_t total_size;
8369 int i;
8370
8371 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8372 total_size = KEV_MSG_HEADER_SIZE;
8373
8374 for (i = 0; i < 5; i++) {
8375 if (event_msg->dv[i].data_length == 0) {
8376 break;
8377 }
8378 total_size += event_msg->dv[i].data_length;
8379 }
8380
8381 if (total_size > MLEN) {
8382 os_atomic_inc(&kevtstat.kes_toobig, relaxed);
8383 return EMSGSIZE;
8384 }
8385
8386 m = m_get(M_WAIT, MT_DATA);
8387 if (m == 0) {
8388 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8389 return ENOMEM;
8390 }
8391 ev = mtod(m, struct kern_event_msg *);
8392 total_size = KEV_MSG_HEADER_SIZE;
8393
8394 tmp = (char *) &ev->event_data[0];
8395 for (i = 0; i < 5; i++) {
8396 if (event_msg->dv[i].data_length == 0) {
8397 break;
8398 }
8399
8400 total_size += event_msg->dv[i].data_length;
8401 bcopy(event_msg->dv[i].data_ptr, tmp,
8402 event_msg->dv[i].data_length);
8403 tmp += event_msg->dv[i].data_length;
8404 }
8405
8406 ev->id = ++static_event_id;
8407 ev->total_size = total_size;
8408 ev->vendor_code = event_msg->vendor_code;
8409 ev->kev_class = event_msg->kev_class;
8410 ev->kev_subclass = event_msg->kev_subclass;
8411 ev->event_code = event_msg->event_code;
8412
8413 m->m_len = total_size;
8414 lck_rw_lock_shared(&kev_rwlock);
8415 for (ev_pcb = LIST_FIRST(&kern_event_head);
8416 ev_pcb;
8417 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8418 lck_mtx_lock(&ev_pcb->evp_mtx);
8419 if (ev_pcb->evp_socket->so_pcb == NULL) {
8420 lck_mtx_unlock(&ev_pcb->evp_mtx);
8421 continue;
8422 }
8423 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8424 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8425 lck_mtx_unlock(&ev_pcb->evp_mtx);
8426 continue;
8427 }
8428
8429 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
8430 if (ev_pcb->evp_class_filter != ev->kev_class) {
8431 lck_mtx_unlock(&ev_pcb->evp_mtx);
8432 continue;
8433 }
8434
8435 if ((ev_pcb->evp_subclass_filter !=
8436 KEV_ANY_SUBCLASS) &&
8437 (ev_pcb->evp_subclass_filter !=
8438 ev->kev_subclass)) {
8439 lck_mtx_unlock(&ev_pcb->evp_mtx);
8440 continue;
8441 }
8442 }
8443 }
8444
8445 m2 = m_copym(m, 0, m->m_len, M_WAIT);
8446 if (m2 == 0) {
8447 os_atomic_inc(&kevtstat.kes_nomem, relaxed);
8448 m_free(m);
8449 lck_mtx_unlock(&ev_pcb->evp_mtx);
8450 lck_rw_done(&kev_rwlock);
8451 return ENOMEM;
8452 }
8453 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
8454 /*
8455 * We use "m" for the socket stats as it would be
8456 * unsafe to use "m2"
8457 */
8458 so_inc_recv_data_stat(ev_pcb->evp_socket,
8459 1, m->m_len, MBUF_TC_BE);
8460
8461 sorwakeup(ev_pcb->evp_socket);
8462 os_atomic_inc(&kevtstat.kes_posted, relaxed);
8463 } else {
8464 os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
8465 }
8466 lck_mtx_unlock(&ev_pcb->evp_mtx);
8467 }
8468 m_free(m);
8469 lck_rw_done(&kev_rwlock);
8470
8471 return 0;
8472 }
8473
8474 static int
8475 kev_control(struct socket *so,
8476 u_long cmd,
8477 caddr_t data,
8478 __unused struct ifnet *ifp,
8479 __unused struct proc *p)
8480 {
8481 struct kev_request *kev_req = (struct kev_request *) data;
8482 struct kern_event_pcb *ev_pcb;
8483 struct kev_vendor_code *kev_vendor;
8484 u_int32_t *id_value = (u_int32_t *) data;
8485
8486 switch (cmd) {
8487 case SIOCGKEVID:
8488 *id_value = static_event_id;
8489 break;
8490 case SIOCSKEVFILT:
8491 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8492 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
8493 ev_pcb->evp_class_filter = kev_req->kev_class;
8494 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
8495 break;
8496 case SIOCGKEVFILT:
8497 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8498 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
8499 kev_req->kev_class = ev_pcb->evp_class_filter;
8500 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
8501 break;
8502 case SIOCGKEVVENDOR:
8503 kev_vendor = (struct kev_vendor_code *)data;
8504 /* Make sure string is NULL terminated */
8505 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
8506 return net_str_id_find_internal(kev_vendor->vendor_string,
8507 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
8508 default:
8509 return ENOTSUP;
8510 }
8511
8512 return 0;
8513 }
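/*
 * Editor's note: a userspace sketch (not part of this file) of the consumer
 * side of this protocol: open a PF_SYSTEM/SYSPROTO_EVENT socket, install a
 * filter with SIOCSKEVFILT (handled by kev_control() above), then read
 * struct kern_event_msg records posted by kev_post_msg(). The filter values
 * shown (Apple vendor, network class) are just one common choice.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_event.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

static void
watch_kernel_events(void)
{
	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
	struct kev_request req = {
		.vendor_code = KEV_VENDOR_APPLE,
		.kev_class = KEV_NETWORK_CLASS,
		.kev_subclass = KEV_ANY_SUBCLASS,
	};
	char buf[1024];

	if (s < 0 || ioctl(s, SIOCSKEVFILT, &req) < 0) {
		perror("kernel event socket");
		return;
	}

	/* each recv() returns one complete kern_event_msg record (PR_ATOMIC) */
	ssize_t n = recv(s, buf, sizeof(buf), 0);
	if (n >= (ssize_t)sizeof(struct kern_event_msg)) {
		struct kern_event_msg *msg = (struct kern_event_msg *)buf;
		printf("event class %u subclass %u code %u (%u bytes)\n",
		    msg->kev_class, msg->kev_subclass, msg->event_code,
		    msg->total_size);
	}
	close(s);
}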
8514
8515 int
8516 kevt_getstat SYSCTL_HANDLER_ARGS
8517 {
8518 #pragma unused(oidp, arg1, arg2)
8519 int error = 0;
8520
8521 lck_rw_lock_shared(&kev_rwlock);
8522
8523 if (req->newptr != USER_ADDR_NULL) {
8524 error = EPERM;
8525 goto done;
8526 }
8527 if (req->oldptr == USER_ADDR_NULL) {
8528 req->oldidx = sizeof(struct kevtstat);
8529 goto done;
8530 }
8531
8532 error = SYSCTL_OUT(req, &kevtstat,
8533 MIN(sizeof(struct kevtstat), req->oldlen));
8534 done:
8535 lck_rw_done(&kev_rwlock);
8536
8537 return error;
8538 }
8539
8540 __private_extern__ int
8541 kevt_pcblist SYSCTL_HANDLER_ARGS
8542 {
8543 #pragma unused(oidp, arg1, arg2)
8544 int error = 0;
8545 uint64_t n, i;
8546 struct xsystmgen xsg;
8547 void *buf = NULL;
8548 size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
8549 ROUNDUP64(sizeof(struct xsocket_n)) +
8550 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
8551 ROUNDUP64(sizeof(struct xsockstat_n));
8552 struct kern_event_pcb *ev_pcb;
8553
8554 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
8555 if (buf == NULL) {
8556 return ENOMEM;
8557 }
8558
8559 lck_rw_lock_shared(&kev_rwlock);
8560
8561 n = kevtstat.kes_pcbcount;
8562
8563 if (req->oldptr == USER_ADDR_NULL) {
8564 req->oldidx = (size_t) ((n + n / 8) * item_size);
8565 goto done;
8566 }
8567 if (req->newptr != USER_ADDR_NULL) {
8568 error = EPERM;
8569 goto done;
8570 }
8571 bzero(&xsg, sizeof(xsg));
8572 xsg.xg_len = sizeof(xsg);
8573 xsg.xg_count = n;
8574 xsg.xg_gen = kevtstat.kes_gencnt;
8575 xsg.xg_sogen = so_gencnt;
8576 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8577 if (error) {
8578 goto done;
8579 }
8580 /*
8581 * We are done if there is no pcb
8582 */
8583 if (n == 0) {
8584 goto done;
8585 }
8586
8587 i = 0;
8588 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
8589 i < n && ev_pcb != NULL;
8590 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8591 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
8592 struct xsocket_n *xso = (struct xsocket_n *)
8593 ADVANCE64(xk, sizeof(*xk));
8594 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
8595 ADVANCE64(xso, sizeof(*xso));
8596 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
8597 ADVANCE64(xsbrcv, sizeof(*xsbrcv));
8598 struct xsockstat_n *xsostats = (struct xsockstat_n *)
8599 ADVANCE64(xsbsnd, sizeof(*xsbsnd));
8600
8601 bzero(buf, item_size);
8602
8603 lck_mtx_lock(&ev_pcb->evp_mtx);
8604
8605 xk->kep_len = sizeof(struct xkevtpcb);
8606 xk->kep_kind = XSO_EVT;
8607 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
8608 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
8609 xk->kep_class_filter = ev_pcb->evp_class_filter;
8610 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
8611
8612 sotoxsocket_n(ev_pcb->evp_socket, xso);
8613 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8614 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
8615 sbtoxsockbuf_n(ev_pcb->evp_socket ?
8616 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
8617 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
8618
8619 lck_mtx_unlock(&ev_pcb->evp_mtx);
8620
8621 error = SYSCTL_OUT(req, buf, item_size);
8622 }
8623
8624 if (error == 0) {
8625 /*
8626 * Give the user an updated idea of our state.
8627 * If the generation differs from what we told
8628 * her before, she knows that something happened
8629 * while we were processing this request, and it
8630 * might be necessary to retry.
8631 */
8632 bzero(&xsg, sizeof(xsg));
8633 xsg.xg_len = sizeof(xsg);
8634 xsg.xg_count = n;
8635 xsg.xg_gen = kevtstat.kes_gencnt;
8636 xsg.xg_sogen = so_gencnt;
8637 error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
8638 if (error) {
8639 goto done;
8640 }
8641 }
8642
8643 done:
8644 lck_rw_done(&kev_rwlock);
8645
8646 if (buf != NULL) {
8647 FREE(buf, M_TEMP);
8648 }
8649
8650 return error;
8651 }
8652
8653 #endif /* SOCKETS */
8654
8655
8656 int
8657 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
8658 {
8659 struct vinfo_stat * st;
8660
8661 st = &kinfo->kq_stat;
8662
8663 st->vst_size = kq->kq_count;
8664 if (kq->kq_state & KQ_KEV_QOS) {
8665 st->vst_blksize = sizeof(struct kevent_qos_s);
8666 } else if (kq->kq_state & KQ_KEV64) {
8667 st->vst_blksize = sizeof(struct kevent64_s);
8668 } else {
8669 st->vst_blksize = sizeof(struct kevent);
8670 }
8671 st->vst_mode = S_IFIFO;
8672 st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
8673 ((struct kqworkloop *)kq)->kqwl_dynamicid : 0;
8674
8675 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
8676 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
8677 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
8678
8679 return 0;
8680 }
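/*
 * Editor's note: a userspace sketch (not part of this file) showing how the
 * information filled in by fill_kqueueinfo() can be retrieved through
 * libproc, assuming the PROC_PIDFDKQUEUEINFO flavor of proc_pidfdinfo() and
 * struct kqueue_fdinfo from <sys/proc_info.h>.
 */
#include <sys/types.h>
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>

static void
print_kqueue_fdinfo(pid_t pid, int kq_fd)
{
	struct kqueue_fdinfo kqfi;

	int n = proc_pidfdinfo(pid, kq_fd, PROC_PIDFDKQUEUEINFO,
	    &kqfi, sizeof(kqfi));
	if (n == (int)sizeof(kqfi)) {
		printf("kqueue fd %d: %lld queued events, state 0x%x\n",
		    kq_fd, (long long)kqfi.kqueueinfo.kq_stat.vst_size,
		    kqfi.kqueueinfo.kq_state);
	}
}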
8681
8682 static int
8683 fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
8684 {
8685 workq_threadreq_t kqr = &kqwl->kqwl_request;
8686 workq_threadreq_param_t trp = {};
8687 int err;
8688
8689 if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
8690 return EINVAL;
8691 }
8692
8693 if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
8694 return err;
8695 }
8696
8697 kqlock(kqwl);
8698
8699 kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
8700 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
8701 kqdi->kqdi_request_state = kqr->tr_state;
8702 kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
8703 kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
8704 kqdi->kqdi_sync_waiters = 0;
8705 kqdi->kqdi_sync_waiter_qos = 0;
8706
8707 trp.trp_value = kqwl->kqwl_params;
8708 if (trp.trp_flags & TRP_PRIORITY) {
8709 kqdi->kqdi_pri = trp.trp_pri;
8710 } else {
8711 kqdi->kqdi_pri = 0;
8712 }
8713
8714 if (trp.trp_flags & TRP_POLICY) {
8715 kqdi->kqdi_pol = trp.trp_pol;
8716 } else {
8717 kqdi->kqdi_pol = 0;
8718 }
8719
8720 if (trp.trp_flags & TRP_CPUPERCENT) {
8721 kqdi->kqdi_cpupercent = trp.trp_cpupercent;
8722 } else {
8723 kqdi->kqdi_cpupercent = 0;
8724 }
8725
8726 kqunlock(kqwl);
8727
8728 return 0;
8729 }
8730
8731
8732 void
8733 knote_markstayactive(struct knote *kn)
8734 {
8735 struct kqueue *kq = knote_get_kq(kn);
8736 kq_index_t qos;
8737
8738 kqlock(kq);
8739 kn->kn_status |= KN_STAYACTIVE;
8740
8741 /*
8742 * Making a knote stay active is a property of the knote that must be
8743 * established before it is fully attached.
8744 */
8745 assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);
8746
8747 /* handle all stayactive knotes on the (appropriate) manager */
8748 if (kq->kq_state & KQ_WORKLOOP) {
8749 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8750
8751 qos = _pthread_priority_thread_qos(kn->kn_qos);
8752 assert(qos && qos < THREAD_QOS_LAST);
8753 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
8754 qos = KQWL_BUCKET_STAYACTIVE;
8755 } else if (kq->kq_state & KQ_WORKQ) {
8756 qos = KQWQ_QOS_MANAGER;
8757 } else {
8758 qos = THREAD_QOS_UNSPECIFIED;
8759 }
8760
8761 kn->kn_qos_override = qos;
8762 kn->kn_qos_index = qos;
8763
8764 knote_activate(kq, kn, FILTER_ACTIVE);
8765 kqunlock(kq);
8766 }
8767
8768 void
8769 knote_clearstayactive(struct knote *kn)
8770 {
8771 struct kqueue *kq = knote_get_kq(kn);
8772 kqlock(kq);
8773 kn->kn_status &= ~(KN_STAYACTIVE | KN_ACTIVE);
8774 knote_dequeue(kq, kn);
8775 kqunlock(kq);
8776 }
8777
8778 static unsigned long
8779 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
8780 unsigned long buflen, unsigned long nknotes)
8781 {
8782 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
8783 if (kq == knote_get_kq(kn)) {
8784 if (nknotes < buflen) {
8785 struct kevent_extinfo *info = &buf[nknotes];
8786
8787 kqlock(kq);
8788
8789 info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
8790 if (knote_has_qos(kn)) {
8791 info->kqext_kev.qos =
8792 _pthread_priority_thread_qos_fast(kn->kn_qos);
8793 } else {
8794 info->kqext_kev.qos = kn->kn_qos_override;
8795 }
8796 info->kqext_kev.filter |= 0xff00; /* sign extend filter */
8797 info->kqext_kev.xflags = 0; /* this is where sfflags lives */
8798 info->kqext_kev.data = 0; /* this is where sdata lives */
8799 info->kqext_sdata = kn->kn_sdata;
8800 info->kqext_status = kn->kn_status;
8801 info->kqext_sfflags = kn->kn_sfflags;
8802
8803 kqunlock(kq);
8804 }
8805
8806 /* we return total number of knotes, which may be more than requested */
8807 nknotes++;
8808 }
8809 }
8810
8811 return nknotes;
8812 }
8813
8814 int
8815 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
8816 int32_t *nkqueues_out)
8817 {
8818 proc_t p = (proc_t)proc;
8819 struct filedesc *fdp = p->p_fd;
8820 unsigned int nkqueues = 0;
8821 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
8822 size_t buflen, bufsize;
8823 kqueue_id_t *kq_ids = NULL;
8824 int err = 0;
8825
8826 assert(p != NULL);
8827
8828 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
8829 err = EINVAL;
8830 goto out;
8831 }
8832
8833 buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);
8834
8835 if (ubuflen != 0) {
8836 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
8837 err = ERANGE;
8838 goto out;
8839 }
8840 kq_ids = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
8841 if (!kq_ids) {
8842 err = ENOMEM;
8843 goto out;
8844 }
8845 }
8846
8847 kqhash_lock(fdp);
8848
8849 if (fdp->fd_kqhashmask > 0) {
8850 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
8851 struct kqworkloop *kqwl;
8852
8853 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
8854 /* report the number of kqueues, even if they don't all fit */
8855 if (nkqueues < buflen) {
8856 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
8857 }
8858 nkqueues++;
8859 }
8860 }
8861 }
8862
8863 kqhash_unlock(fdp);
8864
8865 if (kq_ids) {
8866 size_t copysize;
8867 if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
8868 err = ERANGE;
8869 goto out;
8870 }
8871
8872 assert(ubufsize >= copysize);
8873 err = copyout(kq_ids, ubuf, copysize);
8874 }
8875
8876 out:
8877 if (kq_ids) {
8878 kheap_free(KHEAP_TEMP, kq_ids, bufsize);
8879 }
8880
8881 if (!err) {
8882 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
8883 }
8884 return err;
8885 }
8886
8887 int
8888 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8889 uint32_t ubufsize, int32_t *size_out)
8890 {
8891 proc_t p = (proc_t)proc;
8892 struct kqworkloop *kqwl;
8893 int err = 0;
8894 struct kqueue_dyninfo kqdi = { };
8895
8896 assert(p != NULL);
8897
8898 if (ubufsize < sizeof(struct kqueue_info)) {
8899 return ENOBUFS;
8900 }
8901
8902 kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
8903 if (!kqwl) {
8904 return ESRCH;
8905 }
8906
8907 /*
8908 * backward compatibility: allow the argument to this call to only be
8909 * a struct kqueue_info
8910 */
8911 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
8912 ubufsize = sizeof(struct kqueue_dyninfo);
8913 err = fill_kqueue_dyninfo(kqwl, &kqdi);
8914 } else {
8915 ubufsize = sizeof(struct kqueue_info);
8916 err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
8917 }
8918 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
8919 *size_out = ubufsize;
8920 }
8921 kqworkloop_release(kqwl);
8922 return err;
8923 }
8924
8925 int
8926 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
8927 uint32_t ubufsize, int32_t *nknotes_out)
8928 {
8929 proc_t p = (proc_t)proc;
8930 struct kqworkloop *kqwl;
8931 int err;
8932
8933 kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
8934 if (!kqwl) {
8935 return ESRCH;
8936 }
8937
8938 err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
8939 kqworkloop_release(kqwl);
8940 return err;
8941 }
8942
8943 int
8944 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
8945 uint32_t bufsize, int32_t *retval)
8946 {
8947 struct knote *kn;
8948 int i;
8949 int err = 0;
8950 struct filedesc *fdp = p->p_fd;
8951 unsigned long nknotes = 0;
8952 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
8953 struct kevent_extinfo *kqext = NULL;
8954
8955 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
8956 buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
8957
8958 kqext = kheap_alloc(KHEAP_TEMP,
8959 buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
8960 if (kqext == NULL) {
8961 err = ENOMEM;
8962 goto out;
8963 }
8964
8965 proc_fdlock(p);
8966 for (i = 0; i < fdp->fd_knlistsize; i++) {
8967 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
8968 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8969 }
8970 proc_fdunlock(p);
8971
8972 if (fdp->fd_knhashmask != 0) {
8973 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
8974 knhash_lock(fdp);
8975 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
8976 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
8977 knhash_unlock(fdp);
8978 }
8979 }
8980
8981 assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
8982 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
8983
8984 out:
8985 if (kqext) {
8986 kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo));
8987 kqext = NULL;
8988 }
8989
8990 if (!err) {
8991 *retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
8992 }
8993 return err;
8994 }
8995
8996 static unsigned int
8997 klist_copy_udata(struct klist *list, uint64_t *buf,
8998 unsigned int buflen, unsigned int nknotes)
8999 {
9000 struct knote *kn;
9001 SLIST_FOREACH(kn, list, kn_link) {
9002 if (nknotes < buflen) {
9003 /*
9004 * kevent_register will always set kn_udata atomically
9005 * so that we don't have to take any kqlock here.
9006 */
9007 buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
9008 }
9009 /* we return total number of knotes, which may be more than requested */
9010 nknotes++;
9011 }
9012
9013 return nknotes;
9014 }
9015
9016 int
9017 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
9018 {
9019 proc_t p = (proc_t)proc;
9020 struct filedesc *fdp = p->p_fd;
9021 unsigned int nuptrs = 0;
9022 unsigned int buflen = bufsize / sizeof(uint64_t);
9023 struct kqworkloop *kqwl;
9024
9025 if (buflen > 0) {
9026 assert(buf != NULL);
9027 }
9028
9029 proc_fdlock(p);
9030 for (int i = 0; i < fdp->fd_knlistsize; i++) {
9031 nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
9032 }
9033 proc_fdunlock(p);
9034
9035 knhash_lock(fdp);
9036 if (fdp->fd_knhashmask != 0) {
9037 for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
9038 nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9039 }
9040 }
9041 knhash_unlock(fdp);
9042
9043 kqhash_lock(fdp);
9044 if (fdp->fd_kqhashmask != 0) {
9045 for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
9046 LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9047 if (nuptrs < buflen) {
9048 buf[nuptrs] = kqwl->kqwl_dynamicid;
9049 }
9050 nuptrs++;
9051 }
9052 }
9053 }
9054 kqhash_unlock(fdp);
9055
9056 return (int)nuptrs;
9057 }
9058
9059 static void
9060 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9061 {
9062 uint64_t ast_addr;
9063 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9064 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9065 uint32_t ast_flags32 = 0;
9066 uint64_t ast_flags64 = 0;
9067 struct uthread *ut = get_bsdthread_info(thread);
9068
9069 if (ut->uu_kqr_bound != NULL) {
9070 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9071 }
9072
9073 if (ast_flags64 == 0) {
9074 return;
9075 }
9076
9077 if (!(p->p_flag & P_LP64)) {
9078 ast_flags32 = (uint32_t)ast_flags64;
9079 assert(ast_flags64 < 0x100000000ull);
9080 }
9081
9082 ast_addr = thread_rettokern_addr(thread);
9083 if (ast_addr == 0) {
9084 return;
9085 }
9086
9087 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9088 (user_addr_t)ast_addr,
9089 user_addr_size) != 0) {
9090 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9091 "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
9092 }
9093 }
9094
9095 void
9096 kevent_ast(thread_t thread, uint16_t bits)
9097 {
9098 proc_t p = current_proc();
9099
9100 if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9101 workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
9102 }
9103 if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9104 kevent_set_return_to_kernel_user_tsd(p, thread);
9105 }
9106 }
9107
9108 #if DEVELOPMENT || DEBUG
9109
9110 #define KEVENT_SYSCTL_BOUND_ID 1
9111
9112 static int
9113 kevent_sysctl SYSCTL_HANDLER_ARGS
9114 {
9115 #pragma unused(oidp, arg2)
9116 uintptr_t type = (uintptr_t)arg1;
9117 uint64_t bound_id = 0;
9118
9119 if (type != KEVENT_SYSCTL_BOUND_ID) {
9120 return EINVAL;
9121 }
9122
9123 if (req->newptr) {
9124 return EINVAL;
9125 }
9126
9127 struct uthread *ut = get_bsdthread_info(current_thread());
9128 if (!ut) {
9129 return EFAULT;
9130 }
9131
9132 workq_threadreq_t kqr = ut->uu_kqr_bound;
9133 if (kqr) {
9134 if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
9135 bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
9136 } else {
9137 bound_id = -1;
9138 }
9139 }
9140
9141 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9142 }
9143
9144 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9145 "kevent information");
9146
9147 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9148 CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9149 (void *)KEVENT_SYSCTL_BOUND_ID,
9150 sizeof(kqueue_id_t), kevent_sysctl, "Q",
9151 "get the ID of the bound kqueue");
9152
9153 #endif /* DEVELOPMENT || DEBUG */