/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_event.c       1.0 (3/31/2000)
 */
#include <machine/atomic.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h> // SYS_* constants
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode_internal.h>
#include <sys/proc_info.h>
#include <sys/codesign.h>
#include <sys/pthread_shims.h>
#include <sys/kdebug.h>
#include <pexpert/pexpert.h>

#include <kern/locks.h>
#include <kern/clock.h>
#include <kern/cpu_data.h>
#include <kern/policy_internal.h>
#include <kern/thread_call.h>
#include <kern/sched_prim.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/ast.h>
#include <kern/thread.h>
#include <kern/kcdata.h>

#include <pthread/priority_private.h>
#include <pthread/workqueue_syscalls.h>
#include <pthread/workqueue_internal.h>
#include <libkern/libkern.h>

#include "net/net_str_id.h"

#include <mach/task.h>
#include <libkern/section_keywords.h>

#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif

#if DEVELOPMENT || DEBUG
#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
TUNABLE(uint32_t, kevent_debug_flags, "kevent_debug", 0);
#endif

static LCK_GRP_DECLARE(kq_lck_grp, "kqueue");
SECURITY_READ_ONLY_EARLY(vm_packing_params_t) kn_kq_packing_params =
    VM_PACKING_PARAMS(KNOTE_KQ_PACKED);

extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
extern int cansignal(struct proc *, kauth_cred_t, struct proc *, int); /* bsd/kern/kern_sig.c */

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

#define KQ_EVENT        NO_EVENT64

static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    vfs_context_t ctx);
static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    struct kevent_qos_s *kev);
static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);

static const struct fileops kqueueops = {
	.fo_type     = DTYPE_KQUEUE,
	.fo_read     = fo_no_read,
	.fo_write    = fo_no_write,
	.fo_ioctl    = fo_no_ioctl,
	.fo_select   = kqueue_select,
	.fo_close    = kqueue_close,
	.fo_drain    = kqueue_drain,
	.fo_kqfilter = kqueue_kqfilter,
};

static inline int kevent_modern_copyout(struct kevent_qos_s *, user_addr_t *);
static int kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int result);
static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
    thread_continue_t cont, struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_cleanup(struct knote *kn);

static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
static void kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t, kq_index_t qos, int flags);

static void kqworkq_unbind(proc_t p, workq_threadreq_t);
static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, workq_threadreq_t, thread_t thread);
static workq_threadreq_t kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);

static void kqworkloop_unbind(struct kqworkloop *kwql);

enum kqwl_unbind_locked_mode {
	KQWL_OVERRIDE_DROP_IMMEDIATELY,
	KQWL_OVERRIDE_DROP_DELAYED,
};
static void kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread,
    enum kqwl_unbind_locked_mode how);
static void kqworkloop_unbind_delayed_override_drop(thread_t thread);
static kq_index_t kqworkloop_override(struct kqworkloop *kqwl);
static void kqworkloop_set_overcommit(struct kqworkloop *kqwl);
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqwl_wakeup_indexes field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	KQWL_UTQ_PARKING,
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);

static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);
static int kq_add_knote(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, struct proc *p);
static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq,
    struct kevent_qos_s *kev, bool is_fd, struct proc *p);

static void knote_activate(kqueue_t kqu, struct knote *kn, int result);
static void knote_dequeue(kqueue_t kqu, struct knote *kn);

static void knote_apply_touch(kqueue_t kqu, struct knote *kn,
    struct kevent_qos_s *kev, int result);
static void knote_suppress(kqueue_t kqu, struct knote *kn);
static void knote_unsuppress(kqueue_t kqu, struct knote *kn);
static void knote_drop(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc);

// both these functions may dequeue the knote and it is up to the caller
// to enqueue the knote back
static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
static void knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp);

static ZONE_DECLARE(knote_zone, "knote zone",
    sizeof(struct knote), ZC_CACHING | ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(kqfile_zone, "kqueue file zone",
    sizeof(struct kqfile), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(kqworkq_zone, "kqueue workq zone",
    sizeof(struct kqworkq), ZC_ZFREE_CLEARMEM);
static ZONE_DECLARE(kqworkloop_zone, "kqueue workloop zone",
    sizeof(struct kqworkloop), ZC_CACHING | ZC_ZFREE_CLEARMEM);

#define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))

static int filt_no_attach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_no_detach(struct knote *kn);
static int filt_bad_event(struct knote *kn, long hint);
static int filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_bad_process(struct knote *kn, struct kevent_qos_s *kev);

SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach  = filt_no_attach,
	.f_detach  = filt_no_detach,
	.f_event   = filt_bad_event,
	.f_touch   = filt_bad_touch,
	.f_process = filt_bad_process,
};

#if CONFIG_MEMORYSTATUS
extern const struct filterops memorystatus_filtops;
#endif /* CONFIG_MEMORYSTATUS */
extern const struct filterops fs_filtops;
extern const struct filterops sig_filtops;
extern const struct filterops machport_filtops;
extern const struct filterops pipe_nfiltops;
extern const struct filterops pipe_rfiltops;
extern const struct filterops pipe_wfiltops;
extern const struct filterops ptsd_kqops;
extern const struct filterops ptmx_kqops;
extern const struct filterops soread_filtops;
extern const struct filterops sowrite_filtops;
extern const struct filterops sock_filtops;
extern const struct filterops soexcept_filtops;
extern const struct filterops spec_filtops;
extern const struct filterops bpfread_filtops;
extern const struct filterops necp_fd_rfiltops;
extern const struct filterops fsevent_filtops;
extern const struct filterops vnode_filtops;
extern const struct filterops tty_filtops;

const static struct filterops file_filtops;
const static struct filterops kqread_filtops;
const static struct filterops proc_filtops;
const static struct filterops timer_filtops;
const static struct filterops user_filtops;
const static struct filterops workloop_filtops;

/*
 * Rules for adding new filters to the system:
 * Public filters:
 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
 *   in the exported section of the header
 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
 *   of the Public Filters section in the array.
 * Private filters:
 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
 *   in the XNU_KERNEL_PRIVATE section of the header
 * - Update the EVFILTID_MAX value to reflect the new addition
 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
 *   the Private filters section of the array.
 */
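
/*
 * Editor's illustrative sketch (not part of the original source): following
 * the rules above, a hypothetical new public filter would be wired up roughly
 * like this.  The "EVFILT_EXAMPLE" name, its numeric value, and the
 * example_filtops symbol are all made up for illustration only.
 *
 *	// bsd/sys/event.h, exported section:
 *	//   #define EVFILT_EXAMPLE   (-18)      // next free negative value
 *	//   #define EVFILT_SYSCOUNT  18         // bumped to cover it
 *
 *	// this file:
 *	//   extern const struct filterops example_filtops;
 *	//   ...and, at the end of the Public Filters section of sysfilt_ops:
 *	//   [~EVFILT_EXAMPLE]  = &example_filtops,
 */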
static_assert(EVFILTID_MAX < UINT8_MAX, "kn_filtid expects this to be true");
static const struct filterops * const sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ]         = &file_filtops,
	[~EVFILT_WRITE]        = &file_filtops,
	[~EVFILT_AIO]          = &bad_filtops,
	[~EVFILT_VNODE]        = &file_filtops,
	[~EVFILT_PROC]         = &proc_filtops,
	[~EVFILT_SIGNAL]       = &sig_filtops,
	[~EVFILT_TIMER]        = &timer_filtops,
	[~EVFILT_MACHPORT]     = &machport_filtops,
	[~EVFILT_FS]           = &fs_filtops,
	[~EVFILT_USER]         = &user_filtops,
	[~EVFILT_UNUSED_11]    = &bad_filtops,
	[~EVFILT_VM]           = &bad_filtops,
	[~EVFILT_SOCK]         = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS] = &bad_filtops,
#endif
	[~EVFILT_EXCEPT]       = &file_filtops,
	[~EVFILT_WORKLOOP]     = &workloop_filtops,

	/* Private filters */
	[EVFILTID_KQREAD]      = &kqread_filtops,
	[EVFILTID_PIPE_N]      = &pipe_nfiltops,
	[EVFILTID_PIPE_R]      = &pipe_rfiltops,
	[EVFILTID_PIPE_W]      = &pipe_wfiltops,
	[EVFILTID_PTSD]        = &ptsd_kqops,
	[EVFILTID_SOREAD]      = &soread_filtops,
	[EVFILTID_SOWRITE]     = &sowrite_filtops,
	[EVFILTID_SCK]         = &sock_filtops,
	[EVFILTID_SOEXCEPT]    = &soexcept_filtops,
	[EVFILTID_SPEC]        = &spec_filtops,
	[EVFILTID_BPFREAD]     = &bpfread_filtops,
	[EVFILTID_NECP_FD]     = &necp_fd_rfiltops,
	[EVFILTID_FSEVENT]     = &fsevent_filtops,
	[EVFILTID_VN]          = &vnode_filtops,
	[EVFILTID_TTY]         = &tty_filtops,
	[EVFILTID_PTMX]        = &ptmx_kqops,

	/* fake filter for detached knotes, keep last */
	[EVFILTID_DETACHED]    = &bad_filtops,
};

/* waitq prepost callback */
void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook);

static inline bool
kqr_thread_bound(workq_threadreq_t kqr)
{
	return kqr->tr_state == WORKQ_TR_STATE_BOUND;
}

static inline bool
kqr_thread_requested_pending(workq_threadreq_t kqr)
{
	workq_tr_state_t tr_state = kqr->tr_state;
	return tr_state > WORKQ_TR_STATE_IDLE && tr_state < WORKQ_TR_STATE_BOUND;
}

static inline bool
kqr_thread_requested(workq_threadreq_t kqr)
{
	return kqr->tr_state != WORKQ_TR_STATE_IDLE;
}

static inline thread_t
kqr_thread_fast(workq_threadreq_t kqr)
{
	assert(kqr_thread_bound(kqr));
	return kqr->tr_thread;
}

static inline thread_t
kqr_thread(workq_threadreq_t kqr)
{
	return kqr_thread_bound(kqr) ? kqr->tr_thread : THREAD_NULL;
}

static inline struct kqworkloop *
kqr_kqworkloop(workq_threadreq_t kqr)
{
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		return __container_of(kqr, struct kqworkloop, kqwl_request);
	}
	return NULL;
}

static inline kqueue_t
kqr_kqueue(proc_t p, workq_threadreq_t kqr)
{
	kqueue_t kqu;

	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqu.kqwl = kqr_kqworkloop(kqr);
	} else {
		kqu.kqwq = p->p_fd->fd_wqkqueue;
		assert(kqr >= kqu.kqwq->kqwq_request &&
		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
	}
	return kqu;
}

/*
 * kqueue/note lock implementations
 *
 *	The kqueue lock guards the kq state, the state of its queues,
 *	and the kqueue-aware status and locks of individual knotes.
 *
 *	The kqueue workq lock is used to protect state guarding the
 *	interaction of the kqueue with the workq. This state cannot
 *	be guarded by the kq lock - as it needs to be taken when we
 *	already have the waitq set lock held (during the waitq hook
 *	callback). It might be better to use the waitq lock itself
 *	for this, but the IRQ requirements make that difficult.
 *
 *	Knote flags, filter flags, and associated data are protected
 *	by the underlying object lock - and are only ever looked at
 *	by calling the filter to get a [consistent] snapshot of that
 *	data.
 */

static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}

static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}

static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}

static inline void
knhash_lock(struct filedesc *fdp)
{
	lck_mtx_lock(&fdp->fd_knhashlock);
}

static inline void
knhash_unlock(struct filedesc *fdp)
{
	lck_mtx_unlock(&fdp->fd_knhashlock);
}

/* wait event for knote locks */
static inline event_t
knote_lock_wev(struct knote *kn)
{
	return (event_t)(&kn->kn_hook);
}

/* wait event for kevent_register_wait_* */
static inline event64_t
knote_filt_wev64(struct knote *kn)
{
	/* kdp_workloop_sync_wait_find_owner knows about this */
	return CAST_EVENT64_T(kn);
}

/* wait event for knote_post/knote_drop */
static inline event64_t
knote_post_wev64(struct knote *kn)
{
	return CAST_EVENT64_T(&kn->kn_kevent);
}

/*!
 * @function knote_has_qos
 *
 * @brief
 * Whether the knote has a regular QoS.
 *
 * @discussion
 * kn_qos_override is:
 * - 0 on kqfiles
 * - THREAD_QOS_LAST for special buckets (stayactive, manager)
 *
 * Other values mean the knote participates to QoS propagation.
 */
static inline bool
knote_has_qos(struct knote *kn)
{
	return kn->kn_qos_override > 0 && kn->kn_qos_override < THREAD_QOS_LAST;
}

#pragma mark knote locks

/*
 * Enum used by the knote_lock_* functions.
 *
 * KNOTE_KQ_LOCK_ALWAYS
 *   The function will always return with the kq lock held.
 *
 * KNOTE_KQ_LOCK_ON_SUCCESS
 *   The function will return with the kq lock held if it was successful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_LOCK_ON_FAILURE
 *   The function will return with the kq lock held if it was unsuccessful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_UNLOCK
 *   The function returns with the kq unlocked.
 */
enum kqlocking {
	KNOTE_KQ_LOCK_ALWAYS,
	KNOTE_KQ_LOCK_ON_SUCCESS,
	KNOTE_KQ_LOCK_ON_FAILURE,
	KNOTE_KQ_UNLOCK,
};

static struct knote_lock_ctx *
knote_lock_ctx_find(kqueue_t kqu, struct knote *kn)
{
	struct knote_lock_ctx *ctx;
	LIST_FOREACH(ctx, &kqu.kq->kq_knlocks, knlc_link) {
		if (ctx->knlc_knote == kn) {
			return ctx;
		}
	}
	panic("knote lock context not found: %p", kn);
}

/* slowpath of knote_lock() */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	struct knote_lock_ctx *owner_lc;
	struct uthread *uth = current_uthread();
	wait_result_t wr;

	kqlock_held(kqu);

	owner_lc = knote_lock_ctx_find(kqu, kn);
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif
	owner_lc->knlc_waiters++;

	/*
	 * Make our lock context visible to knote_unlock()
	 */
	uth->uu_knlock = knlc;

	wr = lck_spin_sleep_with_inheritor(&kqu.kq->kq_lock, LCK_SLEEP_UNLOCK,
	    knote_lock_wev(kn), owner_lc->knlc_thread,
	    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);

	if (wr == THREAD_RESTART) {
		/*
		 * We haven't been woken up by knote_unlock() but knote_unlock_cancel.
		 * We need to cleanup the state since no one did.
		 */
		uth->uu_knlock = NULL;
#if DEBUG || DEVELOPMENT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif

		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kqu);
		}
		return false;
	} else {
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
			kqlock(kqu);
#if DEBUG || DEVELOPMENT
			/*
			 * This state is set under the lock so we can't
			 * really assert this unless we hold the lock.
			 */
			assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif
		}
		return true;
	}
}

/*
 * Attempts to take the "knote" lock.
 *
 * Called with the kqueue lock held.
 *
 * Returns true if the knote lock is acquired, false if it has been dropped
 */
static bool __result_use_check
knote_lock(kqueue_t kqu, struct knote *kn, struct knote_lock_ctx *knlc,
    enum kqlocking kqlocking)
{
	kqlock_held(kqu);

#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	knlc->knlc_waiters = 0;

	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kqu, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, knlc, knlc_link);
	kn->kn_status |= KN_LOCKED;
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kqu);
	}
	return true;
}

/*
 * Unlocks a knote successfully locked with knote_lock().
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* mode.
 */
static void
knote_unlock(kqueue_t kqu, struct knote *kn,
    struct knote_lock_ctx *knlc, enum kqlocking kqlocking)
{
	kqlock_held(kqu);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif

	LIST_REMOVE(knlc, knlc_link);

	if (knlc->knlc_waiters) {
		thread_t thread = THREAD_NULL;

		wakeup_one_with_inheritor(knote_lock_wev(kn), THREAD_AWAKENED,
		    LCK_WAKE_DEFAULT, &thread);

		/*
		 * knote_lock_slow() publishes the lock context of waiters
		 * in uthread::uu_knlock.
		 *
		 * Reach out and make this context the new owner.
		 */
		struct uthread *ut = get_bsdthread_info(thread);
		struct knote_lock_ctx *next_owner_lc = ut->uu_knlock;

		assert(next_owner_lc->knlc_knote == kn);
		next_owner_lc->knlc_waiters = knlc->knlc_waiters - 1;
		LIST_INSERT_HEAD(&kqu.kq->kq_knlocks, next_owner_lc, knlc_link);
#if DEBUG || DEVELOPMENT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
		ut->uu_knlock = NULL;
		thread_deallocate_safe(thread);
	} else {
		kn->kn_status &= ~KN_LOCKED;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && !(kn->kn_status & KN_POSTING)) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (kqlocking == KNOTE_KQ_UNLOCK) {
		kqunlock(kqu);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}

/*
 * Aborts all waiters for a knote lock, and unlock the knote.
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue unlocked.
 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_link);
	kn->kn_status &= ~KN_LOCKED;
	kqunlock(kq);

	if (knlc->knlc_waiters) {
		wakeup_all_with_inheritor(knote_lock_wev(kn), THREAD_RESTART);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}

/*
 * Call the f_event hook of a given filter.
 *
 * Takes a use count to protect against concurrent drops.
 */
static void
knote_post(struct knote *kn, long hint)
{
	struct kqueue *kq = knote_get_kq(kn);
	int dropping, result;

	kqlock(kq);

	if (__improbable(kn->kn_status & (KN_DROPPING | KN_VANISHED))) {
		return kqunlock(kq);
	}

	if (__improbable(kn->kn_status & KN_POSTING)) {
		panic("KNOTE() called concurrently on knote %p", kn);
	}

	kn->kn_status |= KN_POSTING;

	kqunlock(kq);
	result = filter_call(knote_fops(kn), f_event(kn, hint));
	kqlock(kq);

	dropping = (kn->kn_status & KN_DROPPING);

	if (!dropping && (result & FILTER_ACTIVE)) {
		knote_activate(kq, kn, result);
	}

	if ((kn->kn_status & KN_LOCKED) == 0) {
		/*
		 * There's no other f_* call in flight, we can leave QoS "Merge" mode.
		 *
		 * See knote_adjust_qos()
		 */
		kn->kn_status &= ~(KN_POSTING | KN_MERGE_QOS);
	} else {
		kn->kn_status &= ~KN_POSTING;
	}

	if (__improbable(dropping)) {
		waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, knote_post_wev64(kn),
		    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	}

	kqunlock(kq);
}

/*
 * Called by knote_drop() to wait for the last f_event() caller to be done.
 *
 *	- kq locked at entry
 *	- kq unlocked at exit
 */
static void
knote_wait_for_post(struct kqueue *kq, struct knote *kn)
{
	wait_result_t wr = THREAD_NOT_WAITING;

	kqlock_held(kq);

	assert(kn->kn_status & KN_DROPPING);

	if (kn->kn_status & KN_POSTING) {
		wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
		    knote_post_wev64(kn), THREAD_UNINT | THREAD_WAIT_NOREPORT,
		    TIMEOUT_WAIT_FOREVER);
	}
	kqunlock(kq);

	if (wr == THREAD_WAITING) {
		thread_block(THREAD_CONTINUE_NULL);
	}
}

#pragma mark knote helpers for filters

static inline void
knote_set_error(struct knote *kn, int error)
{
	kn->kn_flags |= EV_ERROR;
	kn->kn_sdata = error;
}

static inline int64_t
knote_low_watermark(const struct knote *kn)
{
	return (kn->kn_sfflags & NOTE_LOWAT) ? kn->kn_sdata : 1;
}

/*!
 * @function knote_fill_kevent_with_sdata
 *
 * @brief
 * Fills in a kevent from the current content of a knote.
 *
 * @discussion
 * This is meant to be called from filter's f_event hooks.
 * The kevent data is filled with kn->kn_sdata.
 *
 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
 *
 * Using knote_fill_kevent is typically preferred.
 */
static void
knote_fill_kevent_with_sdata(struct knote *kn, struct kevent_qos_s *kev)
{
#define knote_assert_aliases(name1, offs1, name2) \
	static_assert(offsetof(struct kevent_qos_s, name1) + offs1 == \
	    offsetof(struct kevent_internal_s, name2), \
	    "kevent_qos_s::" #name1 " and kevent_internal_s::" #name2 " need to alias")
	/*
	 * All the code makes assumptions on these aliasing,
	 * so make sure we fail the build if we ever ever ever break them.
	 */
	knote_assert_aliases(ident, 0, kei_ident);
#ifdef __LITTLE_ENDIAN__
	knote_assert_aliases(filter, 0, kei_filter);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filtid);  // non trivial overlap
#else
	knote_assert_aliases(filter, 0, kei_filtid);  // non trivial overlap
	knote_assert_aliases(filter, 1, kei_filter);  // non trivial overlap
#endif
	knote_assert_aliases(flags, 0, kei_flags);
	knote_assert_aliases(qos, 0, kei_qos);
	knote_assert_aliases(udata, 0, kei_udata);
	knote_assert_aliases(fflags, 0, kei_fflags);
	knote_assert_aliases(xflags, 0, kei_sfflags); // non trivial overlap
	knote_assert_aliases(data, 0, kei_sdata);     // non trivial overlap
	knote_assert_aliases(ext, 0, kei_ext);
#undef knote_assert_aliases

	/*
	 * Fix the differences between kevent_qos_s and kevent_internal_s:
	 * - xflags is where kn_sfflags lives, we need to zero it
	 * - fixup the high bits of `filter` where kn_filtid lives
	 */
	*kev = *(struct kevent_qos_s *)&kn->kn_kevent;
	kev->xflags = 0;
	kev->filter |= 0xff00;
	if (kn->kn_flags & EV_CLEAR) {
		kn->kn_fflags = 0;
	}
}

/*!
 * @function knote_fill_kevent
 *
 * @brief
 * Fills in a kevent from the current content of a knote.
 *
 * @discussion
 * This is meant to be called from filter's f_event hooks.
 * The kevent data is filled with the passed in data.
 *
 * kn->kn_fflags is cleared if kn->kn_flags has EV_CLEAR set.
 */
static void
knote_fill_kevent(struct knote *kn, struct kevent_qos_s *kev, int64_t data)
{
	knote_fill_kevent_with_sdata(kn, kev);
	kev->filter = kn->kn_filter;
	kev->data = data;
}

#pragma mark file_filtops

static int
filt_fileattach(struct knote *kn, struct kevent_qos_s *kev)
{
	return fo_kqfilter(kn->kn_fp, kn, kev);
}

SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_attach = filt_fileattach,
};

#pragma mark kqread_filtops

#define f_flag fp_glob->fg_flag
#define f_ops fp_glob->fg_ops
#define f_data fp_glob->fg_data
#define f_lflags fp_glob->fg_lflags

static void
filt_kqdetach(struct knote *kn)
{
	struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
	struct kqueue *kq = &kqf->kqf_kqueue;

	kqlock(kq);
	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
	kqunlock(kq);
}

static int
filt_kqueue(struct knote *kn, __unused long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	return kq->kq_count > 0;
}

static int
filt_kqtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
	int res;

	kqlock(kq);
	res = (kq->kq_count > 0);
	kqunlock(kq);

	return res;
}

static int
filt_kqprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
	int res = 0;

	kqlock(kq);
	if (kq->kq_count) {
		knote_fill_kevent(kn, kev, kq->kq_count);
		res = 1;
	}
	kqunlock(kq);

	return res;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_detach  = filt_kqdetach,
	.f_event   = filt_kqueue,
	.f_touch   = filt_kqtouch,
	.f_process = filt_kqprocess,
};

#pragma mark proc_filtops

static int
filt_procattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct proc *p;

	assert(PID_MAX < NOTE_PDATAMASK);

	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find((int)kn->kn_id);
	if (p == PROC_NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const uint32_t NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits) {
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid) {
				break;  /* parent => ok */
			}
			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid)) {
				break;  /* parent-in-waiting => ok */
			}
			if (cansignal(current_proc(), kauth_cred_get(), p, SIGKILL)) {
				break;  /* allowed to signal => ok */
			}

			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);
	}

	kn->kn_proc = p;
	kn->kn_flags |= EV_CLEAR;       /* automatically set */
	kn->kn_sdata = 0;               /* incoming data is ignored */

	proc_klist_lock();

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return 0;
}

/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to. In that case,
 * the pointer to the process will have already been nulled out.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	proc_klist_lock();

	p = kn->kn_proc;
	if (p != PROC_NULL) {
		kn->kn_proc = PROC_NULL;
		KNOTE_DETACH(&p->p_klist, kn);
	}

	proc_klist_unlock();
}

static int
filt_procevent(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_procevent
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_proc->p_oppid != 0)
		    && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event) {
		kn->kn_fflags |= event;
	}

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop

	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_hook32. Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			kn->kn_fflags |= NOTE_EXITSTATUS;
			kn->kn_hook32 |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_MEMORY;
				switch (kn->kn_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_hook32 |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((kn->kn_proc->p_csflags &
			    CS_KILLED) != 0) {
				kn->kn_hook32 |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return kn->kn_fflags != 0;
}

static int
filt_proctouch(struct knote *kn, struct kevent_qos_s *kev)
{
	int res;

	proc_klist_lock();

	/* accept new filter flags and mask off output events no longer interesting */
	kn->kn_sfflags = kev->fflags;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kn->kn_sfflags;

	res = (kn->kn_fflags != 0);

	proc_klist_unlock();

	return res;
}

static int
filt_procprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	int res = 0;

	proc_klist_lock();
	if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, kn->kn_hook32);
		kn->kn_hook32 = 0;
		res = 1;
	}
	proc_klist_unlock();
	return res;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach  = filt_procattach,
	.f_detach  = filt_procdetach,
	.f_event   = filt_procevent,
	.f_touch   = filt_proctouch,
	.f_process = filt_procprocess,
};

#pragma mark timer_filtops

struct filt_timer_params {
	uint64_t deadline;      /* deadline in abs/cont time
	                         *  (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;        /* leeway in abstime, or 0 if none */
	uint64_t interval;      /* interval in abstime or 0 if non-repeating timer */
};

/*
 * Values stored in the knote at rest (using Mach absolute time units)
 *
 * kn->kn_thcall        where the thread_call object is stored
 * kn->kn_ext[0]        next deadline or 0 if immediate expiration
 * kn->kn_ext[1]        leeway value
 * kn->kn_sdata         interval timer: the interval
 *                      absolute/deadline timer: 0
 * kn->kn_hook32        timer state (with gencount)
 *
 * TIMER_IDLE:
 *   The timer has either never been scheduled or been cancelled.
 *   It is safe to schedule a new one in this state.
 *
 * TIMER_ARMED:
 *   The timer has been scheduled
 *
 * TIMER_FIRED:
 *   The timer has fired and an event needs to be delivered.
 *   When in this state, the callout may still be running.
 *
 * TIMER_IMMEDIATE:
 *   The timer has fired at registration time, and the callout was never
 *   dispatched.
 */
#define TIMER_IDLE       0x0
#define TIMER_ARMED      0x1
#define TIMER_FIRED      0x2
#define TIMER_IMMEDIATE  0x3
#define TIMER_STATE_MASK 0x3
#define TIMER_GEN_INC    0x4
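
/*
 * Editorial note (not part of the original source): the checks below merely
 * illustrate the kn_hook32 encoding documented above -- the low two bits hold
 * the timer state, and everything above them is a generation counter bumped
 * by TIMER_GEN_INC on every arm.  For example, after its second arm a timer
 * in the ARMED state stores (2 * TIMER_GEN_INC) | TIMER_ARMED.
 */
static_assert(((2 * TIMER_GEN_INC) | TIMER_ARMED) == 0x9,
    "gencount is stored above TIMER_STATE_MASK");
static_assert((TIMER_IMMEDIATE & TIMER_STATE_MASK) == TIMER_IMMEDIATE,
    "TIMER_IMMEDIATE is representable in the state bits");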

static void
filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
{
	kn->kn_ext[0] = params->deadline;
	kn->kn_ext[1] = params->leeway;
	kn->kn_sdata  = params->interval;
}

/*
 * filt_timervalidate - process data from user
 *
 * Sets up the deadline, interval, and leeway from the provided user data
 *
 * Input:
 *      kn_sdata        timer deadline or interval time
 *      kn_sfflags      style of timer, unit of measurement
 *
 * Output:
 *      struct filter_timer_params to apply to the filter with
 *      filt_timer_set_params when changes are ready to be committed.
 *
 * Returns:
 *      EINVAL          Invalid user data parameters
 *      ERANGE          Various overflows with the parameters
 *
 * Called with timer filter lock held.
 */
static int
filt_timervalidate(const struct kevent_qos_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *      Absolute and interval take:
	 *              NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *      Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *      For interval, there is none
	 *      For absolute, defaults to the gettimeofday/calendar epoch
	 *      With NOTE_MACHTIME, uses mach_absolute_time()
	 *      With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *      Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *      Absolute is a forced one-shot timer which deletes on delivery
	 *      TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *      Interval fires at now + duration when it is set up
	 *      Absolute fires at now + difference between now walltime and passed in walltime
	 *      With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *      By default all three do not.
	 *      For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *      With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *              expires when mach_continuous_time() is > the passed in value.
	 */

	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	switch (kev->fflags & (NOTE_SECONDS | NOTE_USECONDS | NOTE_NSECONDS | NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0;
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		return EINVAL;
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns)) {
				return ERANGE;
			}

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				} else {
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				}
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */
		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns)) {
				return ERANGE;
			}

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		uint64_t deadline = 0;

		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME) {
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		} else {
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
		}

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return 0;
}

/*
 * filt_timerexpire - the timer callout routine
 */
static void
filt_timerexpire(void *knx, void *state_on_arm)
{
	struct knote *kn = knx;

	uint32_t state = (uint32_t)(uintptr_t)state_on_arm;
	uint32_t fired_state = state ^ TIMER_ARMED ^ TIMER_FIRED;

	if (os_atomic_cmpxchg(&kn->kn_hook32, state, fired_state, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kq, kn, FILTER_ACTIVE);
		kqunlock(kq);
	} else {
		/*
		 * The timer has been reprogrammed or canceled since it was armed,
		 * and this is a late firing for the timer, just ignore it.
		 */
	}
}

/*
 * Does this deadline need a timer armed for it, or has it expired?
 */
static bool
filt_timer_is_ready(struct knote *kn)
{
	uint64_t now, deadline = kn->kn_ext[0];

	if (deadline == 0) {
		return true;
	}

	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
		now = mach_continuous_time();
	} else {
		now = mach_absolute_time();
	}
	return deadline <= now;
}

/*
 * It is the responsibility of the caller to make sure the timer call
 * has completed or been cancelled properly prior to arming it.
 */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway   = kn->kn_ext[1];
	uint32_t state;

	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	if (filter_flags & NOTE_CRITICAL) {
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	} else if (filter_flags & NOTE_BACKGROUND) {
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	} else {
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
	}

	if (filter_flags & NOTE_LEEWAY) {
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;
	}

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME) {
		timer_flags |= THREAD_CALL_CONTINUOUS;
	}

	/*
	 * We increase the gencount, and setup the thread call with this expected
	 * state. It means that if there was a previous generation of the timer in
	 * flight that needs to be ignored, then 3 things are possible:
	 *
	 * - the timer fires first, filt_timerexpire() and sets the state to FIRED
	 *   but we clobber it with ARMED and a new gencount. The knote will still
	 *   be activated, but filt_timerprocess() which is serialized with this
	 *   call will not see the FIRED bit set and will not deliver an event.
	 *
	 * - this code runs first, but filt_timerexpire() comes second. Because it
	 *   knows an old gencount, it will debounce and not activate the knote.
	 *
	 * - filt_timerexpire() wasn't in flight yet, and thread_call_enter below
	 *   will just cancel it properly.
	 *
	 * This is important as userspace expects to never be woken up for past
	 * timers after filt_timertouch ran.
	 */
	state = os_atomic_load(&kn->kn_hook32, relaxed);
	state &= ~TIMER_STATE_MASK;
	state += TIMER_GEN_INC + TIMER_ARMED;
	os_atomic_store(&kn->kn_hook32, state, relaxed);

	thread_call_enter_delayed_with_leeway(kn->kn_thcall,
	    (void *)(uintptr_t)state, deadline, leeway, timer_flags);
}

/*
 * Mark a timer as "already fired" when it is being reprogrammed
 *
 * If there is a timer in flight, this will do a best effort at canceling it,
 * but will not wait. If the thread call was in flight, having set the
 * TIMER_IMMEDIATE bit will debounce a filt_timerexpire() racing with this
 * cancelation.
 */
static void
filt_timerfire_immediate(struct knote *kn)
{
	uint32_t state;

	static_assert(TIMER_IMMEDIATE == TIMER_STATE_MASK,
	    "validate that this atomic or will transition to IMMEDIATE");
	state = os_atomic_or_orig(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);

	if ((state & TIMER_STATE_MASK) == TIMER_ARMED) {
		thread_call_cancel(kn->kn_thcall);
	}
}

/*
 * Allocate a thread call for the knote's lifetime, and kick off the timer.
 */
static int
filt_timerattach(struct knote *kn, struct kevent_qos_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_thcall = callout;
	kn->kn_flags |= EV_CLEAR;
	os_atomic_store(&kn->kn_hook32, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE) {
		kn->kn_flags |= EV_ONESHOT;
	}

	if (filt_timer_is_ready(kn)) {
		os_atomic_store(&kn->kn_hook32, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}

/*
 * Shut down the timer if it's running, and free the callout.
 */
static void
filt_timerdetach(struct knote *kn)
{
	__assert_only boolean_t freed;

	/*
	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
	 * in flight.
	 */
	thread_call_cancel_wait(kn->kn_thcall);
	freed = thread_call_free(kn->kn_thcall);
	assert(freed);
}

/*
 * filt_timertouch - update timer knote with new user input
 *
 * Cancel and restart the timer based on new user data. When
 * the user picks up a knote, clear the count of how many timer
 * pops have gone off (in kn_data).
 */
static int
filt_timertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		filt_timerfire_immediate(kn);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}

/*
 * filt_timerprocess - query state of knote and snapshot event data
 *
 * Determine if the timer has fired in the past, snapshot the state
 * of the kevent for returning to user-space, and clear pending event
 * counters for the next time.
 */
static int
filt_timerprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t state = os_atomic_load(&kn->kn_hook32, relaxed);

	/*
	 * filt_timerprocess is serialized with any filter routine except for
	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
	 * transition, and on success, activates the knote.
	 *
	 * Hence, we don't need atomic modifications of the state, only to peek at
	 * whether we see any of the "FIRED" state, and if we do, it is safe to
	 * do simple state machine transitions.
	 */
	switch (state & TIMER_STATE_MASK) {
	case TIMER_IDLE:
	case TIMER_ARMED:
		/*
		 * This can happen if a touch resets a timer that had fired
		 * without being processed
		 */
		return 0;
	}

	os_atomic_store(&kn->kn_hook32, state & ~TIMER_STATE_MASK, relaxed);

	/*
	 * Copy out the interesting kevent state,
	 * but don't leak out the raw time calculations.
	 *
	 * TODO: potential enhancements - tell the user about:
	 *      - deadline to which this timer thought it was expiring
	 *      - return kn_sfflags in the fflags field so the client can know
	 *        under what flags the timer fired
	 */
	knote_fill_kevent(kn, kev, 1);
	kev->ext[0] = 0;
	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

	if (kn->kn_sdata != 0) {
		/*
		 * This is a 'repeating' timer, so we have to emit
		 * how many intervals expired between the arm
		 * and the process call.
		 *
		 * A very strange style of interface, because
		 * this could easily be done in the client...
		 */

		uint64_t now;

		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
			now = mach_continuous_time();
		} else {
			now = mach_absolute_time();
		}

		uint64_t first_deadline = kn->kn_ext[0];
		uint64_t interval_abs   = kn->kn_sdata;
		uint64_t orig_arm_time  = first_deadline - interval_abs;

		assert(now > orig_arm_time);
		assert(now > first_deadline);

		uint64_t elapsed = now - orig_arm_time;

		uint64_t num_fired = elapsed / interval_abs;

		/*
		 * To reach this code, we must have seen the timer pop
		 * and be in repeating mode, so therefore it must have been
		 * more than 'interval' time since the attach or last
		 * successful touch.
		 */
		assert(num_fired > 0);

		/* report how many intervals have elapsed to the user */
		kev->data = (int64_t)num_fired;

		/* We only need to re-arm the timer if it's not about to be destroyed */
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			/* fire at the end of the next interval */
			uint64_t new_deadline = first_deadline + num_fired * interval_abs;

			assert(new_deadline > now);

			kn->kn_ext[0] = new_deadline;

			/*
			 * This can't shortcut setting up the thread call, because
			 * knote_process deactivates EV_CLEAR knotes unconditionnally.
			 */
			filt_timerarm(kn);
		}
	}

	return FILTER_ACTIVE;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach         = filt_timerattach,
	.f_detach         = filt_timerdetach,
	.f_event          = filt_bad_event,
	.f_touch          = filt_timertouch,
	.f_process        = filt_timerprocess,
};
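
/*
 * Editor's usage sketch (not part of the original source): from userspace the
 * timer filter above is driven through kevent(2)/kevent64(2).  The snippet
 * below is a plausible, minimal registration of a 500ms repeating timer with
 * 10ms of leeway; the `kq` and `ident` identifiers are illustrative.
 *
 *	struct kevent64_s kev;
 *	// data and ext[1] (leeway) are both in the unit selected by fflags
 *	EV_SET64(&kev, ident, EVFILT_TIMER, EV_ADD | EV_ENABLE,
 *	    NOTE_USECONDS | NOTE_LEEWAY, 500000, 0, 10000, 0);
 *	kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
 */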

#pragma mark user_filtops

static int
filt_userattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	if (kn->kn_sfflags & NOTE_TRIGGER) {
		kn->kn_hook32 = FILTER_ACTIVE;
	} else {
		kn->kn_hook32 = 0;
	}
	return kn->kn_hook32;
}

static int
filt_usertouch(struct knote *kn, struct kevent_qos_s *kev)
{
	uint32_t ffctrl;
	int fflags;

	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
	fflags = kev->fflags & NOTE_FFLAGSMASK;
	switch (ffctrl) {
	case NOTE_FFNOP:
		break;
	case NOTE_FFAND:
		kn->kn_sfflags &= fflags;
		break;
	case NOTE_FFOR:
		kn->kn_sfflags |= fflags;
		break;
	case NOTE_FFCOPY:
		kn->kn_sfflags = fflags;
		break;
	}
	kn->kn_sdata = kev->data;

	if (kev->fflags & NOTE_TRIGGER) {
		kn->kn_hook32 = FILTER_ACTIVE;
	}
	return (int)kn->kn_hook32;
}

static int
filt_userprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	int result = (int)kn->kn_hook32;

	if (result) {
		/* EVFILT_USER returns the data that was passed in */
		knote_fill_kevent_with_sdata(kn, kev);
		kev->fflags = kn->kn_sfflags;
		if (kn->kn_flags & EV_CLEAR) {
			/* knote_fill_kevent cleared kn_fflags */
			kn->kn_hook32 = 0;
		}
	}

	return result;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach         = filt_userattach,
	.f_detach         = filt_no_detach,
	.f_event          = filt_bad_event,
	.f_touch          = filt_usertouch,
	.f_process        = filt_userprocess,
};
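
/*
 * Editor's usage sketch (not part of the original source): EVFILT_USER events
 * are entirely user-driven -- one kevent registers the knote, and a later one
 * carrying NOTE_TRIGGER fires it.  A plausible, minimal userspace sequence
 * (the `kq` and `ident` names are illustrative):
 *
 *	struct kevent kev;
 *	EV_SET(&kev, ident, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);        // register the user event
 *
 *	EV_SET(&kev, ident, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);        // fire it, e.g. from another thread
 */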

#pragma mark workloop_filtops

#define EPREEMPTDISABLED (-1)

static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_spin_lock(&kqwl->kqwl_statelock);
}

static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_spin_unlock(&kqwl->kqwl_statelock);
}

/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static inline bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	return kqr_thread_requested_pending(&kqwl->kqwl_request);
}

static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * binding to the workq should always happen through
	 * workq_kern_threadreq_update_inheritor()
	 */
	assert(!filt_wlturnstile_interlock_is_workq(kqwl));

	if ((inheritor = kqwl->kqwl_owner)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	} else if ((inheritor = kqr_thread(kqr))) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	}

	turnstile_update_inheritor(ts, inheritor, flags);
}

#define EVFILT_WORKLOOP_EFAULT_RETRY_COUNT 100
#define FILT_WLATTACH 0
#define FILT_WLTOUCH  1
#define FILT_WLDROP   2
1945 filt_wlupdate(struct kqworkloop
*kqwl
, struct knote
*kn
,
1946 struct kevent_qos_s
*kev
, kq_index_t qos_index
, int op
)
1948 user_addr_t uaddr
= CAST_USER_ADDR_T(kev
->ext
[EV_EXTIDX_WL_ADDR
]);
1949 workq_threadreq_t kqr
= &kqwl
->kqwl_request
;
1950 thread_t cur_owner
, new_owner
, extra_thread_ref
= THREAD_NULL
;
1951 kq_index_t cur_override
= THREAD_QOS_UNSPECIFIED
;
1952 int efault_retry
= EVFILT_WORKLOOP_EFAULT_RETRY_COUNT
;
1953 int action
= KQWL_UTQ_NONE
, error
= 0;
1954 bool wl_inheritor_updated
	    = false, needs_wake = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	struct turnstile *ts = TURNSTILE_NULL;

	new_owner = cur_owner = kqwl->kqwl_owner;

	/*
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);

		/* Update state as copied in. */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name,
				    PORT_TO_THREAD_IN_CURRENT_TASK);
				if (extra_thread_ref == THREAD_NULL) {
				new_owner = extra_thread_ref;

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;

	if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
		action = KQWL_UTQ_SET_QOS_INDEX;
	} else if (qos_index && kqr->tr_kq_qos_index != qos_index) {
		action = KQWL_UTQ_SET_QOS_INDEX;

	if (op == FILT_WLTOUCH) {
		/*
		 * Save off any additional fflags/data we just accepted,
		 * but only keep the last round of "update" bits we acted on, which
		 * helps debugging.
		 */
		kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
		kn->kn_sfflags |= kev->fflags;
		if (kev->fflags & NOTE_WL_SYNC_WAKE) {
			needs_wake = (kn->kn_thread != THREAD_NULL);
	} else if (op == FILT_WLDROP) {
		if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
		    NOTE_WL_SYNC_WAIT) {
			/*
			 * When deleting a SYNC_WAIT knote that hasn't been woken up
			 * explicitly, issue a wake up.
			 */
			kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
			needs_wake = (kn->kn_thread != THREAD_NULL);

	/*
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */
	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr_thread(kqr)) {
		new_owner = THREAD_NULL;

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transferred this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
			cur_override = kqworkloop_override(kqwl);
			/* override it before we drop the old */
			if (cur_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_kevent_override(new_owner, cur_override);
		if (kqr_thread_requested_pending(kqr)) {
			if (action == KQWL_UTQ_NONE) {
				action = KQWL_UTQ_REDRIVE_EVENTS;

		if (!kqr_thread_requested(kqr) && kqr->tr_kq_wakeup) {
			if (action == KQWL_UTQ_NONE) {
				action = KQWL_UTQ_REDRIVE_EVENTS;

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);

	ts = kqwl->kqwl_turnstile;
	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fall back to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, knote_filt_wev64(kn),
		    kn->kn_thread, THREAD_AWAKENED);
		if (op == FILT_WLATTACH || op == FILT_WLTOUCH) {
			disable_preemption();
			error = EPREEMPTDISABLED;

	/*
	 * Unlock and clean up various lingering references and things.
	 */
	filt_wlunlock(kqwl);

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr_thread(kqr), /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,
		.kev_ident = kev->ident,
		.error = (int16_t)error,
		.kev_flags = kev->flags,
		.kev_fflags = kev->fflags,
#endif // CONFIG_WORKLOOP_DEBUG

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate_safe(ts);

	if (cur_owner && new_owner != cur_owner) {
		if (cur_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		thread_deallocate_safe(cur_owner);

	if (extra_thread_ref) {
		thread_deallocate_safe(extra_thread_ref);

/*
 * Remembers the last update that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any
 */
filt_wlremember_last_update(struct knote *kn, struct kevent_qos_s *kev,
    int error)
	kn->kn_fflags = kev->fflags;
	kn->kn_sdata = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
filt_wlupdate_sync_ipc(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_qos_s *kev, int op)
	user_addr_t uaddr = (user_addr_t) kev->ext[EV_EXTIDX_WL_ADDR];
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	int efault_retry = EVFILT_WORKLOOP_EFAULT_RETRY_COUNT;

	if (op == FILT_WLATTACH) {
		(void)kqueue_alloc_turnstile(&kqwl->kqwl_kqueue);
	} else if (uaddr == 0) {

	/*
	 * Do the debounce thing, the lock serializing the state is the knote lock.
	 */
		/*
		 * Until <rdar://problem/24999882> exists,
		 * disabling preemption copyin forces any
		 * vm_fault we encounter to fail.
		 */
		error = copyin_atomic64(uaddr, &udata);

		/*
		 * If we get EFAULT, drop locks, and retry.
		 * If we still get an error report it,
		 * else assume the memory has been faulted
		 * and attempt to copyin under lock again.
		 */
			if (efault_retry-- > 0) {
				filt_wlunlock(kqwl);
				error = copyin_atomic64(uaddr, &udata);

		kev->ext[EV_EXTIDX_WL_VALUE] = udata;
		kn->kn_ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {

	if (op == FILT_WLATTACH) {
		error = filt_wlattach_sync_ipc(kn);
			disable_preemption();
			error = EPREEMPTDISABLED;

	filt_wlunlock(kqwl);
filt_wlattach(struct knote *kn, struct kevent_qos_s *kev)
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0, result = 0;
	kq_index_t qos_index = 0;

	if (__improbable((kq->kq_state & KQ_WORKLOOP) == 0)) {

	uint32_t command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	case NOTE_WL_THREAD_REQUEST:
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
		if (kqwl->kqwl_request.tr_kq_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
		if ((kn->kn_flags & EV_DISABLE) == 0) {
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
	case NOTE_WL_SYNC_IPC:
		if ((kn->kn_flags & EV_DISABLE) == 0) {
		if (kn->kn_sfflags & (NOTE_WL_UPDATE_QOS | NOTE_WL_DISCOVER_OWNER)) {

	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLATTACH);
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);

	if (error == EPREEMPTDISABLED) {
		result = FILTER_THREADREQ_NODEFEER;

		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
		knote_set_error(kn, error);

	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev, result);
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return result | FILTER_ACTIVE;
filt_wlwait_continue(void *parameter, wait_result_t wr)
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = cont_args->kqwl;

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL, TURNSTILE_WORKLOOPS);
	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);

	kevent_register_wait_return(cont_args);
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 */
filt_wlpost_register_wait(struct uthread *uth, struct knote *kn,
    struct _kevent_register *cont_args)
	struct kqworkloop *kqwl = cont_args->kqwl;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;

	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
	    TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
		    &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
		    TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;

	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);

	thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, knote_filt_wev64(kn),
	    THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

		workq_kern_threadreq_unlock(kqwl->kqwl_p);

	thread_t thread = kqwl->kqwl_owner ?: kqr_thread(kqr);
		thread_reference(thread);

	kevent_register_wait_block(ts, thread, filt_wlwait_continue, cont_args);
/* called in stackshot context to report the thread responsible for blocking this thread */
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
    event64_t event, thread_waitinfo_t *waitinfo)
	extern zone_t thread_zone;
	struct knote *kn = (struct knote *)event;

	zone_require(knote_zone, kn);
	assert(kn->kn_thread == thread);

	struct kqueue *kq = knote_get_kq(kn);
	zone_require(kqworkloop_zone, kq);
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t kqwl_owner = kqwl->kqwl_owner;

	if (kqwl_owner != THREAD_NULL) {
		zone_require(thread_zone, kqwl_owner);
		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if (kqr_thread_requested_pending(kqr)) {
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else if (kqr->tr_state >= WORKQ_TR_STATE_BINDING) {
		zone_require(thread_zone, kqr->tr_thread);
		waitinfo->owner = thread_tid(kqr->tr_thread);
		waitinfo->owner = 0;

	waitinfo->context = kqwl->kqwl_dynamicid;
filt_wldetach(struct knote *kn)
	if (kn->kn_sfflags & NOTE_WL_SYNC_IPC) {
		filt_wldetach_sync_ipc(kn);
	} else if (kn->kn_thread) {
		kevent_register_wait_cleanup(kn);
filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_qos_s *kev,
    thread_qos_t *qos_index)
	uint32_t new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
	uint32_t sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;

	if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
	if (kev->fflags & NOTE_WL_UPDATE_QOS) {
		if (kev->flags & EV_DELETE) {
		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
		if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {

	switch (new_commands) {
	case NOTE_WL_THREAD_REQUEST:
		/* thread requests can only update themselves */
		if (sav_commands != NOTE_WL_THREAD_REQUEST) {
	case NOTE_WL_SYNC_WAIT:
		if (kev->fflags & NOTE_WL_END_OWNERSHIP) {
	case NOTE_WL_SYNC_WAKE:
		if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE))) {
		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
	case NOTE_WL_SYNC_IPC:
		if (sav_commands != NOTE_WL_SYNC_IPC) {
		if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE) {
filt_wltouch(struct knote *kn, struct kevent_qos_s *kev)
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);

	uint32_t command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLTOUCH);
		error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
		filt_wlremember_last_update(kn, kev, error);
	if (error == EPREEMPTDISABLED) {
		result = FILTER_THREADREQ_NODEFEER;

		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
		kev->flags |= EV_ERROR;

	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		return kevent_register_wait_prepare(kn, kev, result);
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			result |= FILTER_UPDATE_REQ_QOS;
		result |= FILTER_ACTIVE;
filt_wlallow_drop(struct knote *kn, struct kevent_qos_s *kev)
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);

	uint32_t command = (kev->fflags & NOTE_WL_COMMANDS_MASK);
	if (command == NOTE_WL_SYNC_IPC) {
		error = filt_wlupdate_sync_ipc(kqwl, kn, kev, FILT_WLDROP);
		error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
		filt_wlremember_last_update(kn, kev, error);
	assert(error != EPREEMPTDISABLED);

	if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
	kev->flags |= EV_ERROR;
filt_wlprocess(struct knote *kn, struct kevent_qos_s *kev)
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace, due to events being delivered
		 * but not triggering a drain session, can sometimes cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		knote_activate(kqwl, kn, FILTER_ACTIVE);
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			if (addr && task_is_active(t) && !task_is_halting(t) &&
			    copyin_atomic64(addr, &val) == 0 &&
			    val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
			    (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
				    "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
				    kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
		knote_fill_kevent(kn, kev, 0);
		kev->fflags = kn->kn_sfflags;
		rc |= FILTER_ACTIVE;

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_wlattach,
	.f_detach = filt_wldetach,
	.f_event = filt_bad_event,
	.f_touch = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
#pragma mark - kqueues allocation and deallocation

/*!
 * @enum kqworkloop_dealloc_flags_t
 *
 * Flags that alter kqworkloop_dealloc() behavior.
 *
 * @const KQWL_DEALLOC_NONE
 * Convenient name for "no flags".
 *
 * @const KQWL_DEALLOC_SKIP_HASH_REMOVE
 * Do not remove the workloop from the hash table.
 * This is used for process tear-down codepaths as the workloops have been
 * removed by the caller already.
 */
OS_OPTIONS(kqworkloop_dealloc_flags, unsigned,
    KQWL_DEALLOC_NONE             = 0x0000,
    KQWL_DEALLOC_SKIP_HASH_REMOVE = 0x0001,
kqworkloop_dealloc(struct kqworkloop *, kqworkloop_dealloc_flags_t, uint32_t);
OS_NOINLINE OS_COLD OS_NORETURN
kqworkloop_retain_panic(struct kqworkloop *kqwl, uint32_t previous)
	if (previous == 0) {
		panic("kq(%p) resurrection", kqwl);
	panic("kq(%p) retain overflow", kqwl);

OS_NOINLINE OS_COLD OS_NORETURN
kqworkloop_release_panic(struct kqworkloop *kqwl)
	panic("kq(%p) over-release", kqwl);

kqworkloop_try_retain(struct kqworkloop *kqwl)
	uint32_t old_ref, new_ref;
	os_atomic_rmw_loop(&kqwl->kqwl_retains, old_ref, new_ref, relaxed, {
		if (__improbable(old_ref == 0)) {
			os_atomic_rmw_loop_give_up(return false);
		if (__improbable(old_ref >= KQ_WORKLOOP_RETAINS_MAX)) {
			kqworkloop_retain_panic(kqwl, old_ref);
		new_ref = old_ref + 1;

kqworkloop_retain(struct kqworkloop *kqwl)
	uint32_t previous = os_atomic_inc_orig(&kqwl->kqwl_retains, relaxed);
	if (__improbable(previous == 0 || previous >= KQ_WORKLOOP_RETAINS_MAX)) {
		kqworkloop_retain_panic(kqwl, previous);

kqueue_retain(kqueue_t kqu)
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_retain(kqu.kqwl);

kqworkloop_release_live(struct kqworkloop *kqwl)
	uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
	if (__improbable(refs <= 1)) {
		kqworkloop_release_panic(kqwl);

kqueue_release_live(kqueue_t kqu)
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release_live(kqu.kqwl);

kqworkloop_release(struct kqworkloop *kqwl)
	uint32_t refs = os_atomic_dec_orig(&kqwl->kqwl_retains, relaxed);
	if (__improbable(refs <= 1)) {
		kqworkloop_dealloc(kqwl, KQWL_DEALLOC_NONE, refs - 1);

kqueue_release(kqueue_t kqu)
	if (kqu.kq->kq_state & KQ_DYNAMIC) {
		kqworkloop_release(kqu.kqwl);
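
/*
 * The retain/release helpers above implement a saturating reference count:
 * a count of 0 means the workloop is being destroyed (retains must fail or
 * panic) and KQ_WORKLOOP_RETAINS_MAX guards against overflow.  As a point of
 * comparison only, a rough equivalent of kqworkloop_try_retain() in portable
 * C11 atomics is sketched below; note the kernel version panics on overflow
 * where this sketch simply fails.
 */
#if 0	/* illustrative sketch only, not part of the build */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool
refcount_try_retain(_Atomic uint32_t *refs, uint32_t max)
{
	uint32_t old = atomic_load_explicit(refs, memory_order_relaxed);
	do {
		if (old == 0) {
			return false;   /* object already being destroyed */
		}
		if (old >= max) {
			return false;   /* saturated; the kernel would panic here */
		}
	} while (!atomic_compare_exchange_weak_explicit(refs, &old, old + 1,
	    memory_order_relaxed, memory_order_relaxed));
	return true;
}
#endif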
/*!
 * @function kqueue_destroy
 *
 * Common part to all kqueue dealloc functions.
 */
kqueue_destroy(kqueue_t kqu, zone_t zone)
	/*
	 * waitq_set_deinit() removes the KQ's waitq set from
	 * any select sets to which it may belong.
	 *
	 * The order of these deinits matters: before waitq_set_deinit() returns,
	 * waitq_set__CALLING_PREPOST_HOOK__ may be called and it will take the
	 * kq lock.
	 */
	waitq_set_deinit(&kqu.kq->kq_wqs);
	lck_spin_destroy(&kqu.kq->kq_lock, &kq_lck_grp);

	zfree(zone, kqu.kq);

/*!
 * @function kqueue_init
 *
 * Common part to all kqueue alloc functions.
 */
kqueue_init(kqueue_t kqu, waitq_set_prepost_hook_t *hook, int policy)
	waitq_set_init(&kqu.kq->kq_wqs, policy, NULL, hook);
	lck_spin_init(&kqu.kq->kq_lock, &kq_lck_grp, LCK_ATTR_NULL);
#pragma mark kqfile allocation and deallocation

/*!
 * @function kqueue_dealloc
 *
 * Detach all knotes from a kqfile and free it.
 *
 * We walk each list looking for knotes referencing this
 * kqueue.  If we find one, we try to drop it.  But
 * if we fail to get a drop reference, that will wait
 * until it is dropped.  So, we can just restart again
 * safe in the assumption that the list will eventually
 * not contain any more references to this kqueue (either
 * we dropped them all, or someone else did).
 *
 * Assumes no new events are being added to the kqueue.
 * Nothing locked on entry or exit.
 */
kqueue_dealloc(struct kqueue *kq)
	KNOTE_LOCK_CTX(knlc);
	struct proc *p = kq->kq_p;
	struct filedesc *fdp = p->p_fd;

	assert(kq && (kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ)) == 0);

	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		while (kn != NULL) {
			if (kq == knote_get_kq(kn)) {
				if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
					knote_drop(kq, kn, &knlc);
				/* start over at beginning of list */
				kn = SLIST_FIRST(&fdp->fd_knlist[i]);
			kn = SLIST_NEXT(kn, kn_link);

	if (fdp->fd_knhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			while (kn != NULL) {
				if (kq == knote_get_kq(kn)) {
					if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
						knote_drop(kq, kn, &knlc);
					/* start over at beginning of list */
					kn = SLIST_FIRST(&fdp->fd_knhash[i]);
				kn = SLIST_NEXT(kn, kn_link);

	kqueue_destroy(kq, kqfile_zone);
/*!
 * @function kqueue_alloc
 *
 * Allocate a kqfile.
 */
kqueue_alloc(struct proc *p)
	/*
	 * kqfiles are created with kqueue() so we need to wait for
	 * the first kevent syscall to know which bit among
	 * KQ_KEV_{32,64,QOS} will be set in kqf_state
	 */
	kqf = zalloc_flags(kqfile_zone, Z_WAITOK | Z_ZERO);
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_queue);
	TAILQ_INIT_AFTER_BZERO(&kqf->kqf_suppressed);

	return kqueue_init(kqf, NULL, SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST).kq;

/*!
 * @function kqueue_internal
 *
 * Core implementation for kqueue and guarded_kqueue_np()
 */
kqueue_internal(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
	struct fileproc *fp;

	error = falloc_withalloc(p, &fp, &fd, vfs_context_current(), fp_zalloc, cra);

	kq = kqueue_alloc(p);

	fp->f_flag = FREAD | FWRITE;
	fp->f_ops = &kqueueops;
		fp->f_lflags |= FG_CONFINED;

	*fdflags(p, fd) |= UF_EXCLOSE | UF_FORKCLOSE;
	procfdtbl_releasefd(p, fd, NULL);
	fp_drop(p, fd, fp, 1);

/*
 * The kqueue syscall.
 */
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
	return kqueue_internal(p, fileproc_alloc_init, NULL, retval);
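
/*
 * kqueue_internal() above is what backs the kqueue(2) syscall; from userspace
 * the resulting descriptor is driven entirely through kevent(2) and friends.
 * A minimal userspace usage sketch, assuming only the standard <sys/event.h>
 * interfaces, is shown below for orientation.
 */
#if 0	/* userspace illustration, not kernel code */
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int kq = kqueue();              /* lands in kqueue_internal() above */
	struct kevent change, result;

	/* register interest in stdin becoming readable */
	EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &change, 1, NULL, 0, NULL) == -1) {
		return 1;
	}
	/* block until one event is delivered */
	if (kevent(kq, NULL, 0, &result, 1, NULL) == 1) {
		printf("fd %lu readable, %ld bytes pending\n",
		    (unsigned long)result.ident, (long)result.data);
	}
	close(kq);
	return 0;
}
#endif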
#pragma mark kqworkq allocation and deallocation

/*!
 * @function kqworkq_dealloc
 *
 * Deallocates a workqueue kqueue.
 *
 * This only happens at process death, or for races with concurrent
 * kevent_get_kqwq calls, hence we don't have to care about knotes referencing
 * this kqueue, either there are none, or someone else took care of them.
 */
kqworkq_dealloc(struct kqworkq *kqwq)
	kqueue_destroy(kqwq, kqworkq_zone);

/*!
 * @function kqworkq_alloc
 *
 * Allocates a workqueue kqueue.
 *
 * This is the slow path of kevent_get_kqwq.
 * This takes care of making sure procs have a single workq kqueue.
 */
static struct kqworkq *
kqworkq_alloc(struct proc *p, unsigned int flags)
	struct kqworkq *kqwq, *tmp;

	kqwq = zalloc_flags(kqworkq_zone, Z_WAITOK | Z_ZERO);

	assert((flags & KEVENT_FLAG_LEGACY32) == 0);
	if (flags & KEVENT_FLAG_LEGACY64) {
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV64;
		kqwq->kqwq_state = KQ_WORKQ | KQ_KEV_QOS;

	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_queue[i]);
		TAILQ_INIT_AFTER_BZERO(&kqwq->kqwq_suppressed[i]);
	for (int i = 0; i < KQWQ_NBUCKETS; i++) {
		/*
		 * Because of how the bucketized system works, we mix overcommit
		 * sources with non overcommit ones: each time we move a knote from
		 * one bucket to the next due to overrides, we'd have to track
		 * overcommitness, and it's really not worth it in the workloop
		 * enabled world to track this faithfully.
		 *
		 * Incidentally, this behaves like the original manager-based
		 * kqwq where event delivery always happened (hence is
		 * "overcommit").
		 */
		kqwq->kqwq_request[i].tr_state = WORKQ_TR_STATE_IDLE;
		kqwq->kqwq_request[i].tr_flags = WORKQ_TR_FLAG_KEVENT;
		if (i != KQWQ_QOS_MANAGER) {
			kqwq->kqwq_request[i].tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
		kqwq->kqwq_request[i].tr_kq_qos_index = (kq_index_t)i;

	kqueue_init(kqwq, &kqwq->kqwq_waitq_hook, SYNC_POLICY_FIFO);

	if (!os_atomic_cmpxchgv(&p->p_fd->fd_wqkqueue, NULL, kqwq, &tmp, release)) {
		kqworkq_dealloc(kqwq);
#pragma mark kqworkloop allocation and deallocation

#define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
#define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
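
/*
 * For orientation only: KQ_HASH folds the next-higher byte of the workloop id
 * into the low bits before masking, so ids that differ only in bits 8..15
 * still spread across buckets.  With a hypothetical 64-bucket table
 * (mask 0x3f):
 *
 *     KQ_HASH(0x1234, 0x3f) == ((0x1234 ^ 0x12) & 0x3f) == 0x26
 *     KQ_HASH(0x1334, 0x3f) == ((0x1334 ^ 0x13) & 0x3f) == 0x27
 *
 * whereas masking alone would have sent both ids to bucket 0x34.
 */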
kqhash_lock(struct filedesc *fdp)
	lck_mtx_lock_spin_always(&fdp->fd_kqhashlock);

kqhash_unlock(struct filedesc *fdp)
	lck_mtx_unlock(&fdp->fd_kqhashlock);

kqworkloop_hash_insert_locked(struct filedesc *fdp, kqueue_id_t id,
    struct kqworkloop *kqwl)
	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	LIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);

static inline struct kqworkloop *
kqworkloop_hash_lookup_locked(struct filedesc *fdp, kqueue_id_t id)
	struct kqwllist *list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	struct kqworkloop *kqwl;

	LIST_FOREACH(kqwl, list, kqwl_hashlink) {
		if (kqwl->kqwl_dynamicid == id) {

static struct kqworkloop *
kqworkloop_hash_lookup_and_retain(struct filedesc *fdp, kqueue_id_t kq_id)
	struct kqworkloop *kqwl = NULL;

	if (__probable(fdp->fd_kqhash)) {
		kqwl = kqworkloop_hash_lookup_locked(fdp, kq_id);
		if (kqwl && !kqworkloop_try_retain(kqwl)) {

kqworkloop_hash_init(struct filedesc *fdp)
	struct kqwllist *alloc_hash;

	alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);

	/* See if we won the race */
	if (__probable(fdp->fd_kqhashmask == 0)) {
		fdp->fd_kqhash = alloc_hash;
		fdp->fd_kqhashmask = alloc_mask;
		hashdestroy(alloc_hash, M_KQUEUE, alloc_mask);
/*!
 * @function kqworkloop_dealloc
 *
 * Deallocates a workloop kqueue.
 *
 * Knotes hold references on the workloop, so we can't really reach this
 * function unless all of these are already gone.
 *
 * Nothing locked on entry or exit.
 *
 * @param flags
 * Unless KQWL_DEALLOC_SKIP_HASH_REMOVE is set, the workloop is removed
 * from its hash table.
 *
 * @param current_ref
 * This function is also called to undo a kqworkloop_alloc in case of
 * allocation races; current_ref is the refcount that is expected
 * on the workloop object, usually 0, and 1 when a dealloc race is resolved.
 */
kqworkloop_dealloc(struct kqworkloop *kqwl, kqworkloop_dealloc_flags_t flags,
    uint32_t current_ref)
	if (__improbable(current_ref > 1)) {
		kqworkloop_release_panic(kqwl);
	assert(kqwl->kqwl_retains == current_ref);

	/* pair with kqunlock() and other kq locks */
	os_atomic_thread_fence(acquire);

	cur_owner = kqwl->kqwl_owner;
		if (kqworkloop_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(cur_owner);
		thread_deallocate(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		struct turnstile *ts;
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    &ts, TURNSTILE_WORKLOOPS);
		turnstile_cleanup();
		turnstile_deallocate(ts);

	if ((flags & KQWL_DEALLOC_SKIP_HASH_REMOVE) == 0) {
		struct filedesc *fdp = kqwl->kqwl_p->p_fd;

		LIST_REMOVE(kqwl, kqwl_hashlink);

	assert(TAILQ_EMPTY(&kqwl->kqwl_suppressed));
	assert(kqwl->kqwl_owner == THREAD_NULL);
	assert(kqwl->kqwl_turnstile == TURNSTILE_NULL);

	lck_spin_destroy(&kqwl->kqwl_statelock, &kq_lck_grp);
	kqueue_destroy(kqwl, kqworkloop_zone);
/*!
 * @function kqworkloop_alloc
 *
 * Allocates a workloop kqueue.
 */
kqworkloop_init(struct kqworkloop *kqwl, proc_t p,
    kqueue_id_t id, workq_threadreq_param_t *trp)
	kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC | KQ_KEV_QOS;
	kqwl->kqwl_retains = 1; /* donate a retain to creator */
	kqwl->kqwl_dynamicid = id;
		kqwl->kqwl_params = trp->trp_value;

	workq_tr_flags_t tr_flags = WORKQ_TR_FLAG_WORKLOOP;
		if (trp->trp_flags & TRP_PRIORITY) {
			tr_flags |= WORKQ_TR_FLAG_WL_OUTSIDE_QOS;
		if (trp->trp_flags) {
			tr_flags |= WORKQ_TR_FLAG_WL_PARAMS;
	kqwl->kqwl_request.tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_request.tr_flags = tr_flags;

	for (int i = 0; i < KQWL_NBUCKETS; i++) {
		TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_queue[i]);
	TAILQ_INIT_AFTER_BZERO(&kqwl->kqwl_suppressed);

	lck_spin_init(&kqwl->kqwl_statelock, &kq_lck_grp, LCK_ATTR_NULL);

	kqueue_init(kqwl, &kqwl->kqwl_waitq_hook, SYNC_POLICY_FIFO);
/*!
 * @function kqworkloop_get_or_create
 *
 * Wrapper around kqworkloop_alloc that handles the uniquing of workloops.
 *
 * @returns
 * EINVAL: invalid parameters
 * EEXIST: KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST is set and a collision exists.
 * ENOENT: KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST is set and the entry wasn't found.
 * ENOMEM: allocation failed
 */
kqworkloop_get_or_create(struct proc *p, kqueue_id_t id,
    workq_threadreq_param_t *trp, unsigned int flags, struct kqworkloop **kqwlp)
	struct filedesc *fdp = p->p_fd;
	struct kqworkloop *alloc_kqwl = NULL;
	struct kqworkloop *kqwl = NULL;

	assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));

	if (id == 0 || id == (kqueue_id_t)-1) {

		if (__improbable(fdp->fd_kqhash == NULL)) {
			kqworkloop_hash_init(fdp);

		kqwl = kqworkloop_hash_lookup_locked(fdp, id);
			if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
				/*
				 * If MUST_NOT_EXIST was passed, even if we would have failed
				 * the try_retain, it could have gone the other way, and
				 * userspace can't tell.  Let'em fix their race.
				 */
			if (__probable(kqworkloop_try_retain(kqwl))) {
				/*
				 * This is a valid live workloop!
				 */

		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST)) {

		/*
		 * We didn't find what we were looking for.
		 *
		 * If this is the second time we reach this point (alloc_kqwl != NULL),
		 * then we're done.
		 *
		 * If this is the first time we reach this point (alloc_kqwl == NULL),
		 * then try to allocate one without blocking.
		 */
		if (__probable(alloc_kqwl == NULL)) {
			alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_NOWAIT | Z_ZERO);
		if (__probable(alloc_kqwl)) {
			kqworkloop_init(alloc_kqwl, p, id, trp);
			kqworkloop_hash_insert_locked(fdp, id, alloc_kqwl);
			*kqwlp = alloc_kqwl;

		/*
		 * We have to block to allocate a workloop, drop the lock,
		 * allocate one, but then we need to retry lookups as someone
		 * else could race with us.
		 */
		alloc_kqwl = zalloc_flags(kqworkloop_zone, Z_WAITOK | Z_ZERO);

	if (__improbable(alloc_kqwl)) {
		zfree(kqworkloop_zone, alloc_kqwl);
#pragma mark - knotes

filt_no_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
	knote_set_error(kn, ENOTSUP);

filt_no_detach(__unused struct knote *kn)

filt_bad_event(struct knote *kn, long hint)
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);

filt_bad_touch(struct knote *kn, struct kevent_qos_s *kev)
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);

filt_bad_process(struct knote *kn, struct kevent_qos_s *kev)
	panic("%s[%d](%p, %p)", __func__, kn->kn_filter, kn, kev);
/*
 * knotes_dealloc - detach all knotes for the process and drop them
 *
 *	Called with proc_fdlock held.
 *	Returns with it locked.
 *	May drop it temporarily.
 *	Process is in such a state that it will not try to allocate
 *	any more knotes during this process (stopped for exit or exec).
 */
knotes_dealloc(proc_t p)
	struct filedesc *fdp = p->p_fd;
	struct klist *kn_hash = NULL;

	/* Close all the fd-indexed knotes up front */
	if (fdp->fd_knlistsize > 0) {
		for (i = 0; i < fdp->fd_knlistsize; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
				kq = knote_get_kq(kn);
				knote_drop(kq, kn, NULL);
		/* free the table */
		FREE(fdp->fd_knlist, M_KQUEUE);
		fdp->fd_knlist = NULL;
		fdp->fd_knlistsize = 0;

	/* Clean out all the hashed knotes as well */
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
				kq = knote_get_kq(kn);
				knote_drop(kq, kn, NULL);
		kn_hash = fdp->fd_knhash;
		kn_hashmask = fdp->fd_knhashmask;
		fdp->fd_knhashmask = 0;
		fdp->fd_knhash = NULL;

	hashdestroy(kn_hash, M_KQUEUE, kn_hashmask);
/*
 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
 * scheduling parameters
 *
 *	Called with proc_fdlock held.
 *	Returns with it locked.
 *	Process is in such a state that it will not try to allocate
 *	any more knotes during this process (stopped for exit or exec).
 */
kqworkloops_dealloc(proc_t p)
	struct filedesc *fdp = p->p_fd;
	struct kqworkloop *kqwl, *kqwln;
	struct kqwllist tofree;

	if (!(fdp->fd_flags & FD_WORKLOOP)) {

	if (fdp->fd_kqhashmask == 0) {

	for (size_t i = 0; i <= fdp->fd_kqhashmask; i++) {
		LIST_FOREACH_SAFE(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink, kqwln) {
			/*
			 * kqworkloops that have scheduling parameters have an
			 * implicit retain from kqueue_workloop_ctl that needs
			 * to be balanced on process exit.
			 */
			assert(kqwl->kqwl_params);
			LIST_REMOVE(kqwl, kqwl_hashlink);
			LIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);

	LIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
		kqworkloop_dealloc(kqwl, KQWL_DEALLOC_SKIP_HASH_REMOVE, 1);
kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
    struct kevent_qos_s *kev)
	/* We don't care about the priority of a disabled or deleted knote */
	if (kev->flags & (EV_DISABLE | EV_DELETE)) {

	if (kq->kq_state & KQ_WORKLOOP) {
		/*
		 * Workloops need valid priorities with a QOS (excluding manager) for
		 * any enabled knote.
		 *
		 * When it is pre-existing, just make sure it has a valid QoS as
		 * kevent_register() will not use the incoming priority (filters who do
		 * have the responsibility to validate it again, see filt_wltouch).
		 *
		 * If the knote is being made, validate the incoming priority.
		 */
		if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
/*
 * Prepare a filter for waiting after register.
 *
 * The f_post_register_wait hook will be called later by kevent_register()
 * and should call kevent_register_wait_block()
 */
kevent_register_wait_prepare(struct knote *kn, struct kevent_qos_s *kev, int rc)
	thread_t thread = current_thread();

	assert(knote_fops(kn)->f_extended_codes);

	if (kn->kn_thread == NULL) {
		thread_reference(thread);
		kn->kn_thread = thread;
	} else if (kn->kn_thread != thread) {
		/*
		 * kn_thread may be set from a previous aborted wait.
		 * However, it has to be from the same thread.
		 */
		kev->flags |= EV_ERROR;

	return FILTER_REGISTER_WAIT | rc;

/*
 * Clean up a kevent_register_wait_prepare() effect for threads that have been
 * aborted instead of properly woken up with thread_wakeup_thread().
 */
kevent_register_wait_cleanup(struct knote *kn)
	thread_t thread = kn->kn_thread;
	kn->kn_thread = NULL;
	thread_deallocate(thread);

/*
 * Must be called at the end of a f_post_register_wait call from a filter.
 */
kevent_register_wait_block(struct turnstile *ts, thread_t thread,
    thread_continue_t cont, struct _kevent_register *cont_args)
	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
	kqunlock(cont_args->kqwl);
	cont_args->handoff_thread = thread;
	thread_handoff_parameter(thread, cont, cont_args, THREAD_HANDOFF_NONE);

/*
 * Called by Filters using a f_post_register_wait to return from their wait.
 */
kevent_register_wait_return(struct _kevent_register *cont_args)
	struct kqworkloop *kqwl = cont_args->kqwl;
	struct kevent_qos_s *kev = &cont_args->kev;

	if (cont_args->handoff_thread) {
		thread_deallocate(cont_args->handoff_thread);

	if (kev->flags & (EV_ERROR | EV_RECEIPT)) {
		if ((kev->flags & EV_ERROR) == 0) {
			kev->flags |= EV_ERROR;
		error = kevent_modern_copyout(kev, &cont_args->ueventlist);
			cont_args->eventout++;

	kqworkloop_release(kqwl);
	*(int32_t *)&current_uthread()->uu_rval = cont_args->eventout;
	unix_syscall_return(error);
/*
 * kevent_register - add a new event to a kqueue
 *
 *	Creates a mapping between the event source and
 *	the kqueue via a knote data structure.
 *
 *	Because many/most of the event sources are file
 *	descriptor related, the knote is linked off
 *	the filedescriptor table for quick access.
 *
 *	called with nothing locked
 *	caller holds a reference on the kqueue
 */
kevent_register(struct kqueue *kq, struct kevent_qos_s *kev,
    struct knote **kn_out)
	struct proc *p = kq->kq_p;
	const struct filterops *fops;
	struct knote *kn = NULL;
	int result = 0, error = 0;
	unsigned short kev_flags = kev->flags;
	KNOTE_LOCK_CTX(knlc);

	if (__probable(kev->filter < 0 && kev->filter + EVFILT_SYSCOUNT >= 0)) {
		fops = sysfilt_ops[~kev->filter];        /* to 0-base index */

	/* restrict EV_VANISHED to adding udata-specific dispatch kevents */
	if (__improbable((kev->flags & EV_VANISHED) &&
	    (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2))) {

	/* Simplify the flags - delete and disable overrule */
	if (kev->flags & EV_DELETE) {
		kev->flags &= ~EV_ADD;
	if (kev->flags & EV_DISABLE) {
		kev->flags &= ~EV_ENABLE;

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kev->udata, kev->flags, kev->filter);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
		    0, kev->udata, kev->flags, kev->filter);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq),
		    kev->udata, kev->flags, kev->filter);

	/* find the matching knote from the fd tables/hashes */
	kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
	error = kevent_register_validate_priority(kq, kn, kev);

	if (kn == NULL && (kev->flags & EV_ADD) == 0) {
		/*
		 * No knote found, EV_ADD wasn't specified
		 */
		if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			/*
			 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
			 * that doesn't care about ENOENT, so just pretend the deletion
			 * happened.
			 */
	} else if (kn == NULL) {
		/*
		 * No knote found, need to attach a new one (attach)
		 */
		struct fileproc *knote_fp = NULL;

		/* grab a file reference for the new knote */
			if ((error = fp_lookup(p, (int)kev->ident, &knote_fp, 0)) != 0) {

			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);

		kn->kn_fp = knote_fp;
		kn->kn_is_fd = fops->f_isfd;
		kn->kn_kq_packed = VM_PACK_POINTER((vm_offset_t)kq, KNOTE_KQ_PACKED);

		/* was vanish support requested */
		if (kev->flags & EV_VANISHED) {
			kev->flags &= ~EV_VANISHED;
			kn->kn_status |= KN_REQVANISH;

		/* snapshot matching/dispatching protocol flags into knote */
		if (kev->flags & EV_DISABLE) {
			kn->kn_status |= KN_DISABLED;

		/*
		 * copy the kevent state into knote
		 * protocol is that fflags and data
		 * are saved off, and cleared before
		 * calling the attach routine.
		 *
		 * - kn->kn_sfflags aliases with kev->xflags
		 * - kn->kn_sdata aliases with kev->data
		 * - kn->kn_filter is the top 8 bits of kev->filter
		 */
		kn->kn_kevent = *(struct kevent_internal_s *)kev;
		kn->kn_sfflags = kev->fflags;
		kn->kn_filtid = (uint8_t)~kev->filter;

		knote_reset_priority(kq, kn, kev->qos);

		/* Add the knote for lookup thru the fd table */
		error = kq_add_knote(kq, kn, &knlc, p);
			if (knote_fp != NULL) {
				fp_drop(p, (int)kev->ident, knote_fp, 0);
			if (error == ERESTART) {

		/* fp reference count now applies to knote */

		/*
		 * we can't use filter_call() because f_attach can change the filter ops
		 * for a filter that supports f_extended_codes, so we need to reload
		 * knote_fops() and not use `fops`.
		 */
		result = fops->f_attach(kn, kev);
		if (result && !knote_fops(kn)->f_extended_codes) {
			result = FILTER_ACTIVE;
		if (result & FILTER_THREADREQ_NODEFEER) {
			enable_preemption();

		if (kn->kn_flags & EV_ERROR) {
			/*
			 * Failed to attach correctly, so drop.
			 */
			kn->kn_filtid = EVFILTID_DETACHED;
			error = (int)kn->kn_sdata;
			knote_drop(kq, kn, &knlc);

		/*
		 * end "attaching" phase - now just attached
		 *
		 * Mark the thread request overcommit, if apropos
		 *
		 * If the attach routine indicated that an
		 * event is already fired, activate the knote.
		 */
		if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
		    (kq->kq_state & KQ_WORKLOOP)) {
			kqworkloop_set_overcommit((struct kqworkloop *)kq);
	} else if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/*
		 * The knote was dropped while we were waiting for the lock,
		 * we need to re-evaluate entirely
		 */
	} else if (kev->flags & EV_DELETE) {
		/*
		 * Deletion of a knote (drop)
		 *
		 * If the filter wants to filter drop events, let it do so.
		 *
		 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
		 * we must wait for the knote to be re-enabled (unless it is being
		 * re-enabled atomically here).
		 */
		if (knote_fops(kn)->f_allow_drop) {
			drop = knote_fops(kn)->f_allow_drop(kn, kev);

		if ((kev->flags & EV_ENABLE) == 0 &&
		    (kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DISABLED) != 0) {
			kn->kn_status |= KN_DEFERDELETE;
			error = EINPROGRESS;

		knote_drop(kq, kn, &knlc);
		/*
		 * Regular update of a knote (touch)
		 *
		 * Call touch routine to notify filter of changes in filter values
		 * (and to re-determine if any events are fired).
		 *
		 * If the knote is in defer-delete, avoid calling the filter touch
		 * routine (it has delivered its last event already).
		 *
		 * If the touch routine had no failure,
		 * apply the requested side effects to the knote.
		 */
		if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
			if (kev->flags & EV_ENABLE) {
				result = FILTER_ACTIVE;
			result = filter_call(knote_fops(kn), f_touch(kn, kev));
			if (result & FILTER_THREADREQ_NODEFEER) {
				enable_preemption();

		if (kev->flags & EV_ERROR) {

		if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0 &&
		    kn->kn_udata != kev->udata) {
			// this allows klist_copy_udata() not to take locks
			os_atomic_store_wide(&kn->kn_udata, kev->udata, relaxed);
		if ((kev->flags & EV_DISABLE) && !(kn->kn_status & KN_DISABLED)) {
			kn->kn_status |= KN_DISABLED;
			knote_dequeue(kq, kn);

		/* accept new kevent state */
		knote_apply_touch(kq, kn, kev, result);

	/*
	 * When the filter asked for a post-register wait,
	 * we leave the kqueue locked for kevent_register()
	 * to call the filter's f_post_register_wait hook.
	 */
	if (result & FILTER_REGISTER_WAIT) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);

	/* output local errors through the kevent */
		kev->flags |= EV_ERROR;
/*
 * knote_process - process a triggered event
 *
 *	Validate that it is really still a triggered event
 *	by calling the filter routines (if necessary).  Hold
 *	a use reference on the knote to avoid it being detached.
 *
 *	If it is still considered triggered, we will have taken
 *	a copy of the state under the filter lock.  We use that
 *	snapshot to dispatch the knote for future processing (or
 *	not, if this was a lost event).
 *
 *	Our caller assures us that nobody else can be processing
 *	events from this knote during the whole operation.  But
 *	others can be touching or posting events to the knote
 *	interspersed with our processing it.
 *
 *	caller holds a reference on the kqueue.
 *	kqueue locked on entry and exit - but may be dropped
 */
knote_process(struct knote *kn, kevent_ctx_t kectx,
    kevent_callback_t callback)
	struct kevent_qos_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;

	/*
	 * Must be active or stayactive
	 * Must be queued and not disabled/suppressed or dropping
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE));
	assert(!(kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)));

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
		    ((struct kqworkloop *)kq)->kqwl_dynamicid,
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
		    0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
		    kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);

	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */

	/*
	 * While waiting for the knote lock, we may have dropped the kq lock,
	 * and a touch may have disabled and dequeued the knote.
	 */
	if (!(kn->kn_status & KN_QUEUED)) {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life.  Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kq, kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		uint16_t kev_flags = EV_DISPATCH2 | EV_ONESHOT;
		if (kn->kn_status & KN_DEFERDELETE) {
			kev_flags |= EV_DELETE;
			kev_flags |= EV_VANISHED;

		/* create fake event */
		kev = (struct kevent_qos_s){
			.filter = kn->kn_filter,
			.udata = kn->kn_udata,
		kev = (struct kevent_qos_s) { };
		result = filter_call(knote_fops(kn), f_process(kn, &kev));

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this
	 *            is the deferred delete event delivery itself).  Otherwise,
	 *            drop it.
	 * Dispatch:  don't clear state, just mark it disabled.
	 * Cleared:   just leave it deactivated.
	 * Others:    re-activate as there may be more events to handle.
	 *            This will not wake up more handlers right now, but
	 *            at the completion of handling events it may trigger
	 *            more handler threads (TODO: optimize based on more than
	 *            just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
			/*
			 * Stay active knotes should not be unsuppressed or we'd create an
			 * infinite loop.
			 *
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kq, kn);
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);

	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		knote_adjust_qos(kq, kn, result);
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_flags & EV_DISPATCH2) == EV_DISPATCH2 &&
		    (kn->kn_status & KN_DEFERDELETE) == 0) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE | KN_DISABLED;
	} else if (kn->kn_flags & EV_DISPATCH) {
		/* disable all dispatch knotes */
		kn->kn_status |= KN_DISABLED;
	} else if ((kn->kn_flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kq, kn, FILTER_ACTIVE);

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
		knote_drop(kq, kn, &knlc);
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);

	if (kev.flags & EV_VANISHED) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
		    kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),

	error = (callback)(&kev, kectx);
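
/*
 * The dispatch decision above maps directly onto the public flag semantics:
 * EV_ONESHOT events are deleted after one delivery, EV_DISPATCH events are
 * disabled until re-enabled, and EV_CLEAR events are not re-activated until
 * new state arrives.  A userspace sketch of the EV_DISPATCH re-arm cycle,
 * assuming only standard <sys/event.h> interfaces, is shown below.
 */
#if 0	/* userspace illustration, not kernel code */
#include <sys/event.h>

static void
dispatch_loop(int kq, int fd)
{
	struct kevent kev;

	/* delivered once, then left disabled by the kernel (KN_DISABLED) */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) != 1) {
			break;
		}
		/* ... consume the data available on kev.ident ... */

		/* explicit re-arm, mirroring the EV_ENABLE path in the touch code */
		EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE | EV_DISPATCH, 0, 0, NULL);
		kevent(kq, &kev, 1, NULL, 0, NULL);
	}
}
#endif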
/*
 * Returns -1 if the kqueue was unbound and processing should not happen
 */
#define KQWQAE_BEGIN_PROCESSING 1
#define KQWQAE_END_PROCESSING   2
#define KQWQAE_UNBIND           3
kqworkq_acknowledge_events(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags, int kqwqae_op)
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
	thread_t thread = kqr_thread_fast(kqr);
	struct kqtailq *suppressq = &kqwq->kqwq_suppressed[kqr->tr_kq_qos_index];

	kqlock_held(&kqwq->kqwq_kqueue);

	if (!TAILQ_EMPTY(suppressq)) {
		/*
		 * Return suppressed knotes to their original state.
		 * For workq kqueues, suppressed ones that are still
		 * truly active (not just forced into the queue) will
		 * set flags we check below to see if anything got
		 * delivered.
		 */
		while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
			assert(kn->kn_status & KN_SUPPRESSED);
			knote_unsuppress(kqwq, kn);

#if DEBUG || DEVELOPMENT
		thread_t self = current_thread();
		struct uthread *ut = get_bsdthread_info(self);

		assert(thread == self);
		assert(ut->uu_kqr_bound == kqr);
#endif // DEBUG || DEVELOPMENT

	if (kqwqae_op == KQWQAE_UNBIND) {
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = !kqr->tr_kq_wakeup;
		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		/*
		 * request a new thread if we didn't process the whole queue or real events
		 * have happened (not just putting stay-active events back).
		 */
		if (kqr->tr_kq_wakeup) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
			    kqr->tr_kq_qos_index, 0);

		/*
		 * Reset wakeup bit to notice events firing while we are processing,
		 * as we cannot rely on the bucket queue emptiness because of stay
		 * active knotes.
		 */
		kqr->tr_kq_wakeup = false;

		thread_drop_kevent_override(thread);
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
kqworkq_begin_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags)
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    0, kqr->tr_kq_qos_index);

	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
	    KQWQAE_BEGIN_PROCESSING);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_wakeup);
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	TAILQ_FOREACH_SAFE(kn, &kqwl->kqwl_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
		    (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
		    (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, kn->kn_qos_override);
		knote_unsuppress(kqwl, kn);
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	thread_qos_t qos_override;
	thread_t thread = kqr_thread_fast(kqr);
	int rc = 0, op = KQWL_UTQ_NONE;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind.
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow.  For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 */
		if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
			op = KQWL_UTQ_PARKING;
			op = KQWL_UTQ_UNBINDING;

	if (op == KQWL_UTQ_NONE) {

	qos_override = kqworkloop_acknowledge_events(kqwl);

	if (op == KQWL_UTQ_UNBINDING) {
		kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_IMMEDIATELY);
		kqworkloop_release_live(kqwl);
	kqworkloop_update_threads_qos(kqwl, op, qos_override);
	if (op == KQWL_UTQ_PARKING) {
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			/*
			 * We cannot trust tr_kq_wakeup when looking at stay active knotes.
			 * We need to process once, and kqworkloop_end_processing will
			 * handle the unbind.
			 */
		} else if (!kqr->tr_kq_wakeup || kqwl->kqwl_owner) {
			kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
	} else if (op == KQWL_UTQ_UNBINDING) {
		if (kqr_thread(kqr) == thread) {
			/*
			 * The thread request fired again, passed the admission check and
			 * got bound to the current thread again.
			 */

		/*
		 * Reset wakeup bit to notice stay active events firing while we are
		 * processing, as we cannot rely on the stayactive bucket emptiness.
		 */
		kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;

		kq->kq_state &= ~KQ_PROCESSING;

		kqworkloop_unbind_delayed_override_drop(thread);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 * EBADF if the kqueue is draining
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
kqfile_begin_processing(struct kqfile *kq)
	struct kqtailq *suppressq;

	assert((kq->kqf_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
		if (kq->kqf_state & KQ_DRAIN) {
			KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
			    VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);

		if ((kq->kqf_state & KQ_PROCESSING) == 0) {

		/* if someone else is processing the queue, wait */
		kq->kqf_state |= KQ_PROCWAIT;
		suppressq = &kq->kqf_suppressed;
		waitq_assert_wait64((struct waitq *)&kq->kqf_wqs,
		    CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
		    TIMEOUT_WAIT_FOREVER);

		thread_block(THREAD_CONTINUE_NULL);

	/* Nobody else processing */

	/* clear pre-posts and KQ_WAKEUP now, in case we bail early */
	waitq_set_clear_preposts(&kq->kqf_wqs);
	kq->kqf_state &= ~KQ_WAKEUP;

	/* anything left to process? */
	if (TAILQ_EMPTY(&kq->kqf_queue)) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
		    VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);

	/* convert to processing mode */
	kq->kqf_state |= KQ_PROCESSING;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
	    VM_KERNEL_UNSLIDE_OR_PERM(kq));
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is set up again so that it is ready to be
 * processed.
 */
kqworkq_end_processing(struct kqworkq *kqwq, workq_threadreq_t kqr,
    int kevent_flags)
	if (!TAILQ_EMPTY(&kqwq->kqwq_queue[kqr->tr_kq_qos_index])) {
		/* remember we didn't process everything */
		kqr->tr_kq_wakeup = true;

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * if acknowledge events "succeeds" it means there are events,
		 * which is a failure condition for end_processing.
		 */
		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
		    KQWQAE_END_PROCESSING);
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkloop is set up again so that it is ready to be
 * processed (as if kqworkloop_begin_processing had just been called).
 *
 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
 * the kqworkloop is unbound from its servicer as a side effect.
 */
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_qos_t qos_override;
	thread_t thread = kqr_thread_fast(kqr);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
	    kqwl->kqwl_dynamicid, 0, 0);

	if (flags & KQ_PROCESSING) {
		assert(kq->kq_state & KQ_PROCESSING);

		/*
		 * If we still have queued stayactive knotes, remember we didn't finish
		 * processing all of them.  This should be extremely rare and would
		 * require to have a lot of them registered and fired.
		 */
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
			    KQWL_BUCKET_STAYACTIVE);

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
		 * still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because we're
		 * inside kqueue_process(), if the workloop actually received events
		 * while our locks were dropped, we have the opportunity to fail the end
		 * processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock, hence
		 * scales better.
		 */
		if (kevent_flags & KEVENT_FLAG_PARKING) {
			qos_override = kqworkloop_acknowledge_events(kqwl);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
		if (kqr->tr_kq_wakeup && !kqwl->kqwl_owner) {
			/*
			 * Reset wakeup bit to notice stay active events firing while we are
			 * processing, as we cannot rely on the stayactive bucket emptiness.
			 */
			kqwl->kqwl_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
			kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
			kqworkloop_release_live(kqwl);
			kq->kq_state &= ~flags;
		kq->kq_state &= ~flags;
		kq->kq_state |= KQ_R2K_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);

	if ((kevent_flags & KEVENT_FLAG_PARKING) && rc == 0) {
		kqworkloop_unbind_delayed_override_drop(thread);

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
	    kqwl->kqwl_dynamicid, 0, 0);
4562 * Called with kqueue lock held.
4565 * -1: has more events
4566 * EBADF: kqueue is in draining mode
4569 kqfile_end_processing(struct kqfile
*kq
)
4571 struct kqtailq
*suppressq
= &kq
->kqf_suppressed
;
4577 assert((kq
->kqf_state
& (KQ_WORKQ
| KQ_WORKLOOP
)) == 0);
4579 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END
),
4580 VM_KERNEL_UNSLIDE_OR_PERM(kq
), 0);
4583 * Return suppressed knotes to their original state.
4585 while ((kn
= TAILQ_FIRST(suppressq
)) != NULL
) {
4586 assert(kn
->kn_status
& KN_SUPPRESSED
);
4587 knote_unsuppress(kq
, kn
);
4590 procwait
= (kq
->kqf_state
& KQ_PROCWAIT
);
4591 kq
->kqf_state
&= ~(KQ_PROCESSING
| KQ_PROCWAIT
);
4594 /* first wake up any thread already waiting to process */
4595 waitq_wakeup64_all((struct waitq
*)&kq
->kqf_wqs
,
4596 CAST_EVENT64_T(suppressq
), THREAD_AWAKENED
, WAITQ_ALL_PRIORITIES
);
4599 if (kq
->kqf_state
& KQ_DRAIN
) {
4602 return (kq
->kqf_state
& KQ_WAKEUP
) ? -1 : 0;
static int
kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
    struct kqueue_workloop_params *params, int *retval)
{
	int error = 0;
	struct kqworkloop *kqwl;
	struct filedesc *fdp = p->p_fd;
	workq_threadreq_param_t trp = { };

	switch (cmd) {
	case KQ_WORKLOOP_CREATE:
		if (!params->kqwlp_flags) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
		    (params->kqwlp_sched_pri < 1 ||
		    params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
		    invalid_policy(params->kqwlp_sched_pol)) {
			error = EINVAL;
			break;
		}

		if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
		    (params->kqwlp_cpu_percent <= 0 ||
		    params->kqwlp_cpu_percent > 100 ||
		    params->kqwlp_cpu_refillms <= 0 ||
		    params->kqwlp_cpu_refillms > 0x00ffffff)) {
			error = EINVAL;
			break;
		}

		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
			trp.trp_flags |= TRP_PRIORITY;
			trp.trp_pri = (uint8_t)params->kqwlp_sched_pri;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
			trp.trp_flags |= TRP_POLICY;
			trp.trp_pol = (uint8_t)params->kqwlp_sched_pol;
		}
		if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
			trp.trp_flags |= TRP_CPUPERCENT;
			trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
			trp.trp_refillms = params->kqwlp_cpu_refillms;
		}

		error = kqworkloop_get_or_create(p, params->kqwlp_id, &trp,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &kqwl);
		if (error) {
			break;
		}

		if (!(fdp->fd_flags & FD_WORKLOOP)) {
			/* FD_WORKLOOP indicates we've ever created a workloop
			 * via this syscall but its only ever added to a process, never
			 * removed.
			 */
			fdp->fd_flags |= FD_WORKLOOP;
		}
		break;
	case KQ_WORKLOOP_DESTROY:
		error = kqworkloop_get_or_create(p, params->kqwlp_id, NULL,
		    KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
		    KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &kqwl);
		if (error) {
			break;
		}
		trp.trp_value = kqwl->kqwl_params;
		if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
			trp.trp_flags |= TRP_RELEASED;
			kqwl->kqwl_params = trp.trp_value;
			kqworkloop_release_live(kqwl);
		} else {
			error = EINVAL;
		}
		kqworkloop_release(kqwl);
		break;
	}
	return error;
}
int
kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
{
	struct kqueue_workloop_params params = {
		.kqwlp_id = 0,
	};
	if (uap->sz < sizeof(params.kqwlp_version)) {
		return EINVAL;
	}

	size_t copyin_sz = MIN(sizeof(params), uap->sz);
	int rv = copyin(uap->addr, &params, copyin_sz);
	if (rv) {
		return rv;
	}

	if (params.kqwlp_version != (int)uap->sz) {
		return EINVAL;
	}

	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
	    retval);
}
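/*
 * Illustrative sketch (not part of the original file): the shape of a
 * KQ_WORKLOOP_CREATE request that passes the validation above.  The values
 * are made up; this syscall is normally reached through libpthread /
 * libdispatch rather than called directly.
 *
 *	struct kqueue_workloop_params p = {
 *		.kqwlp_version   = sizeof(p),      // must equal the size argument
 *		.kqwlp_id        = 0x1234,         // caller-chosen dynamic kq id
 *		.kqwlp_flags     = KQ_WORKLOOP_CREATE_SCHED_PRI,
 *		.kqwlp_sched_pri = 31,             // must be within 1..63
 *	};
 *	// then issue the kqueue_workloop_ctl syscall with
 *	// cmd == KQ_WORKLOOP_CREATE, addr == &p, sz == sizeof(p).
 */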
static int
kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    __unused vfs_context_t ctx)
{
	struct kqfile *kq = (struct kqfile *)fp->f_data;
	struct kqtailq *suppressq = &kq->kqf_suppressed;
	struct kqtailq *queue = &kq->kqf_queue;
	struct knote *kn;
	int retnum = 0;

	if (which != FREAD) {
		return 0;
	}

	kqlock(kq);

	assert((kq->kqf_state & KQ_WORKQ) == 0);

	/*
	 * If this is the first pass, link the wait queue associated with the
	 * the kqueue onto the wait queue set for the select().  Normally we
	 * use selrecord() for this, but it uses the wait queue within the
	 * selinfo structure and we need to use the main one for the kqueue to
	 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
	 * (The select() call will unlink them when it ends).
	 */
	if (wq_link_id != NULL) {
		thread_t cur_act = current_thread();
		struct uthread * ut = get_bsdthread_info(cur_act);

		kq->kqf_state |= KQ_SEL;
		waitq_link((struct waitq *)&kq->kqf_wqs, ut->uu_wqset,
		    WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);

		/* always consume the reserved link object */
		waitq_link_release(*(uint64_t *)wq_link_id);
		*(uint64_t *)wq_link_id = 0;

		/*
		 * selprocess() is expecting that we send it back the waitq
		 * that was just added to the thread's waitq set. In order
		 * to not change the selrecord() API (which is exported to
		 * kexts), we pass this value back through the
		 * void *wq_link_id pointer we were passed. We need to use
		 * memcpy here because the pointer may not be properly aligned
		 * on 32-bit systems.
		 */
		void *wqptr = &kq->kqf_wqs;
		memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
	}

	if (kqfile_begin_processing(kq) == -1) {
		kqunlock(kq);
		return 0;
	}

	if (!TAILQ_EMPTY(queue)) {
		/*
		 * there is something queued - but it might be a
		 * KN_STAYACTIVE knote, which may or may not have
		 * any events pending.  Otherwise, we have to walk
		 * the list of knotes to see, and peek at the
		 * (non-vanished) stay-active ones to be really sure.
		 */
		while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
			if (kn->kn_status & KN_ACTIVE) {
				retnum = 1;
				goto out;
			}
			assert(kn->kn_status & KN_STAYACTIVE);
			knote_suppress(kq, kn);
		}

		/*
		 * There were no regular events on the queue, so take
		 * a deeper look at the stay-queued ones we suppressed.
		 */
		while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
			KNOTE_LOCK_CTX(knlc);
			int result = 0;

			/* If didn't vanish while suppressed - peek at it */
			if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
			    KNOTE_KQ_LOCK_ON_FAILURE)) {
				continue;
			}

			result = filter_call(knote_fops(kn), f_peek(kn));

			kqlock(kq);
			knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);

			/* unsuppress it */
			knote_unsuppress(kq, kn);

			/* has data or it has to report a vanish */
			if (result & FILTER_ACTIVE) {
				retnum = 1;
				goto out;
			}
		}
	}

out:
	kqfile_end_processing(kq);
	kqunlock(kq);
	return retnum;
}

static int
kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)fg->fg_data;

	assert((kqf->kqf_state & KQ_WORKQ) == 0);
	kqueue_dealloc(&kqf->kqf_kqueue);
	fg->fg_data = NULL;
	return 0;
}
/*
 * Max depth of the nested kq path that can be created.
 * Note that this has to be less than the size of kq_level
 * to avoid wrapping around and mislabeling the level.
 */
#define MAX_NESTED_KQ 1000

/*
 * The caller has taken a use-count reference on this kqueue and will donate it
 * to the kqueue we are being added to.  This keeps the kqueue from closing until
 * that relationship is torn down.
 */
static int
kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    __unused struct kevent_qos_s *kev)
{
	struct kqfile *kqf = (struct kqfile *)fp->f_data;
	struct kqueue *kq = &kqf->kqf_kqueue;
	struct kqueue *parentkq = knote_get_kq(kn);

	assert((kqf->kqf_state & KQ_WORKQ) == 0);

	if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * We have to avoid creating a cycle when nesting kqueues
	 * inside another.  Rather than trying to walk the whole
	 * potential DAG of nested kqueues, we just use a simple
	 * ceiling protocol.  When a kqueue is inserted into another,
	 * we check that the (future) parent is not already nested
	 * into another kqueue at a lower level than the potential
	 * child (because it could indicate a cycle).  If that test
	 * passes, we just mark the nesting levels accordingly.
	 *
	 * Only up to MAX_NESTED_KQ can be nested.
	 *
	 * Note: kqworkq and kqworkloop cannot be nested and have reused their
	 *       kq_level field, so ignore these as parent.
	 */

	kqlock(parentkq);

	if ((parentkq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0) {
		if (parentkq->kq_level > 0 &&
		    parentkq->kq_level < kq->kq_level) {
			kqunlock(parentkq);
			knote_set_error(kn, EINVAL);
			return 0;
		}

		/* set parent level appropriately */
		uint16_t plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
		if (plevel < kq->kq_level + 1) {
			if (kq->kq_level + 1 > MAX_NESTED_KQ) {
				kqunlock(parentkq);
				knote_set_error(kn, EINVAL);
				return 0;
			}
			plevel = kq->kq_level + 1;
		}

		parentkq->kq_level = plevel;
	}

	kqunlock(parentkq);

	kn->kn_filtid = EVFILTID_KQREAD;
	kqlock(kq);
	KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
	/* indicate nesting in child, if needed */
	if (kq->kq_level == 0) {
		kq->kq_level = 1;
	}

	int count = kq->kq_count;
	kqunlock(kq);
	return count > 0;
}
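/*
 * Worked example of the ceiling protocol above (illustrative, with made-up
 * levels): suppose the child kqueue C has kq_level == 3 and the prospective
 * parent P has kq_level == 2.  Since P->kq_level (2) is > 0 and < C->kq_level
 * (3), inserting C into P is refused with EINVAL, because it could indicate a
 * cycle.  If instead P had kq_level == 0 (never nested), plevel starts at 2
 * and is raised to C->kq_level + 1 == 4, provided that stays at or below
 * MAX_NESTED_KQ.
 */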
/*
 * kqueue_drain - called when kq is closed
 */
static int
kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)fp->fp_glob->fg_data;

	assert((kqf->kqf_state & KQ_WORKQ) == 0);

	kqlock(kqf);
	kqf->kqf_state |= KQ_DRAIN;

	/* wakeup sleeping threads */
	if ((kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) != 0) {
		kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
		(void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
		    KQ_EVENT,
		    THREAD_RESTART,
		    WAITQ_ALL_PRIORITIES);
	}

	/* wakeup threads waiting their turn to process */
	if (kqf->kqf_state & KQ_PROCWAIT) {
		assert(kqf->kqf_state & KQ_PROCESSING);

		kqf->kqf_state &= ~KQ_PROCWAIT;
		(void)waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs,
		    CAST_EVENT64_T(&kqf->kqf_suppressed),
		    THREAD_RESTART, WAITQ_ALL_PRIORITIES);
	}

	kqunlock(kqf);
	return 0;
}

static int
kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
{
	assert((kq->kq_state & KQ_WORKQ) == 0);

	kqlock(kq);
	if (isstat64 != 0) {
		struct stat64 *sb64 = (struct stat64 *)ub;

		bzero((void *)sb64, sizeof(*sb64));
		sb64->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS) {
			sb64->st_blksize = sizeof(struct kevent_qos_s);
		} else if (kq->kq_state & KQ_KEV64) {
			sb64->st_blksize = sizeof(struct kevent64_s);
		} else if (IS_64BIT_PROCESS(p)) {
			sb64->st_blksize = sizeof(struct user64_kevent);
		} else {
			sb64->st_blksize = sizeof(struct user32_kevent);
		}
		sb64->st_mode = S_IFIFO;
	} else {
		struct stat *sb = (struct stat *)ub;

		bzero((void *)sb, sizeof(*sb));
		sb->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS) {
			sb->st_blksize = sizeof(struct kevent_qos_s);
		} else if (kq->kq_state & KQ_KEV64) {
			sb->st_blksize = sizeof(struct kevent64_s);
		} else if (IS_64BIT_PROCESS(p)) {
			sb->st_blksize = sizeof(struct user64_kevent);
		} else {
			sb->st_blksize = sizeof(struct user32_kevent);
		}
		sb->st_mode = S_IFIFO;
	}
	kqunlock(kq);
	return 0;
}
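/*
 * Illustrative note (not part of the original file): for a kqueue file
 * descriptor, fstat() therefore reports the number of queued events in
 * st_size and the size of one kevent record in st_blksize, e.g.:
 *
 *	struct stat sb;
 *	if (fstat(kqfd, &sb) == 0) {
 *		// sb.st_size    == number of pending events (kq_count)
 *		// sb.st_blksize == sizeof(struct kevent_qos_s), kevent64_s, ...
 *		// sb.st_mode    has S_IFIFO set
 *	}
 */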
static bool
kqueue_threadreq_can_use_ast(struct kqueue *kq)
{
	if (current_proc() == kq->kq_p) {
		/*
		 * Setting an AST from a non BSD syscall is unsafe: mach_msg_trap() can
		 * do combined send/receive and in the case of self-IPC, the AST may be
		 * set on a thread that will not return to userspace and needs the
		 * thread the AST would create to unblock itself.
		 *
		 * At this time, we really want to target:
		 *
		 * - kevent variants that can cause thread creations, and dispatch
		 *   really only uses kevent_qos and kevent_id,
		 *
		 * - workq_kernreturn (directly about thread creations)
		 *
		 * - bsdthread_ctl which is used for qos changes and has direct impact
		 *   on the creator thread scheduling decisions.
		 */
		switch (current_uthread()->syscall_code) {
		case SYS_kevent_qos:
		case SYS_kevent_id:
		case SYS_workq_kernreturn:
		case SYS_bsdthread_ctl:
			return true;
		}
	}
	return false;
}

/*
 * Interact with the pthread kext to request a servicing there at a specific QoS
 * level.
 *
 * - Caller holds the workq request lock
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
void
kqueue_threadreq_initiate(struct kqueue *kq, workq_threadreq_t kqr,
    kq_index_t qos, int flags)
{
	assert(kqr->tr_kq_wakeup);
	assert(kqr_thread(kqr) == THREAD_NULL);
	assert(!kqr_thread_requested(kqr));
	struct turnstile *ts = TURNSTILE_NULL;

	if (workq_is_exiting(kq->kq_p)) {
		return;
	}

	kqlock_held(kq);

	if (kq->kq_state & KQ_WORKLOOP) {
		__assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		assert(kqwl->kqwl_owner == THREAD_NULL);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
		    kqwl->kqwl_dynamicid, 0, qos, kqr->tr_kq_wakeup);
		ts = kqwl->kqwl_turnstile;
		/* Add a thread request reference on the kqueue. */
		kqworkloop_retain(kqwl);
	} else {
		assert(kq->kq_state & KQ_WORKQ);
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
		    -1, 0, qos, kqr->tr_kq_wakeup);
	}

	/*
	 * New-style thread request supported.
	 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
	 * its use until a corresponding kqueue_threadreq_bind callback.
	 */
	if (kqueue_threadreq_can_use_ast(kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	if (qos == KQWQ_QOS_MANAGER) {
		qos = WORKQ_THREAD_QOS_MANAGER;
	}
	if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
		/*
		 * Process is shutting down or exec'ing.
		 * All the kqueues are going to be cleaned up
		 * soon. Forget we even asked for a thread -
		 * and make sure we don't ask for more.
		 */
		kq->kq_state &= ~KQ_R2K_ARMED;
		kqueue_release_live(kq);
	}
}
/*
 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
 *
 * This is used when kqueue_threadreq_bind may cause a lock inversion.
 */
__attribute__((always_inline))
void
kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t kqr,
    struct uthread *ut)
{
	ut->uu_kqr_bound = kqr;
	kqr->tr_thread = ut->uu_thread;
	kqr->tr_state = WORKQ_TR_STATE_BINDING;
}

/*
 * kqueue_threadreq_bind_commit - commit a bind prepost
 *
 * The workq code has to commit any binding prepost before the thread has
 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
 */
void
kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = ut->uu_kqr_bound;
	kqueue_t kqu = kqr_kqueue(p, kqr);

	kqlock(kqu);
	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
		kqueue_threadreq_bind(p, kqr, thread, 0);
	}
	kqunlock(kqu);
}
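/*
 * Illustrative note (not part of the original file): the two-phase bind used
 * by the workqueue subsystem.  The prepost publishes WORKQ_TR_STATE_BINDING
 * while a lock inversion prevents binding directly; the commit then finishes
 * the bind with the kqueue lock held, before the thread can reach userspace:
 *
 *	kqueue_threadreq_bind_prepost(p, kqr, ut);   // tr_state = BINDING
 *	// ... locks released/reacquired in the required order ...
 *	kqueue_threadreq_bind_commit(p, thread);     // tr_state becomes BOUND
 */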
void
kqueue_threadreq_modify(kqueue_t kqu, workq_threadreq_t kqr, kq_index_t qos,
    workq_kern_threadreq_flags_t flags)
{
	assert(kqr_thread_requested_pending(kqr));

	kqlock_held(kqu);

	if (kqueue_threadreq_can_use_ast(kqu.kq)) {
		flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
	}
	workq_kern_threadreq_modify(kqu.kq->kq_p, kqr, qos, flags);
}
/*
 * kqueue_threadreq_bind - bind thread to processing kqrequest
 *
 * The provided thread will be responsible for delivering events
 * associated with the given kqrequest.  Bind it and get ready for
 * the thread to eventually arrive.
 */
void
kqueue_threadreq_bind(struct proc *p, workq_threadreq_t kqr, thread_t thread,
    unsigned int flags)
{
	kqueue_t kqu = kqr_kqueue(p, kqr);
	struct uthread *ut = get_bsdthread_info(thread);

	kqlock_held(kqu);

	assert(ut->uu_kqueue_override == 0);

	if (kqr->tr_state == WORKQ_TR_STATE_BINDING) {
		assert(ut->uu_kqr_bound == kqr);
		assert(kqr->tr_thread == thread);
	} else {
		assert(kqr_thread_requested_pending(kqr));
		assert(kqr->tr_thread == THREAD_NULL);
		assert(ut->uu_kqr_bound == NULL);
		ut->uu_kqr_bound = kqr;
		kqr->tr_thread = thread;
	}

	kqr->tr_state = WORKQ_TR_STATE_BOUND;

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		struct turnstile *ts = kqu.kqwl->kqwl_turnstile;

		if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
			/*
			 * <rdar://problem/38626999> shows that asserting here is not ok.
			 *
			 * This is not supposed to happen for correct use of the interface,
			 * but it is sadly possible for userspace (with the help of memory
			 * corruption, such as over-release of a dispatch queue) to make
			 * the creator thread the "owner" of a workloop.
			 *
			 * Once that happens, and that creator thread picks up the same
			 * workloop as a servicer, we trip this codepath. We need to fixup
			 * the state to forget about this thread being the owner, as the
			 * entire workloop state machine expects servicers to never be
			 * owners and everything would basically go downhill from here.
			 */
			kqu.kqwl->kqwl_owner = THREAD_NULL;
			if (kqworkloop_override(kqu.kqwl)) {
				thread_drop_kevent_override(thread);
			}
		}

		if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
			/*
			 * Past this point, the interlock is the kq req lock again,
			 * so we can fix the inheritor for good.
			 */
			filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
		}

		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
		    thread_tid(thread), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);

		ut->uu_kqueue_override = kqr->tr_kq_override_index;
		if (kqr->tr_kq_override_index) {
			thread_add_servicer_override(thread, kqr->tr_kq_override_index);
		}
	} else {
		assert(kqr->tr_kq_override_index == 0);

		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
		    thread_tid(thread), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
	}
}
/*
 * kqueue_threadreq_cancel - abort a pending thread request
 *
 * Called when exiting/exec'ing. Forget our pending request.
 */
void
kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t kqr)
{
	kqueue_release(kqr_kqueue(p, kqr));
}

workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t kqr)
{
	struct kqworkloop *kqwl;
	workq_threadreq_param_t trp;

	assert(kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP);
	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
	trp.trp_value = kqwl->kqwl_params;
	return trp;
}

/*
 * kqueue_threadreq_unbind - unbind thread from processing kqueue
 *
 * End processing the per-QoS bucket of events and allow other threads
 * to be requested for future servicing.
 *
 * caller holds a reference on the kqueue.
 */
void
kqueue_threadreq_unbind(struct proc *p, workq_threadreq_t kqr)
{
	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqworkloop_unbind(kqr_kqworkloop(kqr));
	} else {
		kqworkq_unbind(p, kqr);
	}
}
/*
 * If we aren't already busy processing events [for this QoS],
 * request workq thread support as appropriate.
 *
 * TBD - for now, we don't segregate out processing by QoS.
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqworkq_wakeup(struct kqworkq *kqwq, kq_index_t qos_index)
{
	workq_threadreq_t kqr = kqworkq_get_request(kqwq, qos_index);

	/* convert to thread qos value */
	assert(qos_index < KQWQ_NBUCKETS);

	if (!kqr->tr_kq_wakeup) {
		kqr->tr_kq_wakeup = true;
		if (!kqr_thread_requested(kqr)) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
		}
	}
}

/*
 * This represents the asynchronous QoS a given workloop contributes,
 * hence is the max of the current active knotes (override index)
 * and the workloop max qos (userspace async qos).
 */
static kq_index_t
kqworkloop_override(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	return MAX(kqr->tr_kq_qos_index, kqr->tr_kq_override_index);
}

static void
kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	kqlock_held(kqwl);

	if (kqwl->kqwl_state & KQ_R2K_ARMED) {
		kqwl->kqwl_state &= ~KQ_R2K_ARMED;
		act_set_astkevent(kqr_thread_fast(kqr), AST_KEVENT_RETURN_TO_KERNEL);
	}
}
static void
kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	kq_index_t old_override = kqworkloop_override(kqwl);
	kq_index_t i;

	kqlock_held(kqwl);

	switch (op) {
	case KQWL_UTQ_UPDATE_WAKEUP_QOS:
		if (qos == KQWL_BUCKET_STAYACTIVE) {
			/*
			 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
			 * a high watermark (kqwl_stayactive_qos) of any stay active knote
			 * that was ever registered with this workloop.
			 *
			 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
			 * knote, we use this high-watermark as a wakeup-index, and also set
			 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
			 * there is at least one stay active knote fired until the next full
			 * processing of this bucket.
			 */
			kqwl->kqwl_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
			qos = kqwl->kqwl_stayactive_qos;
			assert(qos);
		}
		if (kqwl->kqwl_wakeup_indexes & (1 << qos)) {
			assert(kqr->tr_kq_wakeup);
			break;
		}

		kqwl->kqwl_wakeup_indexes |= (1 << qos);
		kqr->tr_kq_wakeup = true;
		kqworkloop_request_fire_r2k_notification(kqwl);
		goto recompute;

	case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
		assert(qos);
		if (kqwl->kqwl_stayactive_qos < qos) {
			kqwl->kqwl_stayactive_qos = qos;
			if (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
				assert(kqr->tr_kq_wakeup);
				kqwl->kqwl_wakeup_indexes |= (1 << qos);
				goto recompute;
			}
		}
		break;

	case KQWL_UTQ_PARKING:
	case KQWL_UTQ_UNBINDING:
		kqr->tr_kq_override_index = qos;
	/* FALLTHROUGH */
	case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
		if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
			assert(qos == THREAD_QOS_UNSPECIFIED);
		}
		i = KQWL_BUCKET_STAYACTIVE;
		if (TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
			kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
		}
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
		    (kqwl->kqwl_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
			/*
			 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
			 * knote may have fired, so we need to merge in kqwl_stayactive_qos.
			 *
			 * Unlike other buckets, this one is never empty but could be idle.
			 */
			kqwl->kqwl_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
			kqwl->kqwl_wakeup_indexes |= (1 << kqwl->kqwl_stayactive_qos);
		} else {
			kqwl->kqwl_wakeup_indexes = 0;
		}
		for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
			if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
				kqwl->kqwl_wakeup_indexes |= (1 << i);
			}
		}
		if (kqwl->kqwl_wakeup_indexes) {
			kqr->tr_kq_wakeup = true;
			kqworkloop_request_fire_r2k_notification(kqwl);
		} else {
			kqr->tr_kq_wakeup = false;
		}
		goto recompute;

	case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
		kqr->tr_kq_override_index = qos;
		goto recompute;

	case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
recompute:
		/*
		 * When modifying the wakeup QoS or the override QoS, we always need to
		 * maintain our invariant that kqr_override_index is at least as large
		 * as the highest QoS for which an event is fired.
		 *
		 * However this override index can be larger when there is an overriden
		 * suppressed knote pushing on the kqueue.
		 */
		if (kqwl->kqwl_wakeup_indexes > (1 << qos)) {
			qos = (uint8_t)(fls(kqwl->kqwl_wakeup_indexes) - 1); /* fls is 1-based */
		}
		if (kqr->tr_kq_override_index < qos) {
			kqr->tr_kq_override_index = qos;
		}
		break;

	case KQWL_UTQ_REDRIVE_EVENTS:
		break;

	case KQWL_UTQ_SET_QOS_INDEX:
		kqr->tr_kq_qos_index = qos;
		break;

	default:
		panic("unknown kqwl thread qos update operation: %d", op);
	}

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr_thread(kqr);
	boolean_t qos_changed = FALSE;
	kq_index_t new_override = kqworkloop_override(kqwl);

	/*
	 * Apply the diffs to the owner if applicable
	 */
	if (kqwl_owner) {
#if 0
		/* JMM - need new trace hooks for owner overrides */
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
		    kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
#endif
		if (new_override == old_override) {
			// nothing to do
		} else if (old_override == THREAD_QOS_UNSPECIFIED) {
			thread_add_kevent_override(kqwl_owner, new_override);
		} else if (new_override == THREAD_QOS_UNSPECIFIED) {
			thread_drop_kevent_override(kqwl_owner);
		} else { /* old_override != new_override */
			thread_update_kevent_override(kqwl_owner, new_override);
		}
	}

	/*
	 * apply the diffs to the servicer
	 */
	if (!kqr_thread_requested(kqr)) {
		/*
		 * No servicer, nor thread-request
		 *
		 * Make a new thread request, unless there is an owner (or the workloop
		 * is suspended in userland) or if there is no asynchronous work in the
		 * first place.
		 */

		if (kqwl_owner == NULL && kqr->tr_kq_wakeup) {
			int initiate_flags = 0;
			if (op == KQWL_UTQ_UNBINDING) {
				initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
			}
			kqueue_threadreq_initiate(kq, kqr, new_override, initiate_flags);
		}
	} else if (servicer) {
		/*
		 * Servicer in flight
		 *
		 * Just apply the diff to the servicer
		 */
		struct uthread *ut = get_bsdthread_info(servicer);
		if (ut->uu_kqueue_override != new_override) {
			if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
				thread_add_servicer_override(servicer, new_override);
			} else if (new_override == THREAD_QOS_UNSPECIFIED) {
				thread_drop_servicer_override(servicer);
			} else { /* ut->uu_kqueue_override != new_override */
				thread_update_servicer_override(servicer, new_override);
			}
			ut->uu_kqueue_override = new_override;
			qos_changed = TRUE;
		}
	} else if (new_override == THREAD_QOS_UNSPECIFIED) {
		/*
		 * No events to deliver anymore.
		 *
		 * However canceling with turnstiles is challenging, so the fact that
		 * the request isn't useful will be discovered by the servicer himself
		 * later on.
		 */
	} else if (old_override != new_override) {
		/*
		 * Request is in flight
		 *
		 * Apply the diff to the thread request
		 */
		kqueue_threadreq_modify(kq, kqr, new_override, WORKQ_THREADREQ_NONE);
		qos_changed = TRUE;
	}

	if (qos_changed) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
		    thread_tid(servicer), kqr->tr_kq_qos_index,
		    (kqr->tr_kq_override_index << 16) | kqr->tr_kq_wakeup);
	}
}
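/*
 * Worked example of the wakeup-index math above (illustrative): with
 * kqwl_wakeup_indexes == (1 << THREAD_QOS_LEGACY) | (1 << THREAD_QOS_USER_INITIATED),
 * the highest set bit wins, so fls() - 1 yields THREAD_QOS_USER_INITIATED and
 * tr_kq_override_index is raised to at least that value.  The separate
 * KQWL_STAYACTIVE_FIRED_BIT only records that some stay-active knote fired;
 * it is folded back in as (1 << kqwl_stayactive_qos) during
 * KQWL_UTQ_RECOMPUTE_WAKEUP_QOS / parking.
 */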
static void
kqworkloop_wakeup(struct kqworkloop *kqwl, kq_index_t qos)
{
	if ((kqwl->kqwl_state & KQ_PROCESSING) &&
	    kqr_thread(&kqwl->kqwl_request) == current_thread()) {
		/*
		 * kqworkloop_end_processing() will perform the required QoS
		 * computations when it unsets the processing mode.
		 */
		return;
	}

	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos);
}

static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
{
	if (kq.kq->kq_state & KQ_WORKLOOP) {
		return &kq.kqwl->kqwl_suppressed;
	} else if (kq.kq->kq_state & KQ_WORKQ) {
		return &kq.kqwq->kqwq_suppressed[kn->kn_qos_index];
	} else {
		return &kq.kqf->kqf_suppressed;
	}
}
struct turnstile *
kqueue_alloc_turnstile(kqueue_t kqu)
{
	struct kqworkloop *kqwl = kqu.kqwl;
	kq_state_t kq_state;

	kq_state = os_atomic_load(&kqu.kq->kq_state, dependency);
	if (kq_state & KQ_HAS_TURNSTILE) {
		/* force a dependency to pair with the atomic or with release below */
		return os_atomic_load_with_dependency_on(&kqwl->kqwl_turnstile,
		           (uintptr_t)kq_state);
	}

	if (!(kq_state & KQ_WORKLOOP)) {
		return TURNSTILE_NULL;
	}

	struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
	bool workq_locked = false;

	kqlock(kqu);

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_locked = true;
		workq_kern_threadreq_lock(kqwl->kqwl_p);
	}

	if (kqwl->kqwl_state & KQ_HAS_TURNSTILE) {
		free_ts = ts;
		ts = kqwl->kqwl_turnstile;
	} else {
		ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
		    ts, TURNSTILE_WORKLOOPS);

		/* release-barrier to pair with the unlocked load of kqwl_turnstile above */
		os_atomic_or(&kqwl->kqwl_state, KQ_HAS_TURNSTILE, release);

		if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
			    &kqwl->kqwl_request, kqwl->kqwl_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			/*
			 * The workq may no longer be the interlock after this.
			 * In which case the inheritor wasn't updated.
			 */
		}
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
		}
	}

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	kqunlock(kqu);

	if (free_ts) {
		turnstile_deallocate(free_ts);
	} else {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
	}
	return ts;
}
__attribute__((always_inline))
struct turnstile *
kqueue_turnstile(kqueue_t kqu)
{
	kq_state_t kq_state = os_atomic_load(&kqu.kq->kq_state, relaxed);
	if (kq_state & KQ_WORKLOOP) {
		return os_atomic_load(&kqu.kqwl->kqwl_turnstile, relaxed);
	}
	return TURNSTILE_NULL;
}

__attribute__((always_inline))
struct turnstile *
kqueue_threadreq_get_turnstile(workq_threadreq_t kqr)
{
	struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
	if (kqwl) {
		return os_atomic_load(&kqwl->kqwl_turnstile, relaxed);
	}
	return TURNSTILE_NULL;
}
static void
kqworkloop_set_overcommit(struct kqworkloop *kqwl)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	/*
	 * This test is racy, but since we never remove this bit,
	 * it allows us to avoid taking a lock.
	 */
	if (kqr->tr_flags & WORKQ_TR_FLAG_OVERCOMMIT) {
		return;
	}

	kqlock_held(kqwl);

	if (kqr_thread_requested_pending(kqr)) {
		kqueue_threadreq_modify(kqwl, kqr, kqr->tr_qos,
		    WORKQ_THREADREQ_MAKE_OVERCOMMIT);
	} else {
		kqr->tr_flags |= WORKQ_TR_FLAG_OVERCOMMIT;
	}
}

static void
kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
    kq_index_t override_index)
{
	workq_threadreq_t kqr;
	kq_index_t old_override_index;
	kq_index_t queue_index = kn->kn_qos_index;

	if (override_index <= queue_index) {
		return;
	}

	kqr = kqworkq_get_request(kqwq, queue_index);

	kqlock_held(kqwq);

	old_override_index = kqr->tr_kq_override_index;
	if (override_index > MAX(kqr->tr_kq_qos_index, old_override_index)) {
		thread_t servicer = kqr_thread(kqr);
		kqr->tr_kq_override_index = override_index;

		/* apply the override to [incoming?] servicing thread */
		if (servicer) {
			if (old_override_index) {
				thread_update_kevent_override(servicer, override_index);
			} else {
				thread_add_kevent_override(servicer, override_index);
			}
		}
	}
}

static void
kqueue_update_override(kqueue_t kqu, struct knote *kn, thread_qos_t qos)
{
	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		kqworkloop_update_threads_qos(kqu.kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
		    qos);
	} else {
		kqworkq_update_override(kqu.kqwq, kn, qos);
	}
}
static void
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread,
    enum kqwl_unbind_locked_mode how)
{
	struct uthread *ut = get_bsdthread_info(thread);
	workq_threadreq_t kqr = &kqwl->kqwl_request;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
	    thread_tid(thread), 0, 0);

	kqlock_held(kqwl);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	if (how == KQWL_OVERRIDE_DROP_IMMEDIATELY &&
	    ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}

	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
		    TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
		    TURNSTILE_INTERLOCK_HELD);
	}

	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqwl->kqwl_state &= ~KQ_R2K_ARMED;
}

static void
kqworkloop_unbind_delayed_override_drop(thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	assert(ut->uu_kqr_bound == NULL);
	if (ut->uu_kqueue_override != THREAD_QOS_UNSPECIFIED) {
		thread_drop_servicer_override(thread);
		ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
	}
}
/*
 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
 *
 * It will acknowledge events, and possibly request a new thread if:
 * - there were active events left
 * - we pended waitq hook callouts during processing
 * - we pended wakeups while processing (or unsuppressing)
 *
 * Called with kqueue lock held.
 */
static void
kqworkloop_unbind(struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	thread_t thread = kqr_thread_fast(kqr);
	int op = KQWL_UTQ_PARKING;
	kq_index_t qos_override = THREAD_QOS_UNSPECIFIED;

	assert(thread == current_thread());

	kqlock(kqwl);

	/*
	 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
	 * unsuppressing knotes not to be applied until the eventual call to
	 * kqworkloop_update_threads_qos() below.
	 */
	assert((kq->kq_state & KQ_PROCESSING) == 0);
	if (!TAILQ_EMPTY(&kqwl->kqwl_suppressed)) {
		kq->kq_state |= KQ_PROCESSING;
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kq->kq_state &= ~KQ_PROCESSING;
	}

	kqworkloop_unbind_locked(kqwl, thread, KQWL_OVERRIDE_DROP_DELAYED);
	kqworkloop_update_threads_qos(kqwl, op, qos_override);

	kqunlock(kqwl);

	/*
	 * Drop the override on the current thread last, after the call to
	 * kqworkloop_update_threads_qos above.
	 */
	kqworkloop_unbind_delayed_override_drop(thread);

	/* If last reference, dealloc the workloop kq */
	kqworkloop_release(kqwl);
}
static thread_qos_t
kqworkq_unbind_locked(struct kqworkq *kqwq,
    workq_threadreq_t kqr, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	kq_index_t old_override = kqr->tr_kq_override_index;

	KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
	    thread_tid(kqr_thread(kqr)), kqr->tr_kq_qos_index, 0);

	kqlock_held(kqwq);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	kqr->tr_thread = THREAD_NULL;
	kqr->tr_state = WORKQ_TR_STATE_IDLE;
	kqr->tr_kq_override_index = THREAD_QOS_UNSPECIFIED;
	kqwq->kqwq_state &= ~KQ_R2K_ARMED;

	return old_override;
}

/*
 * kqworkq_unbind - unbind of a workq kqueue from a thread
 *
 * We may have to request new threads.
 * This can happen when there are no waiting processing threads and:
 * - there were active events we never got to (count > 0)
 * - we pended waitq hook callouts during processing
 * - we pended wakeups while processing (or unsuppressing)
 */
static void
kqworkq_unbind(proc_t p, workq_threadreq_t kqr)
{
	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
	__assert_only int rc;

	kqlock(kqwq);
	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
	assert(rc == -1);
	kqunlock(kqwq);
}

workq_threadreq_t
kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
{
	assert(qos_index < KQWQ_NBUCKETS);
	return &kqwq->kqwq_request[qos_index];
}
static void
knote_reset_priority(kqueue_t kqu, struct knote *kn, pthread_priority_t pp)
{
	kq_index_t qos = _pthread_priority_thread_qos(pp);

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
		pp = _pthread_priority_normalize(pp);
	} else if (kqu.kq->kq_state & KQ_WORKQ) {
		if (qos == THREAD_QOS_UNSPECIFIED) {
			/* On workqueues, outside of QoS means MANAGER */
			qos = KQWQ_QOS_MANAGER;
			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		} else {
			pp = _pthread_priority_normalize(pp);
		}
	} else {
		pp = _pthread_unspecified_priority();
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos = (int32_t)pp;

	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
		/* Never lower QoS when in "Merge" mode */
		kn->kn_qos_override = qos;
	}

	/* only adjust in-use qos index when not suppressed */
	if (kn->kn_status & KN_SUPPRESSED) {
		kqueue_update_override(kqu, kn, qos);
	} else if (kn->kn_qos_index != qos) {
		knote_dequeue(kqu, kn);
		kn->kn_qos_index = qos;
	}
}
static void
knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
{
	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;

	kqlock_held(kq);

	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
	assert(qos_index < THREAD_QOS_LAST);

	/*
	 * Early exit for knotes that should not change QoS
	 */
	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
		panic("filter %d cannot change QoS", kn->kn_filtid);
	} else if (__improbable(!knote_has_qos(kn))) {
		return;
	}

	/*
	 * knotes with the FALLBACK flag will only use their registration QoS if the
	 * incoming event has no QoS, else, the registration QoS acts as a floor.
	 */
	thread_qos_t req_qos = _pthread_priority_thread_qos_fast(kn->kn_qos);
	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			qos_index = req_qos;
		}
	} else {
		if (qos_index < req_qos) {
			qos_index = req_qos;
		}
	}
	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
		/* Never lower QoS when in "Merge" mode */
		return;
	}

	if ((kn->kn_status & KN_LOCKED) && (kn->kn_status & KN_POSTING)) {
		/*
		 * When we're trying to update the QoS override and that both an
		 * f_event() and other f_* calls are running concurrently, any of these
		 * in flight calls may want to perform overrides that aren't properly
		 * serialized with each other.
		 *
		 * The first update that observes this racy situation enters a "Merge"
		 * mode which causes subsequent override requests to saturate the
		 * override instead of replacing its value.
		 *
		 * This mode is left when knote_unlock() or knote_post()
		 * observe that no other f_* routine is in flight.
		 */
		kn->kn_status |= KN_MERGE_QOS;
	}

	/*
	 * Now apply the override if it changed.
	 */
	if (kn->kn_qos_override == qos_index) {
		return;
	}

	kn->kn_qos_override = qos_index;

	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * For suppressed events, the kn_qos_index field cannot be touched as it
		 * allows us to know on which suppress queue the knote is for a kqworkq.
		 *
		 * Also, there's no natural push applied on the kqueues when this field
		 * changes anyway. We hence need to apply manual overrides in this case,
		 * which will be cleared when the events are later acknowledged.
		 */
		kqueue_update_override(kq, kn, qos_index);
	} else if (kn->kn_qos_index != qos_index) {
		knote_dequeue(kq, kn);
		kn->kn_qos_index = qos_index;
	}
}
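/*
 * Illustrative sketch (not part of the original file) of the QoS resolution
 * rule above, written as a hypothetical pure helper:
 *
 *	static thread_qos_t
 *	resolved_event_qos(thread_qos_t event_qos, thread_qos_t req_qos, bool fallback)
 *	{
 *		if (fallback) {
 *			// registration QoS only used when the event carries none
 *			return event_qos ? event_qos : req_qos;
 *		}
 *		// otherwise the registration QoS acts as a floor
 *		return MAX(event_qos, req_qos);
 *	}
 */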
/*
 * Called back from waitq code when no threads waiting and the hook was set.
 *
 * Preemption is disabled - minimal work can be done in this context!!!
 */
void
waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *kq_hook)
{
	kqueue_t kqu;

	kqu.kq = __container_of(kq_hook, struct kqueue, kq_waitq_hook);
	assert(kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));

	kqlock(kqu);

	if (kqu.kq->kq_count > 0) {
		if (kqu.kq->kq_state & KQ_WORKLOOP) {
			kqworkloop_wakeup(kqu.kqwl, KQWL_BUCKET_STAYACTIVE);
		} else {
			kqworkq_wakeup(kqu.kqwq, KQWQ_QOS_MANAGER);
		}
	}

	kqunlock(kqu);
}

void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}

/*
 * Query/Post each knote in the object's list
 *
 * The object lock protects the list. It is assumed
 * that the filter/event routine for the object can
 * determine that the object is already locked (via
 * the hint) and not deadlock itself.
 *
 * The object lock should also hold off pending
 * detach/drop operations.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		knote_post(kn, hint);
	}
}

/*
 * attach a knote to the specified list.  Return true if this is the first entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_attach(struct klist *list, struct knote *kn)
{
	int ret = SLIST_EMPTY(list);
	SLIST_INSERT_HEAD(list, kn, kn_selnext);
	return ret;
}

/*
 * detach a knote from the specified list.  Return true if that was the last entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_detach(struct klist *list, struct knote *kn)
{
	SLIST_REMOVE(list, kn, knote, kn_selnext);
	return SLIST_EMPTY(list);
}
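/*
 * Illustrative sketch (not part of the original file): the usual pattern for
 * an event source using these primitives.  `my_softc` and its lock are
 * hypothetical; the source keeps a struct klist, attaches knotes from its
 * filter's f_attach, and posts with KNOTE()/knote() when its state changes.
 *
 *	struct my_softc {
 *		lck_mtx_t    sc_lock;
 *		struct klist sc_note;      // klist_init(&sc->sc_note);
 *	};
 *
 *	// in f_attach:   first = knote_attach(&sc->sc_note, kn);
 *	// in f_detach:   last  = knote_detach(&sc->sc_note, kn);
 *	// on state change, with sc_lock held:
 *	//                 KNOTE(&sc->sc_note, hint);
 */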
/*
 * knote_vanish - Indicate that the source has vanished
 *
 * If the knote has requested EV_VANISHED delivery,
 * arrange for that. Otherwise, deliver a NOTE_REVOKE
 * event for backward compatibility.
 *
 * The knote is marked as having vanished, but is not
 * actually detached from the source in this instance.
 * The actual detach is deferred until the knote drop.
 *
 * Our caller already has the object lock held. Calling
 * the detach routine would try to take that lock
 * recursively - which likely is not supported.
 */
void
knote_vanish(struct klist *list, bool make_active)
{
	struct knote *kn;
	struct knote *kn_next;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);
		if (__probable(kn->kn_status & KN_REQVANISH)) {
			/*
			 * If EV_VANISH supported - prepare to deliver one
			 */
			kn->kn_status |= KN_VANISHED;
		} else {
			/*
			 * Handle the legacy way to indicate that the port/portset was
			 * deallocated or left the current Mach portspace (modern technique
			 * is with an EV_VANISHED protocol).
			 *
			 * Deliver an EV_EOF event for these changes (hopefully it will get
			 * delivered before the port name recycles to the same generation
			 * count and someone tries to re-register a kevent for it or the
			 * events are udata-specific - avoiding a conflict).
			 */
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
		}
		if (make_active) {
			knote_activate(kq, kn, FILTER_ACTIVE);
		}
		kqunlock(kq);
	}
}

/*
 * Force a lazy allocation of the waitqset link
 * of the kq_wqs associated with the kn
 * if it wasn't already allocated.
 *
 * This allows knote_link_waitq to never block
 * if reserved_link is not NULL.
 */
void
knote_link_waitqset_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	waitq_set_lazy_init_link(&kq->kq_wqs);
}

/*
 * Check if a lazy allocation for the waitqset link
 * of the kq_wqs is needed.
 */
boolean_t
knote_link_waitqset_should_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	return waitq_set_should_lazy_init_link(&kq->kq_wqs);
}
/*
 * For a given knote, link a provided wait queue directly with the kqueue.
 * Wakeups will happen via recursive wait queue support. But nothing will move
 * the knote to the active list at wakeup (nothing calls knote()). Instead,
 * we permanently enqueue them here.
 *
 * kqueue and knote references are held by caller.
 * waitq locked by caller.
 *
 * caller provides the wait queue link structure and ensures that the kq->kq_wqs
 * is linked by previously calling knote_link_waitqset_lazy_alloc.
 */
int
knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
	if (kr == KERN_SUCCESS) {
		knote_markstayactive(kn);
		return 0;
	} else {
		return EINVAL;
	}
}

/*
 * Unlink the provided wait queue from the kqueue associated with a knote.
 * Also remove it from the magic list of directly attached knotes.
 *
 * Note that the unlink may have already happened from the other side, so
 * ignore any failures to unlink and just remove it from the kqueue list.
 *
 * On success, caller is responsible for the link structure
 */
int
knote_unlink_waitq(struct knote *kn, struct waitq *wq)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_unlink(wq, &kq->kq_wqs);
	knote_clearstayactive(kn);
	return (kr != KERN_SUCCESS) ? EINVAL : 0;
}
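/*
 * Illustrative sketch (not part of the original file): the expected calling
 * sequence for the direct waitq linkage above, as used by stay-active sources
 * such as Mach ports.  `wq` is the source's wait queue and the ordering shown
 * here is schematic, not a verbatim copy of any particular filter:
 *
 *	uint64_t link = waitq_link_reserve(wq);
 *	if (knote_link_waitqset_should_lazy_alloc(kn)) {
 *		knote_link_waitqset_lazy_alloc(kn);    // may block, done before locking
 *	}
 *	// with wq locked by the caller:
 *	error = knote_link_waitq(kn, wq, &link);   // marks the knote stay-active
 *	waitq_link_release(link);                  // consume any unused reservation
 */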
/*
 * remove all knotes referencing a specified fd
 *
 * Entered with the proc_fd lock already held.
 * It returns the same way, but may drop it temporarily.
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct klist *list;
	struct knote *kn;
	KNOTE_LOCK_CTX(knlc);

restart:
	list = &p->p_fd->fd_knlist[fd];
	SLIST_FOREACH(kn, list, kn_link) {
		struct kqueue *kq = knote_get_kq(kn);

		kqlock(kq);

		if (kq->kq_p != p) {
			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
			    __func__, kq->kq_p, p);
		}

		/*
		 * If the knote supports EV_VANISHED delivery,
		 * transition it to vanished mode (or skip over
		 * it if already vanished).
		 */
		if (kn->kn_status & KN_VANISHED) {
			kqunlock(kq);
			continue;
		}

		proc_fdunlock(p);
		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
			/* the knote was dropped by someone, nothing to do */
		} else if (kn->kn_status & KN_REQVANISH) {
			kn->kn_status |= KN_VANISHED;

			kqunlock(kq);
			knote_fops(kn)->f_detach(kn);
			if (kn->kn_is_fd) {
				fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
			}
			kn->kn_filtid = EVFILTID_DETACHED;
			kqlock(kq);

			knote_activate(kq, kn, FILTER_ACTIVE);
			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
		} else {
			knote_drop(kq, kn, &knlc);
		}

		proc_fdlock(p);
		goto restart;
	}
}
/*
 * knote_fdfind - lookup a knote in the fd table for process
 *
 * If the filter is file-based, lookup based on fd index.
 * Otherwise use a hash based on the ident.
 *
 * Matching is based on kq, filter, and ident. Optionally,
 * it may also be based on the udata field in the kevent -
 * allowing multiple event registration for the file object
 * per kqueue.
 *
 * fd_knhashlock or fdlock held on entry (and exit)
 */
static struct knote *
knote_fdfind(struct kqueue *kq,
    const struct kevent_internal_s *kev,
    bool is_fd,
    struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	struct knote *kn = NULL;

	/*
	 * determine where to look for the knote
	 */
	if (is_fd) {
		/* fd-based knotes are linked off the fd table */
		if (kev->kei_ident < (u_int)fdp->fd_knlistsize) {
			list = &fdp->fd_knlist[kev->kei_ident];
		}
	} else if (fdp->fd_knhashmask != 0) {
		/* hash non-fd knotes here too */
		list = &fdp->fd_knhash[KN_HASH((u_long)kev->kei_ident, fdp->fd_knhashmask)];
	}

	/*
	 * scan the selected list looking for a match
	 */
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kq == knote_get_kq(kn) &&
			    kev->kei_ident == kn->kn_id &&
			    kev->kei_filter == kn->kn_filter) {
				if (kev->kei_flags & EV_UDATA_SPECIFIC) {
					if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
					    kev->kei_udata == kn->kn_udata) {
						break;  /* matching udata-specific knote */
					}
				} else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
					break;  /* matching non-udata-specific knote */
				}
			}
		}
	}
	return kn;
}
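/*
 * Illustrative note (not part of the original file): the matching rules above
 * mean that, within one kqueue, these two userspace registrations coexist
 * only because the second one is EV_UDATA_SPECIFIC:
 *
 *	EV_SET(&kev1, fd, EVFILT_READ, EV_ADD, 0, 0, udata_a);
 *	EV_SET(&kev2, fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_b);
 *
 * A later lookup carrying EV_UDATA_SPECIFIC matches only on
 * (kq, filter, ident, udata); without it, only the non-udata-specific knote
 * is considered.
 */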
/*
 * kq_add_knote- Add knote to the fd table for process
 * while checking for duplicates.
 *
 * All file-based filters associate a list of knotes by file
 * descriptor index. All other filters hash the knote by ident.
 *
 * May have to grow the table of knote lists to cover the
 * file descriptor index presented.
 *
 * fd_knhashlock and fdlock unheld on entry (and exit).
 *
 * Takes a rwlock boost if inserting the knote is successful.
 */
static int
kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
    struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	int ret = 0;
	bool is_fd = kn->kn_is_fd;
	uint64_t nofile = proc_limitgetcur(p, RLIMIT_NOFILE, TRUE);

	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
		/* found an existing knote: we can't add this one */
		ret = ERESTART;
		goto out_locked;
	}

	/* knote was not found: add it now */
	if (!is_fd) {
		if (fdp->fd_knhashmask == 0) {
			u_long size = 0;

			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			fdp->fd_knhash = list;
			fdp->fd_knhashmask = size;
		}

		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
		goto out_locked;
	} else {
		/* knote is fd based */

		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
			u_int size = 0;

			/* Make sure that fd stays below current process's soft limit AND system allowed per-process limits */
			if (kn->kn_id >= (uint64_t) nofile
			    || kn->kn_id >= (uint64_t)maxfilesperproc) {
				ret = EINVAL;
				goto out_locked;
			}
			/* have to grow the fd_knlist */
			size = fdp->fd_knlistsize;
			while (size <= kn->kn_id) {
				size += KQEXTENT;
			}

			if (size >= (UINT_MAX / sizeof(struct klist *))) {
				ret = EINVAL;
				goto out_locked;
			}

			MALLOC(list, struct klist *,
			    size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
			    fdp->fd_knlistsize * sizeof(struct klist *));
			bzero((caddr_t)list +
			    fdp->fd_knlistsize * sizeof(struct klist *),
			    (size - fdp->fd_knlistsize) * sizeof(struct klist *));
			FREE(fdp->fd_knlist, M_KQUEUE);
			fdp->fd_knlist = list;
			fdp->fd_knlistsize = size;
		}

		list = &fdp->fd_knlist[kn->kn_id];
		SLIST_INSERT_HEAD(list, kn, kn_link);
		ret = 0;
	}

out_locked:
	if (ret == 0) {
		kqlock(kq);
		assert((kn->kn_status & KN_LOCKED) == 0);
		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
		kqueue_retain(kq); /* retain a kq ref */
	}

	return ret;
}
/*
 * kq_remove_knote - remove a knote from the fd table for process
 *
 * If the filter is file-based, remove based on fd index.
 * Otherwise remove from the hash based on the ident.
 *
 * fd_knhashlock and fdlock unheld on entry (and exit).
 */
static void
kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
    struct knote_lock_ctx *knlc)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	kq_state_t kq_state;
	bool is_fd = kn->kn_is_fd;

	if (is_fd) {
		assert((u_int)fdp->fd_knlistsize > kn->kn_id);
		list = &fdp->fd_knlist[kn->kn_id];
	} else {
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
	}
	SLIST_REMOVE(list, kn, knote, kn_link);

	kqlock(kq);
	kq_state = kq->kq_state;
	if (knlc) {
		knote_unlock_cancel(kq, kn, knlc);
	} else {
		kqunlock(kq);
	}

	if (kq_state & KQ_DYNAMIC) {
		kqworkloop_release((struct kqworkloop *)kq);
	}
}
/*
 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
 *
 * fd_knhashlock or fdlock unheld on entry (and exit)
 */
static struct knote *
kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_qos_s *kev,
    bool is_fd, struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct knote *kn;

	/*
	 * Temporary horrible hack:
	 * this cast is gross and will go away in a future change.
	 * It is OK to do because we don't look at xflags/s_fflags,
	 * and that when we cast down the kev this way,
	 * the truncated filter field works.
	 */
	kn = knote_fdfind(kq, (struct kevent_internal_s *)kev, is_fd, p);

	if (kn) {
		kqlock(kq);
		assert(knote_get_kq(kn) == kq);
	}

	return kn;
}
__attribute__((noinline))
static void
kqfile_wakeup(struct kqfile *kqf, __unused kq_index_t qos)
{
	/* flag wakeups during processing */
	if (kqf->kqf_state & KQ_PROCESSING) {
		kqf->kqf_state |= KQ_WAKEUP;
	}

	/* wakeup a thread waiting on this queue */
	if (kqf->kqf_state & (KQ_SLEEP | KQ_SEL)) {
		kqf->kqf_state &= ~(KQ_SLEEP | KQ_SEL);
		waitq_wakeup64_all((struct waitq *)&kqf->kqf_wqs, KQ_EVENT,
		    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	}

	/* wakeup other kqueues/select sets we're inside */
	KNOTE(&kqf->kqf_sel.si_note, 0);
}

static struct kqtailq *
knote_get_tailq(kqueue_t kqu, struct knote *kn)
{
	kq_index_t qos_index = kn->kn_qos_index;

	if (kqu.kq->kq_state & KQ_WORKLOOP) {
		assert(qos_index < KQWL_NBUCKETS);
	} else if (kqu.kq->kq_state & KQ_WORKQ) {
		assert(qos_index < KQWQ_NBUCKETS);
	} else {
		assert(qos_index == QOS_INDEX_KQFILE);
	}
	static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
	    "struct kqueue::kq_queue must be exactly at the end");
	return &kqu.kq->kq_queue[qos_index];
}
static void
knote_enqueue(kqueue_t kqu, struct knote *kn, kn_status_t wakeup_mask)
{
	kqlock_held(kqu);

	if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
		return;
	}

	if (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)) {
		return;
	}

	if ((kn->kn_status & KN_QUEUED) == 0) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kqu.kq->kq_count++;
	} else if ((kn->kn_status & KN_STAYACTIVE) == 0) {
		return;
	}

	if (kn->kn_status & wakeup_mask) {
		if (kqu.kq->kq_state & KQ_WORKLOOP) {
			kqworkloop_wakeup(kqu.kqwl, kn->kn_qos_index);
		} else if (kqu.kq->kq_state & KQ_WORKQ) {
			kqworkq_wakeup(kqu.kqwq, kn->kn_qos_index);
		} else {
			kqfile_wakeup(kqu.kqf, kn->kn_qos_index);
		}
	}
}
__attribute__((always_inline))
static void
knote_dequeue(kqueue_t kqu, struct knote *kn)
{
	if (kn->kn_status & KN_QUEUED) {
		struct kqtailq *queue = knote_get_tailq(kqu, kn);

		// attaching the knote calls knote_reset_priority() without
		// the kqlock which is fine, so we can't call kqlock_held()
		// if we're not queued.
		kqlock_held(kqu);

		TAILQ_REMOVE(queue, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kqu.kq->kq_count--;
	}
}

/* called with kqueue lock held */
static void
knote_suppress(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert((kn->kn_status & KN_SUPPRESSED) == 0);
	assert(kn->kn_status & KN_QUEUED);

	knote_dequeue(kqu, kn);
	/* deactivate - so new activations indicate a wakeup */
	kn->kn_status &= ~KN_ACTIVE;
	kn->kn_status |= KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}

__attribute__((always_inline))
static void
knote_unsuppress_noqueue(kqueue_t kqu, struct knote *kn)
{
	struct kqtailq *suppressq;

	kqlock_held(kqu);

	assert(kn->kn_status & KN_SUPPRESSED);

	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kqu, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 * for knotes with a real qos.
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0 && knote_has_qos(kn)) {
		kn->kn_qos_override = _pthread_priority_thread_qos_fast(kn->kn_qos);
	}
	kn->kn_qos_index = kn->kn_qos_override;
}

/* called with kqueue lock held */
static void
knote_unsuppress(kqueue_t kqu, struct knote *kn)
{
	if (kn->kn_status & KN_SUPPRESSED) {
		knote_unsuppress_noqueue(kqu, kn);

		/* don't wakeup if unsuppressing just a stay-active knote */
		knote_enqueue(kqu, kn, KN_ACTIVE);
	}
}

__attribute__((always_inline))
static void
knote_mark_active(struct knote *kn)
{
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
		    kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
		    kn->kn_filtid);
	}

	kn->kn_status |= KN_ACTIVE;
}

/* called with kqueue lock held */
static void
knote_activate(kqueue_t kqu, struct knote *kn, int result)
{
	assert(result & FILTER_ACTIVE);
	if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
		// may dequeue the knote
		knote_adjust_qos(kqu.kq, kn, result);
	}
	knote_mark_active(kn);
	knote_enqueue(kqu, kn, KN_ACTIVE | KN_STAYACTIVE);
}
/*
 * This function applies changes requested by f_attach or f_touch for
 * a given filter. It proceeds in a carefully chosen order to help
 * every single transition do the minimal amount of work possible.
 */
static void
knote_apply_touch(kqueue_t kqu, struct knote *kn, struct kevent_qos_s *kev,
    int result)
{
	kn_status_t wakeup_mask = KN_ACTIVE;

	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		/*
		 * When a stayactive knote is reenabled, we may have missed wakeups
		 * while it was disabled, so we need to poll it. To do so, ask
		 * knote_enqueue() below to reenqueue it.
		 */
		wakeup_mask |= KN_STAYACTIVE;
		kn->kn_status &= ~KN_DISABLED;

		/*
		 * it is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming will happen from the servicer thread of
		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
		 * this knote to stay suppressed forever if we only relied on
		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
		 *
		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
		 * unsuppress because that would mess with the processing phase of
		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
		 * will be called.
		 */
		if (__improbable(kn->kn_status & KN_SUPPRESSED)) {
			if ((kqu.kq->kq_state & KQ_PROCESSING) == 0) {
				knote_unsuppress_noqueue(kqu, kn);
			}
		}
	}

	if ((result & FILTER_UPDATE_REQ_QOS) && kev->qos && kev->qos != kn->kn_qos) {
		// may dequeue the knote
		knote_reset_priority(kqu, kn, kev->qos);
	}

	/*
	 * When we unsuppress above, or because of knote_reset_priority(),
	 * the knote may have been dequeued, we need to restore the invariant
	 * that if the knote is active it needs to be queued now that
	 * we're done applying changes.
	 */
	if (result & FILTER_ACTIVE) {
		knote_activate(kqu, kn, result);
	} else {
		knote_enqueue(kqu, kn, wakeup_mask);
	}

	if ((result & FILTER_THREADREQ_NODEFEER) &&
	    act_clear_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ)) {
		workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
	}
}
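/*
 * Illustrative userspace sketch (not part of this file): how the EV_ENABLE
 * path in knote_apply_touch() above is typically reached.  A registered
 * event can be parked with EV_DISABLE and later re-armed with EV_ENABLE;
 * the re-enable clears KN_DISABLED and, for stay-active sources, asks
 * knote_enqueue() to re-poll so wakeups missed while disabled are not lost.
 * The descriptor names are assumptions of this sketch only.
 *
 *	#include <sys/event.h>
 *
 *	// kq is a kqueue() descriptor, fd a descriptor already registered
 *	// with EV_ADD; enable is 0 or 1.
 *	static void
 *	toggle_read_event(int kq, int fd, int enable)
 *	{
 *		struct kevent kev;
 *		EV_SET(&kev, fd, EVFILT_READ, enable ? EV_ENABLE : EV_DISABLE,
 *		    0, 0, NULL);
 *		(void)kevent(kq, &kev, 1, NULL, 0, NULL); // apply change only
 *	}
 */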
/*
 * knote_drop - disconnect and drop the knote
 *
 * Called with the kqueue locked, returns with the kqueue unlocked.
 *
 * If a knote locking context is passed, it is canceled.
 *
 * The knote may have already been detached from
 * (or not yet attached to) its source object.
 */
static void
knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
{
	struct proc *p = kq->kq_p;

	assert((kn->kn_status & KN_DROPPING) == 0);
	assert((kn->kn_status & KN_LOCKED) == 0);
	kn->kn_status |= KN_DROPPING;

	if (kn->kn_status & KN_SUPPRESSED) {
		knote_unsuppress_noqueue(kq, kn);
	} else {
		knote_dequeue(kq, kn);
	}
	knote_wait_for_post(kq, kn);

	knote_fops(kn)->f_detach(kn);

	/* kq may be freed when kq_remove_knote() returns */
	kq_remove_knote(kq, kn, p, knlc);
	if (kn->kn_is_fd && ((kn->kn_status & KN_VANISHED) == 0)) {
		fp_drop(p, (int)kn->kn_id, kn->kn_fp, 0);
	}

	knote_free(kn);
}

static void
knote_init(void)
{
#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(&kq_lck_grp, LCK_ATTR_NULL);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);

const struct filterops *
knote_fops(struct knote *kn)
{
	return sysfilt_ops[kn->kn_filtid];
}

static struct knote *
knote_alloc(void)
{
	return zalloc_flags(knote_zone, Z_WAITOK | Z_ZERO);
}

static void
knote_free(struct knote *kn)
{
	assert((kn->kn_status & (KN_LOCKED | KN_POSTING)) == 0);
	zfree(knote_zone, kn);
}
#pragma mark - syscalls: kevent, kevent64, kevent_qos, kevent_id

kevent_ctx_t
kevent_get_context(thread_t thread)
{
	uthread_t ut = get_bsdthread_info(thread);
	return &ut->uu_save.uus_kevent;
}

static inline bool
kevent_args_requesting_events(unsigned int flags, int nevents)
{
	return !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0;
}

static inline int
kevent_adjust_flags_for_proc(proc_t p, int flags)
{
	__builtin_assume(p);
	return flags | (IS_64BIT_PROCESS(p) ? KEVENT_FLAG_PROC64 : 0);
}

/*!
 * @function kevent_get_kqfile
 *
 * @brief
 * Lookup a kqfile by fd.
 *
 * @discussion
 * Callers: kevent, kevent64, kevent_qos
 *
 * This is not assumed to be a fastpath (kqfile interfaces are legacy).
 */
static int
kevent_get_kqfile(struct proc *p, int fd, int flags,
    struct fileproc **fpp, struct kqueue **kqp)
{
	int error;
	struct kqueue *kq;

	error = fp_get_ftype(p, fd, DTYPE_KQUEUE, EBADF, fpp);
	if (__improbable(error)) {
		return error;
	}
	kq = (struct kqueue *)(*fpp)->f_data;

	uint16_t kq_state = os_atomic_load(&kq->kq_state, relaxed);
	if (__improbable((kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) == 0)) {
		kqlock(kq);
		kq_state = kq->kq_state;
		if (!(kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS))) {
			if (flags & KEVENT_FLAG_LEGACY32) {
				kq_state |= KQ_KEV32;
			} else if (flags & KEVENT_FLAG_LEGACY64) {
				kq_state |= KQ_KEV64;
			} else {
				kq_state |= KQ_KEV_QOS;
			}
			kq->kq_state = kq_state;
		}
		kqunlock(kq);
	}

	/*
	 * kqfiles can't be used through the legacy kevent()
	 * and other interfaces at the same time.
	 */
	if (__improbable((bool)(flags & KEVENT_FLAG_LEGACY32) !=
	    (bool)(kq_state & KQ_KEV32))) {
		fp_drop(p, fd, *fpp, 0);
		return EINVAL;
	}

	*kqp = kq;
	return 0;
}

/*!
 * @function kevent_get_kqwq
 *
 * @brief
 * Lookup or create the process kqwq (fastpath).
 *
 * @discussion
 * Callers: kevent64, kevent_qos
 */
static int
kevent_get_kqwq(proc_t p, int flags, int nevents, struct kqueue **kqp)
{
	struct kqworkq *kqwq = p->p_fd->fd_wqkqueue;

	if (__improbable(kevent_args_requesting_events(flags, nevents))) {
		return EINVAL;
	}
	if (__improbable(kqwq == NULL)) {
		kqwq = kqworkq_alloc(p, flags);
		if (__improbable(kqwq == NULL)) {
			return ENOMEM;
		}
	}

	*kqp = &kqwq->kqwq_kqueue;
	return 0;
}
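/*
 * Illustrative userspace sketch (not part of this file): the KQ_KEV32 /
 * KQ_KEV64 / KQ_KEV_QOS latching in kevent_get_kqfile() above ties a kqueue
 * file descriptor to the first kevent ABI used on it, so mixing the legacy
 * kevent() and kevent64() calls on the same descriptor is rejected.  A
 * minimal demonstration (the exact errno is an assumption of this sketch):
 *
 *	#include <sys/event.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int kq = kqueue();
 *		struct kevent ev;
 *		struct kevent64_s ev64;
 *
 *		EV_SET(&ev, 0, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *		kevent(kq, &ev, 1, NULL, 0, NULL);        // latches KQ_KEV32
 *
 *		EV_SET64(&ev64, 0, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
 *		if (kevent64(kq, &ev64, 1, NULL, 0, 0, NULL) == -1) {
 *			perror("kevent64 after kevent");  // expected to fail
 *		}
 *		return 0;
 *	}
 */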
#pragma mark kevent copyio

/*!
 * @function kevent_get_data_size
 *
 * @brief
 * Copies in the extra data size from user-space.
 */
static int
kevent_get_data_size(int flags, user_addr_t data_avail, user_addr_t data_out,
    kevent_ctx_t kectx)
{
	if (!data_avail || !data_out) {
		kectx->kec_data_size = 0;
		kectx->kec_data_resid = 0;
	} else if (flags & KEVENT_FLAG_PROC64) {
		user64_size_t usize = 0;
		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
		if (__improbable(error)) {
			return error;
		}
		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
	} else {
		user32_size_t usize = 0;
		int error = copyin((user_addr_t)data_avail, &usize, sizeof(usize));
		if (__improbable(error)) {
			return error;
		}
		kectx->kec_data_avail = data_avail;
		kectx->kec_data_resid = kectx->kec_data_size = (user_size_t)usize;
	}
	kectx->kec_data_out = data_out;
	kectx->kec_data_avail = data_avail;
	return 0;
}

/*!
 * @function kevent_put_data_size
 *
 * @brief
 * Copies out the residual data size to user-space if any has been used.
 */
static int
kevent_put_data_size(unsigned int flags, kevent_ctx_t kectx)
{
	if (kectx->kec_data_resid == kectx->kec_data_size) {
		return 0;
	}
	if (flags & KEVENT_FLAG_KERNEL) {
		*(user_size_t *)(uintptr_t)kectx->kec_data_avail = kectx->kec_data_resid;
		return 0;
	}
	if (flags & KEVENT_FLAG_PROC64) {
		user64_size_t usize = (user64_size_t)kectx->kec_data_resid;
		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
	} else {
		user32_size_t usize = (user32_size_t)kectx->kec_data_resid;
		return copyout(&usize, (user_addr_t)kectx->kec_data_avail, sizeof(usize));
	}
}
/*!
 * @function kevent_legacy_copyin
 *
 * @brief
 * Handles the copyin of a kevent/kevent64 event.
 */
static int
kevent_legacy_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp, unsigned int flags)
{
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	if (flags & KEVENT_FLAG_LEGACY64) {
		struct kevent64_s kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev64.flags & ~EV_SYSFLAGS,
			.udata = kev64.udata,
			.fflags = kev64.fflags,
			.data = kev64.data,
			.ext[0] = kev64.ext[0],
			.ext[1] = kev64.ext[1],
		};
	} else if (flags & KEVENT_FLAG_PROC64) {
		struct user64_kevent kev64;

		error = copyin(*addrp, (caddr_t)&kev64, sizeof(kev64));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev64);
		*kevp = (struct kevent_qos_s){
			.ident = kev64.ident,
			.filter = kev64.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev64.flags & ~EV_SYSFLAGS,
			.udata = kev64.udata,
			.fflags = kev64.fflags,
			.data = kev64.data,
		};
	} else {
		struct user32_kevent kev32;

		error = copyin(*addrp, (caddr_t)&kev32, sizeof(kev32));
		if (__improbable(error)) {
			return error;
		}
		*addrp += sizeof(kev32);
		*kevp = (struct kevent_qos_s){
			.ident = (uintptr_t)kev32.ident,
			.filter = kev32.filter,
			/* Make sure user doesn't pass in any system flags */
			.flags = kev32.flags & ~EV_SYSFLAGS,
			.udata = CAST_USER_ADDR_T(kev32.udata),
			.fflags = kev32.fflags,
			.data = (intptr_t)kev32.data,
		};
	}

	return 0;
}

/*!
 * @function kevent_modern_copyin
 *
 * @brief
 * Handles the copyin of a kevent_qos/kevent_id event.
 */
static int
kevent_modern_copyin(user_addr_t *addrp, struct kevent_qos_s *kevp)
{
	int error = copyin(*addrp, (caddr_t)kevp, sizeof(struct kevent_qos_s));
	if (__probable(!error)) {
		/* Make sure user doesn't pass in any system flags */
		*addrp += sizeof(struct kevent_qos_s);
		kevp->flags &= ~EV_SYSFLAGS;
	}
	return error;
}
/*!
 * @function kevent_legacy_copyout
 *
 * @brief
 * Handles the copyout of a kevent/kevent64 event.
 */
static int
kevent_legacy_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp, unsigned int flags)
{
	int advance;
	int error;

	assert((flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64)) != 0);

	/*
	 * fully initialize the different output event structure
	 * types from the internal kevent (and some universal
	 * defaults for fields not represented in the internal
	 * form).
	 *
	 * Note: these structures have no padding hence the C99
	 *       initializers below do not leak kernel info.
	 */
	if (flags & KEVENT_FLAG_LEGACY64) {
		struct kevent64_s kev64 = {
			.ident = kevp->ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int64_t)kevp->data,
			.udata = kevp->udata,
			.ext[0] = kevp->ext[0],
			.ext[1] = kevp->ext[1],
		};
		advance = sizeof(struct kevent64_s);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else if (flags & KEVENT_FLAG_PROC64) {
		/*
		 * deal with the special case of a user-supplied
		 * value of (uintptr_t)-1.
		 */
		uint64_t ident = (kevp->ident == (uintptr_t)-1) ?
		    (uint64_t)-1LL : (uint64_t)kevp->ident;
		struct user64_kevent kev64 = {
			.ident = ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int64_t) kevp->data,
			.udata = (user_addr_t) kevp->udata,
		};
		advance = sizeof(kev64);
		error = copyout((caddr_t)&kev64, *addrp, advance);
	} else {
		struct user32_kevent kev32 = {
			.ident = (uint32_t)kevp->ident,
			.filter = kevp->filter,
			.flags = kevp->flags,
			.fflags = kevp->fflags,
			.data = (int32_t)kevp->data,
			.udata = (uint32_t)kevp->udata,
		};
		advance = sizeof(kev32);
		error = copyout((caddr_t)&kev32, *addrp, advance);
	}
	if (__probable(!error)) {
		*addrp += advance;
	}
	return error;
}

/*!
 * @function kevent_modern_copyout
 *
 * @brief
 * Handles the copyout of a kevent_qos/kevent_id event.
 */
static int
kevent_modern_copyout(struct kevent_qos_s *kevp, user_addr_t *addrp)
{
	int error = copyout((caddr_t)kevp, *addrp, sizeof(struct kevent_qos_s));
	if (__probable(!error)) {
		*addrp += sizeof(struct kevent_qos_s);
	}
	return error;
}
#pragma mark kevent core implementation

/*!
 * @function kevent_callback_inline
 *
 * @brief
 * Callback for each individual event
 *
 * @discussion
 * This is meant to be inlined in kevent_modern_callback and
 * kevent_legacy_callback.
 */
static inline int
kevent_callback_inline(struct kevent_qos_s *kevp, kevent_ctx_t kectx, bool legacy)
{
	int error;

	assert(kectx->kec_process_noutputs < kectx->kec_process_nevents);

	/*
	 * Copy out the appropriate amount of event data for this user.
	 */
	if (legacy) {
		error = kevent_legacy_copyout(kevp, &kectx->kec_process_eventlist,
		    kectx->kec_process_flags);
	} else {
		error = kevent_modern_copyout(kevp, &kectx->kec_process_eventlist);
	}

	/*
	 * If there isn't space for additional events, return
	 * a harmless error to stop the processing here
	 */
	if (error == 0 && ++kectx->kec_process_noutputs == kectx->kec_process_nevents) {
		error = EWOULDBLOCK;
	}
	return error;
}

/*!
 * @function kevent_modern_callback
 *
 * @brief
 * Callback for each individual modern event.
 *
 * @discussion
 * This callback handles kevent_qos/kevent_id events.
 */
static int
kevent_modern_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	return kevent_callback_inline(kevp, kectx, /*legacy*/ false);
}

/*!
 * @function kevent_legacy_callback
 *
 * @brief
 * Callback for each individual legacy event.
 *
 * @discussion
 * This callback handles kevent/kevent64 events.
 */
static int
kevent_legacy_callback(struct kevent_qos_s *kevp, kevent_ctx_t kectx)
{
	return kevent_callback_inline(kevp, kectx, /*legacy*/ true);
}

/*!
 * @function kevent_cleanup
 *
 * @brief
 * Handles the cleanup returning from a kevent call.
 *
 * @discussion
 * kevent entry points will take a reference on workloops,
 * and a usecount on the fileglob of kqfiles.
 *
 * This function undoes this on the exit paths of kevents.
 *
 * @returns
 * The error to return to userspace.
 */
static int
kevent_cleanup(kqueue_t kqu, int flags, int error, kevent_ctx_t kectx)
{
	// poll should not call any codepath leading to this
	assert((flags & KEVENT_FLAG_POLL) == 0);

	if (flags & KEVENT_FLAG_WORKLOOP) {
		kqworkloop_release(kqu.kqwl);
	} else if (flags & KEVENT_FLAG_WORKQ) {
		/* nothing held */
	} else {
		fp_drop(kqu.kqf->kqf_p, kectx->kec_fd, kectx->kec_fp, 0);
	}

	/* don't restart after signals... */
	if (error == ERESTART) {
		error = EINTR;
	} else if (error == 0) {
		/* don't abandon other output just because of residual copyout failures */
		(void)kevent_put_data_size(flags, kectx);
	}

	if (flags & KEVENT_FLAG_PARKING) {
		thread_t th = current_thread();
		struct uthread *uth = get_bsdthread_info(th);
		if (uth->uu_kqr_bound) {
			thread_unfreeze_base_pri(th);
		}
	}
	return error;
}
/*!
 * @function kqueue_process
 *
 * @brief
 * Process the triggered events in a kqueue.
 *
 * @discussion
 * Walk the queued knotes and validate that they are really still triggered
 * events by calling the filter routines (if necessary).
 *
 * For each event that is still considered triggered, invoke the callback
 * routine provided.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 * kqueue list locked (held for duration of call)
 *
 * This is only called by kqueue_scan() so that the compiler can inline it.
 *
 * @returns
 * - 0:            no event was returned, no other error occurred
 * - EBADF:        the kqueue is being destroyed (KQ_DRAIN is set)
 * - EWOULDBLOCK:  (not an error) events have been found and we should return
 * - EFAULT:       copyout failed
 * - filter specific errors
 */
static int
kqueue_process(kqueue_t kqu, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	workq_threadreq_t kqr = current_uthread()->uu_kqr_bound;
	struct knote *kn;
	int error = 0, rc = 0;
	struct kqtailq *base_queue, *queue;
#if DEBUG || DEVELOPMENT
	int retries = 64;
#endif
	uint16_t kq_type = (kqu.kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));

	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_begin_processing(kqu.kqwl, flags);
	} else {
		rc = kqfile_begin_processing(kqu.kqf);
	}

	if (rc == -1) {
		/* Nothing to process */
		return 0;
	}

	/*
	 * loop through the enqueued knotes associated with this request,
	 * processing each one. Each request may have several queues
	 * of knotes to process (depending on the type of kqueue) so we
	 * have to loop through all the queues as long as we have additional
	 * space.
	 */

process_again:
	if (kq_type & KQ_WORKQ) {
		base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->tr_kq_qos_index];
	} else if (kq_type & KQ_WORKLOOP) {
		base_queue = &kqu.kqwl->kqwl_queue[0];
		queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
	} else {
		base_queue = queue = &kqu.kqf->kqf_queue;
	}

	do {
		while ((kn = TAILQ_FIRST(queue)) != NULL) {
			error = knote_process(kn, kectx, callback);
			if (error == EJUSTRETURN) {
				error = 0;
			} else if (__improbable(error)) {
				/* error is EWOULDBLOCK when the out event array is full */
				goto stop_processing;
			}
		}
	} while (queue-- > base_queue);

	if (kectx->kec_process_noutputs) {
		/* callers will transform this into no error */
		error = EWOULDBLOCK;
	}

stop_processing:
	/*
	 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
	 * we want to unbind the kqrequest from the thread.
	 *
	 * However, because the kq locks are dropped several times during process,
	 * new knotes may have fired again, in which case, we want to fail the end
	 * processing and process again, until it converges.
	 *
	 * If we have an error or returned events, end processing never fails.
	 */
	if (error) {
		flags &= ~KEVENT_FLAG_PARKING;
	}
	if (kq_type & KQ_WORKQ) {
		rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
	} else if (kq_type & KQ_WORKLOOP) {
		rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
	} else {
		rc = kqfile_end_processing(kqu.kqf);
	}

	if (__probable(error)) {
		return error;
	}

	if (__probable(rc >= 0)) {
		assert(rc == 0 || rc == EBADF);
		return rc;
	}

#if DEBUG || DEVELOPMENT
	if (retries-- == 0) {
		panic("kevent: way too many knote_process retries, kq: %p (0x%04x)",
		    kqu.kq, kqu.kq->kq_state);
	}
#endif
	if (kq_type & (KQ_WORKQ | KQ_WORKLOOP)) {
		assert(flags & KEVENT_FLAG_PARKING);
	}
	goto process_again;
}
/*!
 * @function kqueue_scan_continue
 *
 * @brief
 * The continuation used by kqueue_scan for kevent entry points.
 *
 * @discussion
 * Assumes we inherit a use/ref count on the kq or its fileglob.
 *
 * This is called by kqueue_scan if neither KEVENT_FLAG_POLL nor
 * KEVENT_FLAG_KERNEL was set, and the caller had to wait.
 */
OS_NORETURN OS_NOINLINE
static void
kqueue_scan_continue(void *data, wait_result_t wait_result)
{
	uthread_t ut = current_uthread();
	kevent_ctx_t kectx = &ut->uu_save.uus_kevent;
	int error = 0, flags = kectx->kec_process_flags;
	struct kqueue *kq = data;

	/*
	 * only kevent variants call in here, so we know the callback is
	 * kevent_legacy_callback or kevent_modern_callback.
	 */
	assert((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0);

	switch (wait_result) {
	case THREAD_AWAKENED:
		if (__improbable(flags & (KEVENT_FLAG_LEGACY32 | KEVENT_FLAG_LEGACY64))) {
			error = kqueue_scan(kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kq, flags, kectx, kevent_modern_callback);
		}
		break;
	case THREAD_TIMED_OUT:
		error = 0;
		break;
	case THREAD_INTERRUPTED:
		error = EINTR;
		break;
	case THREAD_RESTART:
		error = EBADF;
		break;
	default:
		panic("%s: - invalid wait_result (%d)", __func__, wait_result);
	}

	error = kevent_cleanup(kq, flags, error, kectx);
	*(int32_t *)&ut->uu_rval = kectx->kec_process_noutputs;
	unix_syscall_return(error);
}
/*!
 * @function kqueue_scan
 *
 * @brief
 * Scan and wait for events in a kqueue (used by poll & kevent).
 *
 * @discussion
 * Process the triggered events in a kqueue.
 *
 * If there are no events triggered, arrange to wait for them:
 * - unless KEVENT_FLAG_IMMEDIATE is set in kectx->kec_process_flags
 * - possibly until kectx->kec_deadline expires
 *
 * When it waits, and neither KEVENT_FLAG_POLL nor KEVENT_FLAG_KERNEL
 * is set, then it will wait in the kqueue_scan_continue continuation.
 *
 * poll() will block in place, and KEVENT_FLAG_KERNEL calls
 * all pass KEVENT_FLAG_IMMEDIATE and will not wait.
 *
 * @param kq
 * The kqueue being scanned.
 *
 * @param flags
 * The KEVENT_FLAG_* flags for this call.
 *
 * @param kectx
 * The context used for this scan.
 * The uthread_t::uu_save.uus_kevent storage is used for this purpose.
 *
 * @param callback
 * The callback to be called on events successfully processed.
 * (Either kevent_legacy_callback, kevent_modern_callback or poll_callback)
 */
static int
kqueue_scan(struct kqueue *kq, int flags, kevent_ctx_t kectx,
    kevent_callback_t callback)
{
	int error;

	for (;;) {
		kqlock(kq);
		error = kqueue_process(kq, flags, kectx, callback);

		/*
		 * If we got an error, events returned (EWOULDBLOCK)
		 * or blocking was disallowed (KEVENT_FLAG_IMMEDIATE),
		 * just return.
		 */
		if (__probable(error || (flags & KEVENT_FLAG_IMMEDIATE))) {
			kqunlock(kq);
			return error == EWOULDBLOCK ? 0 : error;
		}

		waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
		    KQ_EVENT, THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL,
		    kectx->kec_deadline, TIMEOUT_NO_LEEWAY);
		kq->kq_state |= KQ_SLEEP;
		kqunlock(kq);

		if (__probable((flags & (KEVENT_FLAG_POLL | KEVENT_FLAG_KERNEL)) == 0)) {
			thread_block_parameter(kqueue_scan_continue, kq);
			__builtin_unreachable();
		}

		wait_result_t wr = thread_block(THREAD_CONTINUE_NULL);
		switch (wr) {
		case THREAD_AWAKENED:
			break;
		case THREAD_TIMED_OUT:
			return 0;
		case THREAD_INTERRUPTED:
			return EINTR;
		case THREAD_RESTART:
			return EBADF;
		default:
			panic("%s: - bad wait_result (%d)", __func__, wr);
		}
	}
}
/*!
 * @function kevent_internal
 *
 * @brief
 * Common kevent code.
 *
 * @discussion
 * Needs to be inlined to specialize for legacy or modern and
 * eliminate dead code.
 *
 * This is the core logic of kevent entry points, that will:
 * - register kevents
 * - optionally scan the kqueue for events
 *
 * The caller is giving kevent_internal a reference on the kqueue
 * or its fileproc that needs to be cleaned up by kevent_cleanup().
 */
static inline int
kevent_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval,
    bool legacy)
{
	int error = 0, noutputs = 0, register_rc;

	/* only bound threads can receive events on workloops */
	if (!legacy && (flags & KEVENT_FLAG_WORKLOOP)) {
#if CONFIG_WORKLOOP_DEBUG
		UU_KEVENT_HISTORY_WRITE_ENTRY(current_uthread(), {
			.uu_kqid = kqu.kqwl->kqwl_dynamicid,
			.uu_kq = error ? NULL : kqu.kq,
			.uu_error = error,
			.uu_nchanges = nchanges,
			.uu_nevents = nevents,
			.uu_flags = flags,
		});
#endif // CONFIG_WORKLOOP_DEBUG

		if (flags & KEVENT_FLAG_KERNEL) {
			/* see kevent_workq_internal */
			error = copyout(&kqu.kqwl->kqwl_dynamicid,
			    ueventlist - sizeof(kqueue_id_t), sizeof(kqueue_id_t));
			kectx->kec_data_resid -= sizeof(kqueue_id_t);
			if (__improbable(error)) {
				goto out;
			}
		}

		if (kevent_args_requesting_events(flags, nevents)) {
			/*
			 * Disable the R2K notification while doing a register, if the
			 * caller wants events too, we don't want the AST to be set if we
			 * will process these events soon.
			 */
			kqlock(kqu);
			kqu.kq->kq_state &= ~KQ_R2K_ARMED;
			kqunlock(kqu);
			flags |= KEVENT_FLAG_NEEDS_END_PROCESSING;
		}
	}

	/* register all the change requests the user provided... */
	while (nchanges > 0 && error == 0) {
		struct kevent_qos_s kev;
		struct knote *kn = NULL;

		if (legacy) {
			error = kevent_legacy_copyin(&changelist, &kev, flags);
		} else {
			error = kevent_modern_copyin(&changelist, &kev);
		}
		if (error) {
			break;
		}

		register_rc = kevent_register(kqu.kq, &kev, &kn);
		if (__improbable(!legacy && (register_rc & FILTER_REGISTER_WAIT))) {
			thread_t thread = current_thread();

			if (act_clear_astkevent(thread, AST_KEVENT_REDRIVE_THREADREQ)) {
				workq_kern_threadreq_redrive(kqu.kq->kq_p, WORKQ_THREADREQ_NONE);
			}

			// f_post_register_wait is meant to call a continuation and not to
			// return, which is why we don't support FILTER_REGISTER_WAIT if
			// KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
			// waits isn't the last.
			//
			// It is implementable, but not used by any userspace code at the
			// moment, so for now return ENOTSUP if someone tries to do it.
			if (nchanges == 1 && noutputs < nevents &&
			    (flags & KEVENT_FLAG_KERNEL) == 0 &&
			    (flags & KEVENT_FLAG_PARKING) == 0 &&
			    (flags & KEVENT_FLAG_ERROR_EVENTS) &&
			    (flags & KEVENT_FLAG_WORKLOOP)) {
				uthread_t ut = get_bsdthread_info(thread);

				/*
				 * store the continuation/completion data in the uthread
				 *
				 * Note: the kectx aliases with this,
				 *       and is destroyed in the process.
				 */
				ut->uu_save.uus_kevent_register = (struct _kevent_register){
					.eventout = noutputs,
					.ueventlist = ueventlist,
				};
				knote_fops(kn)->f_post_register_wait(ut, kn,
				    &ut->uu_save.uus_kevent_register);
				__builtin_unreachable();
			}

			kev.flags |= EV_ERROR;
			kev.data = ENOTSUP;
		} else {
			assert((register_rc & FILTER_REGISTER_WAIT) == 0);
		}

		// keep in sync with kevent_register_wait_return()
		if (noutputs < nevents && (kev.flags & (EV_ERROR | EV_RECEIPT))) {
			if ((kev.flags & EV_ERROR) == 0) {
				kev.flags |= EV_ERROR;
				kev.data = 0;
			}
			if (legacy) {
				error = kevent_legacy_copyout(&kev, &ueventlist, flags);
			} else {
				error = kevent_modern_copyout(&kev, &ueventlist);
			}
			if (error == 0) {
				noutputs++;
			}
		} else if (kev.flags & EV_ERROR) {
			error = (int)kev.data;
		}
		nchanges--;
	}

	if ((flags & KEVENT_FLAG_ERROR_EVENTS) == 0 &&
	    nevents > 0 && noutputs == 0 && error == 0) {
		kectx->kec_process_flags = flags;
		kectx->kec_process_nevents = nevents;
		kectx->kec_process_noutputs = 0;
		kectx->kec_process_eventlist = ueventlist;

		if (legacy) {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_legacy_callback);
		} else {
			error = kqueue_scan(kqu.kq, flags, kectx, kevent_modern_callback);
		}

		noutputs = kectx->kec_process_noutputs;
	} else if (!legacy && (flags & KEVENT_FLAG_NEEDS_END_PROCESSING)) {
		/*
		 * If we didn't go through kqworkloop_end_processing(),
		 * we need to do it here.
		 *
		 * kqueue_scan will call kqworkloop_end_processing(),
		 * so we only need to do it if we didn't scan.
		 */
		kqlock(kqu);
		kqworkloop_end_processing(kqu.kqwl, 0, 0);
		kqunlock(kqu);
	}

out:
	*retval = noutputs;
	return kevent_cleanup(kqu.kq, flags, error, kectx);
}
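/*
 * Illustrative userspace sketch (not part of this file): the per-change
 * error reporting path above (EV_ERROR / EV_RECEIPT).  With EV_RECEIPT,
 * every change produces an output event whose data field carries 0 or an
 * errno, instead of the whole syscall failing.  Identifiers are assumptions
 * of this sketch.
 *
 *	#include <sys/event.h>
 *
 *	// Returns the per-event errno for a single registration.
 *	static int
 *	add_with_receipt(int kq, uintptr_t ident, int16_t filter)
 *	{
 *		struct kevent in, out;
 *		EV_SET(&in, ident, filter, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *		if (kevent(kq, &in, 1, &out, 1, NULL) != 1) {
 *			return -1;              // transport-level failure
 *		}
 *		// out.flags has EV_ERROR set; out.data is 0 on success
 *		return (int)out.data;
 *	}
 */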
#pragma mark modern syscalls: kevent_qos, kevent_id, kevent_workq_internal

/*!
 * @function kevent_modern_internal
 *
 * @brief
 * The backend of the kevent_id and kevent_workq_internal entry points.
 *
 * @discussion
 * Needs to be inline due to the number of arguments.
 */
static inline int
kevent_modern_internal(kqueue_t kqu,
    user_addr_t changelist, int nchanges,
    user_addr_t ueventlist, int nevents,
    int flags, kevent_ctx_t kectx, int32_t *retval)
{
	return kevent_internal(kqu.kq, changelist, nchanges,
	           ueventlist, nevents, flags, kectx, retval, /*legacy*/ false);
}

/*!
 * @function kevent_id
 *
 * @brief
 * The kevent_id() syscall.
 */
int
kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
{
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	flags = kevent_adjust_flags_for_proc(p, flags);
	flags |= KEVENT_FLAG_DYNAMIC_KQUEUE;

	if (__improbable((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP)) !=
	    KEVENT_FLAG_WORKLOOP)) {
		return EINVAL;
	}

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/*
	 * Get the kq we are going to be working on
	 * As a fastpath, look at the currently bound workloop.
	 */
	kqu.kqwl = kqr ? kqr_kqworkloop(kqr) : NULL;
	if (kqu.kqwl && kqu.kqwl->kqwl_dynamicid == uap->id) {
		if (__improbable(flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
			return EEXIST;
		}
		kqworkloop_retain(kqu.kqwl);
	} else if (__improbable(kevent_args_requesting_events(flags, uap->nevents))) {
		return EXDEV;
	} else {
		error = kqworkloop_get_or_create(p, uap->id, NULL, flags, &kqu.kqwl);
		if (__improbable(error)) {
			return error;
		}
	}

	return kevent_modern_internal(kqu, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
/*!
 * @function kevent_workq_internal
 *
 * @discussion
 * This function is exported for the sake of the workqueue subsystem.
 *
 * It is called in two ways:
 * - when a thread is about to go to userspace to ask for pending events
 * - when a thread is returning from userspace with events back
 *
 * the workqueue subsystem will only use the following flags:
 * - KEVENT_FLAG_STACK_DATA (always)
 * - KEVENT_FLAG_IMMEDIATE (always)
 * - KEVENT_FLAG_PARKING (depending on whether it is going to or returning from
 *   userspace)
 *
 * It implicitly acts on the bound kqueue, and for the case of workloops
 * will copyout the kqueue ID before anything else.
 *
 * Pthread will have setup the various arguments to fit this stack layout:
 *
 * +-------....----+--------------+-----------+--------------------+
 * |  user stack   |  data avail  |  nevents  |   pthread_self()   |
 * +-------....----+--------------+-----------+--------------------+
 *                  ^              ^
 *                  data_out       eventlist
 *
 * When a workloop is used, the workloop ID is copied out right before
 * the eventlist and is taken from the data buffer.
 *
 * @warning
 * This function is carefully tailored to not make any call except the final
 * tail call into kevent_modern_internal. (LTO inlines current_uthread()).
 *
 * This function is performance sensitive due to the workq subsystem.
 */
int
kevent_workq_internal(struct proc *p,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, user_size_t *data_available,
    unsigned int flags, int32_t *retval)
{
	uthread_t uth = current_uthread();
	workq_threadreq_t kqr = uth->uu_kqr_bound;
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	kqueue_t kqu;

	assert(flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE) ||
	    flags == (KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_PARKING));

	kectx->kec_data_out = data_out;
	kectx->kec_data_avail = (uint64_t)data_available;
	kectx->kec_data_size = *data_available;
	kectx->kec_data_resid = *data_available;
	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	flags = kevent_adjust_flags_for_proc(p, flags);

	if (kqr->tr_flags & WORKQ_TR_FLAG_WORKLOOP) {
		kqu.kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
		kqworkloop_retain(kqu.kqwl);

		flags |= KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
		    KEVENT_FLAG_KERNEL;
	} else {
		kqu.kqwq = p->p_fd->fd_wqkqueue;

		flags |= KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL;
	}

	return kevent_modern_internal(kqu, changelist, nchanges,
	           eventlist, nevents, flags, kectx, retval);
}
/*!
 * @function kevent_qos
 *
 * @brief
 * The kevent_qos() syscall.
 */
int
kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	int error, flags = uap->flags & KEVENT_FLAG_USER;
	struct kqueue *kq;

	if (__improbable(flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	error = kevent_get_data_size(flags, uap->data_available, uap->data_out, kectx);
	if (__improbable(error)) {
		return error;
	}

	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/* get the kq we are going to be working on */
	if (__probable(flags & KEVENT_FLAG_WORKQ)) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_modern_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval);
}
#pragma mark legacy syscalls: kevent, kevent64

/*!
 * @function kevent_legacy_get_deadline
 *
 * @brief
 * Compute the deadline for the legacy kevent syscalls.
 *
 * @discussion
 * This is not necessary if KEVENT_FLAG_IMMEDIATE is specified,
 * as this takes precedence over the deadline.
 *
 * This function will fail if utimeout is USER_ADDR_NULL
 * (the caller should check).
 */
static int
kevent_legacy_get_deadline(int flags, user_addr_t utimeout, uint64_t *deadline)
{
	struct timespec ts;

	if (flags & KEVENT_FLAG_PROC64) {
		struct user64_timespec ts64;
		int error = copyin(utimeout, &ts64, sizeof(ts64));
		if (__improbable(error)) {
			return error;
		}
		ts.tv_sec = (unsigned long)ts64.tv_sec;
		ts.tv_nsec = (long)ts64.tv_nsec;
	} else {
		struct user32_timespec ts32;
		int error = copyin(utimeout, &ts32, sizeof(ts32));
		if (__improbable(error)) {
			return error;
		}
		ts.tv_sec = ts32.tv_sec;
		ts.tv_nsec = ts32.tv_nsec;
	}
	if (!timespec_is_valid(&ts)) {
		return EINVAL;
	}

	clock_absolutetime_interval_to_deadline(tstoabstime(&ts), deadline);
	return 0;
}

/*!
 * @function kevent_legacy_internal
 *
 * @brief
 * The core implementation for kevent and kevent64
 */
static int
kevent_legacy_internal(struct proc *p, struct kevent64_args *uap,
    int32_t *retval, int flags)
{
	uthread_t uth = current_uthread();
	kevent_ctx_t kectx = &uth->uu_save.uus_kevent;
	struct kqueue *kq;
	int error;

	if (__improbable(uap->flags & KEVENT_ID_FLAG_USER)) {
		return EINVAL;
	}

	flags = kevent_adjust_flags_for_proc(p, flags);

	kectx->kec_data_out = 0;
	kectx->kec_data_avail = 0;
	kectx->kec_data_size = 0;
	kectx->kec_data_resid = 0;
	kectx->kec_deadline = 0;
	kectx->kec_fp = NULL;
	kectx->kec_fd = uap->fd;
	/* the kec_process_* fields are filled only if kqueue_scan is called */

	/* convert timeout to absolute - if we have one (and not immediate) */
	if (__improbable(uap->timeout && !(flags & KEVENT_FLAG_IMMEDIATE))) {
		error = kevent_legacy_get_deadline(flags, uap->timeout,
		    &kectx->kec_deadline);
		if (__improbable(error)) {
			return error;
		}
	}

	/* get the kq we are going to be working on */
	if (flags & KEVENT_FLAG_WORKQ) {
		error = kevent_get_kqwq(p, flags, uap->nevents, &kq);
	} else {
		error = kevent_get_kqfile(p, uap->fd, flags, &kectx->kec_fp, &kq);
	}
	if (__improbable(error)) {
		return error;
	}

	return kevent_internal(kq, uap->changelist, uap->nchanges,
	           uap->eventlist, uap->nevents, flags, kectx, retval,
	           /*legacy*/ true);
}

/*!
 * @function kevent
 *
 * @brief
 * The legacy kevent() syscall.
 */
int
kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
{
	struct kevent64_args args = {
		.fd = uap->fd,
		.changelist = uap->changelist,
		.nchanges = uap->nchanges,
		.eventlist = uap->eventlist,
		.nevents = uap->nevents,
		.timeout = uap->timeout,
	};

	return kevent_legacy_internal(p, &args, retval, KEVENT_FLAG_LEGACY32);
}

/*!
 * @function kevent64
 *
 * @brief
 * The legacy kevent64() syscall.
 */
int
kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
{
	int flags = (uap->flags & KEVENT_FLAG_USER) | KEVENT_FLAG_LEGACY64;
	return kevent_legacy_internal(p, uap, retval, flags);
}
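/*
 * Illustrative userspace sketch (not part of this file): the classic use of
 * the legacy kevent() entry point implemented above - register an interest,
 * then block with a timeout that kevent_legacy_get_deadline() converts to an
 * absolute deadline.  Identifiers are assumptions of this sketch.
 *
 *	#include <sys/event.h>
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int kq = kqueue();
 *		struct kevent change, event;
 *		struct timespec timeout = { .tv_sec = 5, .tv_nsec = 0 };
 *
 *		EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *
 *		int n = kevent(kq, &change, 1, &event, 1, &timeout);
 *		if (n > 0) {
 *			printf("%ld bytes readable on fd %lu\n",
 *			    (long)event.data, (unsigned long)event.ident);
 *		} else if (n == 0) {
 *			printf("timed out\n");
 *		} else {
 *			perror("kevent");
 *		}
 *		close(kq);
 *		return 0;
 *	}
 */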
#pragma mark - socket interface

#if SOCKETS

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/kern_event.h>
#include <sys/malloc.h>
#include <sys/sys_domain.h>
#include <sys/syslog.h>

#define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
#define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))

static LCK_GRP_DECLARE(kev_lck_grp, "Kernel Event Protocol");
static LCK_RW_DECLARE(kev_rwlock, &kev_lck_grp);

static int kev_attach(struct socket *so, int proto, struct proc *p);
static int kev_detach(struct socket *so);
static int kev_control(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p);
static lck_mtx_t * event_getlock(struct socket *, int);
static int event_lock(struct socket *, int, void *);
static int event_unlock(struct socket *, int, void *);

static int event_sofreelastref(struct socket *);
static void kev_delete(struct kern_event_pcb *);

static struct pr_usrreqs event_usrreqs = {
	.pru_attach = kev_attach,
	.pru_control = kev_control,
	.pru_detach = kev_detach,
	.pru_soreceive = soreceive,
};

static struct protosw eventsw[] = {
	{
		.pr_type = SOCK_RAW,
		.pr_protocol = SYSPROTO_EVENT,
		.pr_flags = PR_ATOMIC,
		.pr_usrreqs = &event_usrreqs,
		.pr_lock = event_lock,
		.pr_unlock = event_unlock,
		.pr_getlock = event_getlock,
	}
};

__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;

SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Kernel event family");

struct kevtstat kevtstat;
SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_getstat, "S,kevtstat", "");

SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_pcblist, "S,xkevtpcb", "");

static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0) {
			panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
		}
	} else {
		panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
		    so, solockhistory_nr(so));
	}
	return &ev_pcb->evp_mtx;
}
static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    so, lr_saved, solockhistory_nr(so));
	}

	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
	}

	if (refcount) {
		so->so_usecount++;
	}

	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	return 0;
}

static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	if (lr == NULL) {
		lr_saved = __builtin_return_address(0);
	} else {
		lr_saved = lr;
	}

	if (refcount) {
		so->so_usecount--;
	}
	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return 0;
}

static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	so->so_pcb = NULL;

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return 0;
}
static int event_proto_count = (sizeof(eventsw) / sizeof(struct protosw));

struct kern_event_head kern_event_head;

static u_int32_t static_event_id = 0;

static ZONE_DECLARE(ev_pcb_zone, "kerneventpcb",
    sizeof(struct kern_event_pcb), ZC_ZFREE_CLEARMEM);

/*
 * Install the protosw's for the NKE manager. Invoked at extension load time
 */
void
kern_event_init(struct domain *dp)
{
	struct protosw *pr;
	int i;

	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
	VERIFY(dp == systemdomain);

	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++) {
		net_add_proto(pr, dp, 1);
	}
}

static int
kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
{
	int error = 0;
	struct kern_event_pcb *ev_pcb;

	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
	if (error != 0) {
		return error;
	}

	ev_pcb = zalloc_flags(ev_pcb_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&ev_pcb->evp_mtx, &kev_lck_grp, LCK_ATTR_NULL);

	ev_pcb->evp_socket = so;
	ev_pcb->evp_vendor_code_filter = 0xffffffff;

	so->so_pcb = (caddr_t) ev_pcb;
	lck_rw_lock_exclusive(&kev_rwlock);
	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
	kevtstat.kes_pcbcount++;
	kevtstat.kes_gencnt++;
	lck_rw_done(&kev_rwlock);

	return error;
}

static void
kev_delete(struct kern_event_pcb *ev_pcb)
{
	VERIFY(ev_pcb != NULL);
	lck_mtx_destroy(&ev_pcb->evp_mtx, &kev_lck_grp);
	zfree(ev_pcb_zone, ev_pcb);
}

static int
kev_detach(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;

	if (ev_pcb != NULL) {
		soisdisconnected(so);
		so->so_flags |= SOF_PCBCLEARING;
	}

	return 0;
}

/*
 * For now, kev_vendor_code and mbuf_tags use the same
 * mechanism.
 */
errno_t
kev_vendor_code_find(
	const char *string,
	u_int32_t *out_vendor_code)
{
	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
		return EINVAL;
	}
	return net_str_id_find_internal(string, out_vendor_code,
	           NSI_VENDOR_CODE, 1);
}

errno_t
kev_msg_post(struct kev_msg *event_msg)
{
	mbuf_tag_id_t min_vendor, max_vendor;

	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);

	if (event_msg == NULL) {
		return EINVAL;
	}

	/*
	 * Limit third parties to posting events for registered vendor codes
	 * only
	 */
	if (event_msg->vendor_code < min_vendor ||
	    event_msg->vendor_code > max_vendor) {
		os_atomic_inc(&kevtstat.kes_badvendor, relaxed);
		return EINVAL;
	}
	return kev_post_msg(event_msg);
}
int
kev_post_msg(struct kev_msg *event_msg)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		os_atomic_inc(&kevtstat.kes_toobig, relaxed);
		return EMSGSIZE;
	}

	m = m_get(M_WAIT, MT_DATA);
	if (m == 0) {
		os_atomic_inc(&kevtstat.kes_nomem, relaxed);
		return ENOMEM;
	}
	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0) {
			break;
		}

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	ev->id = ++static_event_id;
	ev->total_size = total_size;
	ev->vendor_code = event_msg->vendor_code;
	ev->kev_class = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code = event_msg->event_code;

	m->m_len = total_size;
	lck_rw_lock_shared(&kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		m2 = m_copym(m, 0, m->m_len, M_WAIT);
		if (m2 == 0) {
			os_atomic_inc(&kevtstat.kes_nomem, relaxed);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(&kev_rwlock);
			return ENOMEM;
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len, MBUF_TC_BE);

			sorwakeup(ev_pcb->evp_socket);
			os_atomic_inc(&kevtstat.kes_posted, relaxed);
		} else {
			os_atomic_inc(&kevtstat.kes_fullsock, relaxed);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	m_free(m);
	lck_rw_done(&kev_rwlock);

	return 0;
}
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kev_request *kev_req = (struct kev_request *) data;
	struct kern_event_pcb *ev_pcb;
	struct kev_vendor_code *kev_vendor;
	u_int32_t *id_value = (u_int32_t *) data;

	switch (cmd) {
	case SIOCGKEVID:
		*id_value = static_event_id;
		break;
	case SIOCSKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
		ev_pcb->evp_class_filter = kev_req->kev_class;
		ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
		break;
	case SIOCGKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
		kev_req->kev_class = ev_pcb->evp_class_filter;
		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
		break;
	case SIOCGKEVVENDOR:
		kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure string is NULL terminated */
		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN - 1] = 0;
		return net_str_id_find_internal(kev_vendor->vendor_string,
		           &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0);
	default:
		return ENOTSUP;
	}

	return 0;
}
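/*
 * Illustrative userspace sketch (not part of this file): consuming kernel
 * events posted through kev_post_msg() above, using the PF_SYSTEM /
 * SYSPROTO_EVENT socket and the SIOCSKEVFILT filter handled by
 * kev_control().  The chosen filter values are assumptions of this sketch.
 *
 *	#include <sys/socket.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/sys_domain.h>
 *	#include <sys/kern_event.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *		struct kev_request req = {
 *			.vendor_code = KEV_VENDOR_APPLE,
 *			.kev_class = KEV_NETWORK_CLASS,
 *			.kev_subclass = KEV_ANY_SUBCLASS,
 *		};
 *		char buf[1024];
 *
 *		ioctl(s, SIOCSKEVFILT, &req);
 *		if (recv(s, buf, sizeof(buf), 0) > 0) {
 *			struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *			printf("event code %u, class %u\n",
 *			    msg->event_code, msg->kev_class);
 *		}
 *		close(s);
 *		return 0;
 *	}
 */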
__private_extern__ int
kevt_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;

	lck_rw_lock_shared(&kev_rwlock);

	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sizeof(struct kevtstat);
		goto done;
	}

	error = SYSCTL_OUT(req, &kevtstat,
	    MIN(sizeof(struct kevtstat), req->oldlen));
done:
	lck_rw_done(&kev_rwlock);

	return error;
}

__private_extern__ int
kevt_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;
	uint64_t n, i;
	struct xsystmgen xsg;
	void *buf = NULL;
	size_t item_size = ROUNDUP64(sizeof(struct xkevtpcb)) +
	    ROUNDUP64(sizeof(struct xsocket_n)) +
	    2 * ROUNDUP64(sizeof(struct xsockbuf_n)) +
	    ROUNDUP64(sizeof(struct xsockstat_n));
	struct kern_event_pcb *ev_pcb;

	buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
	if (buf == NULL) {
		return ENOMEM;
	}

	lck_rw_lock_shared(&kev_rwlock);

	n = kevtstat.kes_pcbcount;

	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = (size_t) ((n + n / 8) * item_size);
		goto done;
	}
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}

	bzero(&xsg, sizeof(xsg));
	xsg.xg_len = sizeof(xsg);
	xsg.xg_count = n;
	xsg.xg_gen = kevtstat.kes_gencnt;
	xsg.xg_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));
	if (error) {
		goto done;
	}

	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		goto done;
	}

	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
	    i < n && ev_pcb != NULL;
	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
		struct xsocket_n *xso = (struct xsocket_n *)
		    ADVANCE64(xk, sizeof(*xk));
		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
		    ADVANCE64(xso, sizeof(*xso));
		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
		    ADVANCE64(xsbrcv, sizeof(*xsbrcv));
		struct xsockstat_n *xsostats = (struct xsockstat_n *)
		    ADVANCE64(xsbsnd, sizeof(*xsbsnd));

		bzero(buf, item_size);

		lck_mtx_lock(&ev_pcb->evp_mtx);

		xk->kep_len = sizeof(struct xkevtpcb);
		xk->kep_kind = XSO_EVT;
		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
		xk->kep_class_filter = ev_pcb->evp_class_filter;
		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;

		sotoxsocket_n(ev_pcb->evp_socket, xso);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
		    &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
		    &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);

		lck_mtx_unlock(&ev_pcb->evp_mtx);

		error = SYSCTL_OUT(req, buf, item_size);
		if (error) {
			goto done;
		}
	}

	/*
	 * Give the user an updated idea of our state.
	 * If the generation differs from what we told
	 * her before, she knows that something happened
	 * while we were processing this request, and it
	 * might be necessary to retry.
	 */
	bzero(&xsg, sizeof(xsg));
	xsg.xg_len = sizeof(xsg);
	xsg.xg_count = n;
	xsg.xg_gen = kevtstat.kes_gencnt;
	xsg.xg_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xsg, sizeof(xsg));

done:
	lck_rw_done(&kev_rwlock);
	if (buf != NULL) {
		_FREE(buf, M_TEMP);
	}

	return error;
}

#endif /* SOCKETS */
int
fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	st->vst_size = kq->kq_count;
	if (kq->kq_state & KQ_KEV_QOS) {
		st->vst_blksize = sizeof(struct kevent_qos_s);
	} else if (kq->kq_state & KQ_KEV64) {
		st->vst_blksize = sizeof(struct kevent64_s);
	} else {
		st->vst_blksize = sizeof(struct kevent);
	}
	st->vst_mode = S_IFIFO;
	st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
	    ((struct kqworkloop *)kq)->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;

	return 0;
}

int
fill_kqueue_dyninfo(struct kqworkloop *kqwl, struct kqueue_dyninfo *kqdi)
{
	workq_threadreq_t kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = {};
	int err;

	if ((kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
		return EINVAL;
	}

	if ((err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi->kqdi_info))) {
		return err;
	}

	kqdi->kqdi_servicer = thread_tid(kqr_thread(kqr));
	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
	kqdi->kqdi_request_state = kqr->tr_state;
	kqdi->kqdi_async_qos = kqr->tr_kq_qos_index;
	kqdi->kqdi_events_qos = kqr->tr_kq_override_index;
	kqdi->kqdi_sync_waiters = 0;
	kqdi->kqdi_sync_waiter_qos = 0;

	trp.trp_value = kqwl->kqwl_params;
	if (trp.trp_flags & TRP_PRIORITY) {
		kqdi->kqdi_pri = trp.trp_pri;
	} else {
		kqdi->kqdi_pri = 0;
	}

	if (trp.trp_flags & TRP_POLICY) {
		kqdi->kqdi_pol = trp.trp_pol;
	} else {
		kqdi->kqdi_pol = 0;
	}

	if (trp.trp_flags & TRP_CPUPERCENT) {
		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
	} else {
		kqdi->kqdi_cpupercent = 0;
	}

	return err;
}
void
knote_markstayactive(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	kq_index_t qos;

	kqlock(kq);
	kn->kn_status |= KN_STAYACTIVE;

	/*
	 * Making a knote stay active is a property of the knote that must be
	 * established before it is fully attached.
	 */
	assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);

	/* handle all stayactive knotes on the (appropriate) manager */
	if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		qos = _pthread_priority_thread_qos(kn->kn_qos);
		assert(qos && qos < THREAD_QOS_LAST);
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
		qos = KQWL_BUCKET_STAYACTIVE;
	} else if (kq->kq_state & KQ_WORKQ) {
		qos = KQWQ_QOS_MANAGER;
	} else {
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos_override = qos;
	kn->kn_qos_index = qos;

	knote_activate(kq, kn, FILTER_ACTIVE);
	kqunlock(kq);
}

void
knote_clearstayactive(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);

	kqlock(kq);
	kn->kn_status &= ~(KN_STAYACTIVE | KN_ACTIVE);
	knote_dequeue(kq, kn);
	kqunlock(kq);
}
static unsigned long
kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
    unsigned long buflen, unsigned long nknotes)
{
	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
		if (kq == knote_get_kq(kn)) {
			if (nknotes < buflen) {
				struct kevent_extinfo *info = &buf[nknotes];

				info->kqext_kev = *(struct kevent_qos_s *)&kn->kn_kevent;
				if (knote_has_qos(kn)) {
					info->kqext_kev.qos =
					    _pthread_priority_thread_qos_fast(kn->kn_qos);
				} else {
					info->kqext_kev.qos = kn->kn_qos_override;
				}
				info->kqext_kev.filter |= 0xff00; /* sign extend filter */
				info->kqext_kev.xflags = 0; /* this is where sfflags lives */
				info->kqext_kev.data = 0; /* this is where sdata lives */
				info->kqext_sdata = kn->kn_sdata;
				info->kqext_status = kn->kn_status;
				info->kqext_sfflags = kn->kn_sfflags;
			}

			/* we return total number of knotes, which may be more than requested */
			nknotes++;
		}
	}

	return nknotes;
}
int
kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
    int32_t *nkqueues_out)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = p->p_fd;
	unsigned int nkqueues = 0;
	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
	size_t buflen, bufsize;
	kqueue_id_t *kq_ids = NULL;
	int err = 0;

	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
		err = EINVAL;
		goto out;
	}

	buflen = MIN(ubuflen, PROC_PIDDYNKQUEUES_MAX);

	if (buflen) {
		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
			err = ERANGE;
			goto out;
		}
		kq_ids = kheap_alloc(KHEAP_TEMP, bufsize, Z_WAITOK | Z_ZERO);
		if (!kq_ids) {
			err = ENOMEM;
			goto out;
		}
	}

	if (fdp->fd_kqhashmask > 0) {
		for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
			struct kqworkloop *kqwl;

			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				/* report the number of kqueues, even if they don't all fit */
				if (nkqueues < buflen) {
					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
				}
				nkqueues++;
			}
		}
	}

	if (kq_ids) {
		size_t copysize;
		if (os_mul_overflow(sizeof(kqueue_id_t), MIN(buflen, nkqueues), &copysize)) {
			err = ERANGE;
			goto out;
		}

		assert(ubufsize >= copysize);
		err = copyout(kq_ids, ubuf, copysize);
	}

out:
	if (kq_ids) {
		kheap_free(KHEAP_TEMP, kq_ids, bufsize);
	}

	if (!err) {
		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
	}
	return err;
}
int
kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
    uint32_t ubufsize, int32_t *size_out)
{
	proc_t p = (proc_t)proc;
	struct kqworkloop *kqwl;
	int err = 0;
	struct kqueue_dyninfo kqdi = { };

	if (ubufsize < sizeof(struct kqueue_info)) {
		return ENOBUFS;
	}

	kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
	if (!kqwl) {
		return ESRCH;
	}

	/*
	 * backward compatibility: allow the argument to this call to only be
	 * a struct kqueue_info
	 */
	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
		ubufsize = sizeof(struct kqueue_dyninfo);
		err = fill_kqueue_dyninfo(kqwl, &kqdi);
	} else {
		ubufsize = sizeof(struct kqueue_info);
		err = fill_kqueueinfo(&kqwl->kqwl_kqueue, &kqdi.kqdi_info);
	}
	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
		*size_out = ubufsize;
	}
	kqworkloop_release(kqwl);
	return err;
}

int
kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
    uint32_t ubufsize, int32_t *nknotes_out)
{
	proc_t p = (proc_t)proc;
	struct kqworkloop *kqwl;
	int err;

	kqwl = kqworkloop_hash_lookup_and_retain(p->p_fd, kq_id);
	if (!kqwl) {
		return ESRCH;
	}

	err = pid_kqueue_extinfo(p, &kqwl->kqwl_kqueue, ubuf, ubufsize, nknotes_out);
	kqworkloop_release(kqwl);
	return err;
}
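/*
 * Illustrative userspace sketch (not part of this file): the dynamic kqueue
 * introspection above is surfaced through libproc.  The helper names and
 * flavor used here (proc_list_dynkqueueids, proc_piddynkqueueinfo,
 * PROC_PIDDYNKQUEUE_INFO) and the interpretation of their return values are
 * assumptions of this sketch; verify against libproc.h and sys/proc_info.h
 * on the target SDK.
 *
 *	#include <libproc.h>
 *	#include <sys/proc_info.h>
 *	#include <stdio.h>
 *
 *	static void
 *	dump_workloops(pid_t pid)
 *	{
 *		kqueue_id_t ids[64];
 *		// return value assumed to be a count of ids; verify locally
 *		int count = proc_list_dynkqueueids(pid, ids, sizeof(ids));
 *		for (int i = 0; i < count && i < 64; i++) {
 *			struct kqueue_dyninfo kqdi;
 *			if (proc_piddynkqueueinfo(pid, PROC_PIDDYNKQUEUE_INFO,
 *			    ids[i], &kqdi, sizeof(kqdi)) > 0) {
 *				printf("workloop %llu servicer tid %llu\n",
 *				    (unsigned long long)ids[i],
 *				    (unsigned long long)kqdi.kqdi_servicer);
 *			}
 *		}
 *	}
 */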
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
    uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = p->p_fd;
	unsigned long nknotes = 0;
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = MIN(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = kheap_alloc(KHEAP_TEMP,
	    buflen * sizeof(struct kevent_extinfo), Z_WAITOK | Z_ZERO);
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}

	for (i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
	}

	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
		}
	}

	assert(bufsize >= sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * MIN(buflen, nknotes));

out:
	if (kqext) {
		kheap_free(KHEAP_TEMP, kqext, buflen * sizeof(struct kevent_extinfo));
		kqext = NULL;
	}

	if (!err) {
		*retval = (int32_t)MIN(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
static unsigned int
klist_copy_udata(struct klist *list, uint64_t *buf,
    unsigned int buflen, unsigned int nknotes)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_link) {
		if (nknotes < buflen) {
			/*
			 * kevent_register will always set kn_udata atomically
			 * so that we don't have to take any kqlock here.
			 */
			buf[nknotes] = os_atomic_load_wide(&kn->kn_udata, relaxed);
		}
		/* we return total number of knotes, which may be more than requested */
		nknotes++;
	}

	return nknotes;
}

int
kevent_proc_copy_uptrs(void *proc, uint64_t *buf, uint32_t bufsize)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = p->p_fd;
	unsigned int nuptrs = 0;
	unsigned int buflen = bufsize / sizeof(uint64_t);
	struct kqworkloop *kqwl;

	if (buflen > 0) {
		assert(buf != NULL);
	}

	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
	}

	if (fdp->fd_knhashmask != 0) {
		for (size_t i = 0; i < fdp->fd_knhashmask + 1; i++) {
			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
		}
	}

	if (fdp->fd_kqhashmask != 0) {
		for (size_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
			LIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				if (nuptrs < buflen) {
					buf[nuptrs] = kqwl->kqwl_dynamicid;
				}
				nuptrs++;
			}
		}
	}

	return (int)nuptrs;
}
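/*
 * Illustrative userspace sketch (not part of this file): the udata /
 * workloop-id dump above backs a proc_info flavor.  The flavor name
 * (PROC_PIDLISTUPTRS) and its use through proc_pidinfo() are assumptions of
 * this sketch; verify against sys/proc_info.h and libproc.h.
 *
 *	#include <libproc.h>
 *	#include <sys/proc_info.h>
 *	#include <stdio.h>
 *
 *	static void
 *	dump_uptrs(pid_t pid)
 *	{
 *		uint64_t uptrs[128];
 *		// proc_pidinfo() returns the number of bytes written
 *		int bytes = proc_pidinfo(pid, PROC_PIDLISTUPTRS, 0,
 *		    uptrs, sizeof(uptrs));
 *		for (int i = 0; i * (int)sizeof(uint64_t) < bytes; i++) {
 *			printf("0x%llx\n", (unsigned long long)uptrs[i]);
 *		}
 *	}
 */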
static void
kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
{
	uint64_t ast_addr;
	bool proc_is_64bit = !!(p->p_flag & P_LP64);
	size_t user_addr_size = proc_is_64bit ? 8 : 4;
	uint32_t ast_flags32 = 0;
	uint64_t ast_flags64 = 0;
	struct uthread *ut = get_bsdthread_info(thread);

	if (ut->uu_kqr_bound != NULL) {
		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
	}

	if (ast_flags64 == 0) {
		return;
	}

	if (!(p->p_flag & P_LP64)) {
		ast_flags32 = (uint32_t)ast_flags64;
		assert(ast_flags64 < 0x100000000ull);
	}

	ast_addr = thread_rettokern_addr(thread);
	if (ast_addr == 0) {
		return;
	}

	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
	    (user_addr_t)ast_addr,
	    user_addr_size) != 0) {
		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
		    "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
	}
}

void
kevent_ast(thread_t thread, uint16_t bits)
{
	proc_t p = current_proc();

	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
		kevent_set_return_to_kernel_user_tsd(p, thread);
	}
}
9110 #define KEVENT_SYSCTL_BOUND_ID 1
9113 kevent_sysctl SYSCTL_HANDLER_ARGS
9115 #pragma unused(oidp, arg2)
9116 uintptr_t type
= (uintptr_t)arg1
;
9117 uint64_t bound_id
= 0;
9119 if (type
!= KEVENT_SYSCTL_BOUND_ID
) {
9127 struct uthread
*ut
= get_bsdthread_info(current_thread());
9132 workq_threadreq_t kqr
= ut
->uu_kqr_bound
;
9134 if (kqr
->tr_flags
& WORKQ_TR_FLAG_WORKLOOP
) {
9135 bound_id
= kqr_kqworkloop(kqr
)->kqwl_dynamicid
;
9141 return sysctl_io_number(req
, bound_id
, sizeof(bound_id
), NULL
, NULL
);
9144 SYSCTL_NODE(_kern
, OID_AUTO
, kevent
, CTLFLAG_RW
| CTLFLAG_LOCKED
, 0,
9145 "kevent information");
9147 SYSCTL_PROC(_kern_kevent
, OID_AUTO
, bound_id
,
9148 CTLTYPE_QUAD
| CTLFLAG_RD
| CTLFLAG_LOCKED
| CTLFLAG_MASKED
,
9149 (void *)KEVENT_SYSCTL_BOUND_ID
,
9150 sizeof(kqueue_id_t
), kevent_sysctl
, "Q",
9151 "get the ID of the bound kqueue");
9153 #endif /* DEVELOPMENT || DEBUG */
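/*
 * Illustrative userspace sketch (not part of this file): reading the
 * DEVELOPMENT/DEBUG-only sysctl defined above from a bound workqueue thread.
 * The sysctl only exists on DEVELOPMENT or DEBUG kernels.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	static void
 *	print_bound_kqueue_id(void)
 *	{
 *		unsigned long long bound_id = 0;
 *		size_t len = sizeof(bound_id);
 *
 *		if (sysctlbyname("kern.kevent.bound_id", &bound_id, &len,
 *		    NULL, 0) == 0) {
 *			printf("bound kqueue id: %llu\n", bound_id);
 *		} else {
 *			perror("kern.kevent.bound_id");
 *		}
 *	}
 */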