/*
 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_event.c       1.0 (3/31/2000)
 */
#include <machine/atomic.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode_internal.h>
#include <sys/proc_info.h>
#include <sys/codesign.h>
#include <sys/pthread_shims.h>
#include <sys/kdebug.h>
#include <sys/reason.h>
#include <os/reason_private.h>
#include <pexpert/pexpert.h>

#include <kern/locks.h>
#include <kern/clock.h>
#include <kern/cpu_data.h>
#include <kern/policy_internal.h>
#include <kern/thread_call.h>
#include <kern/sched_prim.h>
#include <kern/waitq.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>
#include <kern/assert.h>
#include <kern/ast.h>
#include <kern/thread.h>
#include <kern/kcdata.h>

#include <pthread/priority_private.h>
#include <pthread/workqueue_syscalls.h>
#include <pthread/workqueue_internal.h>
#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>

#include "net/net_str_id.h"

#include <mach/task.h>
#include <libkern/section_keywords.h>

#if CONFIG_MEMORYSTATUS
#include <sys/kern_memorystatus.h>
#endif
extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h   */
extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */

#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))

MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

#define KQ_EVENT        NO_EVENT64
static int kqueue_read(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_write(struct fileproc *fp, struct uio *uio,
    int flags, vfs_context_t ctx);
static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
    vfs_context_t ctx);
static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
    vfs_context_t ctx);
static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
    struct kevent_internal_s *kev, vfs_context_t ctx);
static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);

static const struct fileops kqueueops = {
	.fo_type = DTYPE_KQUEUE,
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_ioctl = kqueue_ioctl,
	.fo_select = kqueue_select,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_drain = kqueue_drain,
};
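
/*
 * Illustrative userspace sketch (not part of this file): these fileops back
 * the descriptor returned by kqueue(2), which user code then drives through
 * kevent(2) and friends, e.g.:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, some_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);  // some_fd is hypothetical
 *	kevent(kq, &ev, 1, NULL, 0, NULL);          // register the knote
 *	kevent(kq, NULL, 0, &ev, 1, NULL);          // block for one event
 */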
static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq);
static int kevent_internal(struct proc *p,
    kqueue_id_t id, kqueue_id_t *id_out,
    user_addr_t changelist, int nchanges,
    user_addr_t eventlist, int nevents,
    user_addr_t data_out, uint64_t data_available,
    unsigned int flags, user_addr_t utimeout,
    kqueue_continue_t continuation,
    int32_t *retval);
static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
    struct proc *p, unsigned int flags);
static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
    struct proc *p, unsigned int flags);
char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);

static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev);
static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
    struct knote_lock_ctx *knlc, thread_continue_t cont,
    struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
static void kevent_register_wait_cleanup(struct knote *kn);
static inline void kqueue_release_last(struct proc *p, kqueue_t kqu);
static void kqueue_interrupt(struct kqueue *kq);
static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
    void *data);
static void kevent_continue(struct kqueue *kq, void *data, int error);
static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
    struct filt_process_s *process_data, int *countp);
static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);

static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags);

static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos);
static void kqworkq_unbind(proc_t p, struct kqrequest *kqr);
static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread);
static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);

static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index);
static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql);
static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread);
static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl);
enum {
	KQWL_UTQ_NONE,
	/*
	 * The wakeup qos is the qos of QUEUED knotes.
	 *
	 * This QoS is accounted for with the events override in the
	 * kqr_override_index field. It is raised each time a new knote is queued at
	 * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
	 * knote buckets and is recomputed after each event delivery.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_QOS,
	KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
	KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
	KQWL_UTQ_UNBINDING, /* attempt to rebind */
	/*
	 * The wakeup override is for suppressed knotes that have fired again at
	 * a higher QoS than the one for which they are suppressed already.
	 * This override is cleared when the knote suppressed list becomes empty.
	 */
	KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
	KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
	/*
	 * The QoS is the maximum QoS of an event enqueued on this workloop in
	 * userland. It is copied from the only EVFILT_WORKLOOP knote with
	 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
	 * such knote, this QoS is 0.
	 */
	KQWL_UTQ_SET_QOS_INDEX,
	KQWL_UTQ_REDRIVE_EVENTS,
};
static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);

static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
    struct filt_process_s *process_data);

static int kq_add_knote(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, struct proc *p);
static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);

static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc);
static struct knote *knote_alloc(void);
static void knote_free(struct knote *kn);

static void knote_activate(struct knote *kn);
static void knote_deactivate(struct knote *kn);

static void knote_enable(struct knote *kn);
static void knote_disable(struct knote *kn);

static int knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);

static void knote_suppress(struct knote *kn);
static void knote_unsuppress(struct knote *kn);
static void knote_wakeup(struct knote *kn);

static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn,
    int result, thread_qos_t *qos_out);
static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index);
static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
static void knote_reset_priority(struct knote *kn, pthread_priority_t pp);
static kq_index_t knote_get_qos_override_index(struct knote *kn);
static void knote_set_qos_overcommit(struct knote *kn);
static zone_t knote_zone;
static zone_t kqfile_zone;
static zone_t kqworkq_zone;
static zone_t kqworkloop_zone;

#if DEVELOPMENT || DEBUG
#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
#define KEVENT_PANIC_BOOT_ARG_INITIALIZED        (1U << 31)

#define KEVENT_PANIC_DEFAULT_VALUE (0)

static uint32_t
kevent_debug_flags(void)
{
	static uint32_t flags = KEVENT_PANIC_DEFAULT_VALUE;

	if ((flags & KEVENT_PANIC_BOOT_ARG_INITIALIZED) == 0) {
		uint32_t value = 0;
		if (!PE_parse_boot_argn("kevent_debug", &value, sizeof(value))) {
			value = KEVENT_PANIC_DEFAULT_VALUE;
		}
		value |= KEVENT_PANIC_BOOT_ARG_INITIALIZED;
		os_atomic_store(&flags, value, relaxed);
	}
	return flags;
}
#endif
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))

/* placeholder for not-yet-implemented filters */
static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
static int filt_badevent(struct knote *kn, long hint);
SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
	.f_attach = filt_badattach,
};
#if CONFIG_MEMORYSTATUS
extern const struct filterops memorystatus_filtops;
#endif /* CONFIG_MEMORYSTATUS */
extern const struct filterops fs_filtops;
extern const struct filterops sig_filtops;
extern const struct filterops machport_filtops;
extern const struct filterops pipe_rfiltops;
extern const struct filterops pipe_wfiltops;
extern const struct filterops ptsd_kqops;
extern const struct filterops ptmx_kqops;
extern const struct filterops soread_filtops;
extern const struct filterops sowrite_filtops;
extern const struct filterops sock_filtops;
extern const struct filterops soexcept_filtops;
extern const struct filterops spec_filtops;
extern const struct filterops bpfread_filtops;
extern const struct filterops necp_fd_rfiltops;
extern const struct filterops fsevent_filtops;
extern const struct filterops vnode_filtops;
extern const struct filterops tty_filtops;

const static struct filterops file_filtops;
const static struct filterops kqread_filtops;
const static struct filterops proc_filtops;
const static struct filterops timer_filtops;
const static struct filterops user_filtops;
const static struct filterops workloop_filtops;
/*
 * Rules for adding new filters to the system:
 *
 * Public filters:
 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
 *   in the exported section of the header
 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
 *   of the Public Filters section in the array.
 *
 * Private filters:
 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
 *   in the XNU_KERNEL_PRIVATE section of the header
 * - Update the EVFILTID_MAX value to reflect the new addition
 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
 *   the Private filters section of the array.
 *
 * An illustrative sketch of the private-filter case follows this comment.
 */
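
/*
 * Illustrative sketch (not part of the build): following the private-filter
 * rules above, a hypothetical "example" filter would be wired up roughly as
 * follows, assuming an EVFILTID_EXAMPLE id had been added to bsd/sys/event.h
 * and EVFILTID_MAX bumped accordingly:
 *
 *	static int  filt_exampleattach(struct knote *kn, struct kevent_internal_s *kev);
 *	static void filt_exampledetach(struct knote *kn);
 *	static int  filt_exampleevent(struct knote *kn, long hint);
 *
 *	SECURITY_READ_ONLY_EARLY(static struct filterops) example_filtops = {
 *		.f_attach = filt_exampleattach,
 *		.f_detach = filt_exampledetach,
 *		.f_event  = filt_exampleevent,
 *	};
 *
 * and a "[EVFILTID_EXAMPLE] = &example_filtops," entry would go at the end of
 * the Private filters section of the sysfilt_ops array below.
 */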
SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
	/* Public Filters */
	[~EVFILT_READ]                  = &file_filtops,
	[~EVFILT_WRITE]                 = &file_filtops,
	[~EVFILT_AIO]                   = &bad_filtops,
	[~EVFILT_VNODE]                 = &file_filtops,
	[~EVFILT_PROC]                  = &proc_filtops,
	[~EVFILT_SIGNAL]                = &sig_filtops,
	[~EVFILT_TIMER]                 = &timer_filtops,
	[~EVFILT_MACHPORT]              = &machport_filtops,
	[~EVFILT_FS]                    = &fs_filtops,
	[~EVFILT_USER]                  = &user_filtops,
	[~EVFILT_VM]                    = &bad_filtops,
	[~EVFILT_SOCK]                  = &file_filtops,
#if CONFIG_MEMORYSTATUS
	[~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
#else
	[~EVFILT_MEMORYSTATUS]          = &bad_filtops,
#endif
	[~EVFILT_EXCEPT]                = &file_filtops,
	[~EVFILT_WORKLOOP]              = &workloop_filtops,

	/* Private filters */
	[EVFILTID_KQREAD]               = &kqread_filtops,
	[EVFILTID_PIPE_R]               = &pipe_rfiltops,
	[EVFILTID_PIPE_W]               = &pipe_wfiltops,
	[EVFILTID_PTSD]                 = &ptsd_kqops,
	[EVFILTID_SOREAD]               = &soread_filtops,
	[EVFILTID_SOWRITE]              = &sowrite_filtops,
	[EVFILTID_SCK]                  = &sock_filtops,
	[EVFILTID_SOEXCEPT]             = &soexcept_filtops,
	[EVFILTID_SPEC]                 = &spec_filtops,
	[EVFILTID_BPFREAD]              = &bpfread_filtops,
	[EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
	[EVFILTID_FSEVENT]              = &fsevent_filtops,
	[EVFILTID_VN]                   = &vnode_filtops,
	[EVFILTID_TTY]                  = &tty_filtops,
	[EVFILTID_PTMX]                 = &ptmx_kqops,
};

/* waitq prepost callback */
void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
static inline struct kqworkloop *
kqr_kqworkloop(struct kqrequest *kqr)
{
	if (kqr->kqr_state & KQR_WORKLOOP) {
		return __container_of(kqr, struct kqworkloop, kqwl_request);
	}
	return NULL;
}

static inline kqueue_t
kqr_kqueue(proc_t p, struct kqrequest *kqr)
{
	kqueue_t kqu;
	if (kqr->kqr_state & KQR_WORKLOOP) {
		kqu.kqwl = kqr_kqworkloop(kqr);
	} else {
		kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
		assert(kqr >= kqu.kqwq->kqwq_request &&
		    kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
	}
	return kqu;
}

static inline boolean_t
is_workqueue_thread(thread_t thread)
{
	return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
}
/*
 * kqueue/note lock implementations
 *
 *	The kqueue lock guards the kq state, the state of its queues,
 *	and the kqueue-aware status and locks of individual knotes.
 *
 *	The kqueue workq lock is used to protect state guarding the
 *	interaction of the kqueue with the workq.  This state cannot
 *	be guarded by the kq lock - as it needs to be taken when we
 *	already have the waitq set lock held (during the waitq hook
 *	callback).  It might be better to use the waitq lock itself
 *	for this, but the IRQ requirements make that difficult.
 *
 *	Knote flags, filter flags, and associated data are protected
 *	by the underlying object lock - and are only ever looked at
 *	by calling the filter to get a [consistent] snapshot of that
 *	data.
 */
static lck_grp_attr_t *kq_lck_grp_attr;
static lck_grp_t *kq_lck_grp;
static lck_attr_t *kq_lck_attr;
static inline void
kqlock(kqueue_t kqu)
{
	lck_spin_lock(&kqu.kq->kq_lock);
}

static inline void
kqlock_held(__assert_only kqueue_t kqu)
{
	LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
}

static inline void
kqunlock(kqueue_t kqu)
{
	lck_spin_unlock(&kqu.kq->kq_lock);
}

static inline void
kq_req_lock(kqueue_t kqu)
{
	assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
	lck_spin_lock(&kqu.kq->kq_reqlock);
}

static inline void
kq_req_unlock(kqueue_t kqu)
{
	assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
	lck_spin_unlock(&kqu.kq->kq_reqlock);
}

static inline void
kq_req_held(__assert_only kqueue_t kqu)
{
	assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
	LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED);
}

static void
knhash_lock(proc_t p)
{
	lck_mtx_lock(&p->p_fd->fd_knhashlock);
}

static void
knhash_unlock(proc_t p)
{
	lck_mtx_unlock(&p->p_fd->fd_knhashlock);
}

#pragma mark knote locks

/*
 * Enum used by the knote_lock_* functions.
 *
 * KNOTE_KQ_LOCK_ALWAYS
 *   The function will always return with the kq lock held.
 *
 * KNOTE_KQ_LOCK_ON_SUCCESS
 *   The function will return with the kq lock held if it was successful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_LOCK_ON_FAILURE
 *   The function will return with the kq lock held if it was unsuccessful
 *   (knote_lock() is the only function that can fail).
 *
 * KNOTE_KQ_UNLOCK
 *   The function returns with the kq unlocked.
 */
#define KNOTE_KQ_LOCK_ALWAYS      0x0
#define KNOTE_KQ_LOCK_ON_SUCCESS  0x1
#define KNOTE_KQ_LOCK_ON_FAILURE  0x2
#define KNOTE_KQ_UNLOCK           0x3
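
/*
 * Illustrative call pattern (sketch, not part of the build): a caller that
 * wants the kq lock back regardless of outcome would use the flags above as:
 *
 *	struct knote_lock_ctx knlc;
 *	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
 *		// the knote was dropped while we waited; kq lock is held
 *	} else {
 *		// ... operate on the locked knote ...
 *		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
 *	}
 */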
#if DEBUG || DEVELOPMENT
__attribute__((noinline, not_tail_called, disable_tail_calls))
void
knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
{
	/* evil hackery to make sure no one forgets to unlock */
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
}
#endif

static struct knote_lock_ctx *
knote_lock_ctx_find(struct kqueue *kq, struct knote *kn)
{
	struct knote_lock_ctx *ctx;
	LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) {
		if (ctx->knlc_knote == kn) return ctx;
	}
	panic("knote lock context not found: %p", kn);
}

/* slowpath of knote_lock() */
__attribute__((noinline))
static bool __result_use_check
knote_lock_slow(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	kqlock_held(kq);

	struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn);
	thread_t owner_thread = owner_lc->knlc_thread;

#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
#endif

	thread_reference(owner_thread);
	TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe);
	assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	kqunlock(kq);

	if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) {
		if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
		    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
			kqlock(kq);
		}
#if DEBUG || DEVELOPMENT
		assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
		knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
		return false;
	}
#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif
	if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
	    kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
		kqlock(kq);
	}
	return true;
}
/*
 * Attempts to take the "knote" lock.
 *
 * Called with the kqueue lock held.
 *
 * Returns true if the knote lock is acquired, false if it has been dropped
 */
static bool __result_use_check
knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
    int kqlocking)
{
	kqlock_held(kq);

#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
#endif
	knlc->knlc_knote = kn;
	knlc->knlc_thread = current_thread();
	TAILQ_INIT(&knlc->knlc_head);

	if (__improbable(kn->kn_status & KN_LOCKED)) {
		return knote_lock_slow(kq, kn, knlc, kqlocking);
	}

	/*
	 * When the knote will be dropped, the knote lock is taken before
	 * KN_DROPPING is set, and then the knote will be removed from any
	 * hash table that references it before the lock is canceled.
	 */
	assert((kn->kn_status & KN_DROPPING) == 0);
	LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le);
	kn->kn_status |= KN_LOCKED;
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kq);
	}
	return true;
}
/*
 * Unlocks a knote successfully locked with knote_lock().
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* flags
 */
static void
knote_unlock(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int flags)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
#if DEBUG || DEVELOPMENT
	assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
#endif

	struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head);

	LIST_REMOVE(knlc, knlc_le);

	if (next_owner_lc) {
		assert(next_owner_lc->knlc_knote == kn);
		TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe);

		assert(TAILQ_EMPTY(&next_owner_lc->knlc_head));
		TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe);
		LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le);
#if DEBUG || DEVELOPMENT
		next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
#endif
	} else {
		kn->kn_status &= ~KN_LOCKED;
	}
	if (kn->kn_inuse == 0) {
		/*
		 * No f_event() in flight anymore, we can leave QoS "Merge" mode
		 *
		 * See knote_should_apply_qos_override()
		 */
		kn->kn_status &= ~KN_MERGE_QOS;
	}
	if (flags & KNOTE_KQ_UNLOCK) {
		kqunlock(kq);
	}
	if (next_owner_lc) {
		thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
/*
 * Aborts all waiters for a knote lock, and unlock the knote.
 *
 * Called with the kqueue lock held.
 *
 * Returns with the kqueue lock held according to KNOTE_KQ_* flags
 */
static void
knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
    struct knote_lock_ctx *knlc, int kqlocking)
{
	kqlock_held(kq);

	assert(knlc->knlc_knote == kn);
	assert(kn->kn_status & KN_LOCKED);
	assert(kn->kn_status & KN_DROPPING);

	LIST_REMOVE(knlc, knlc_le);
	kn->kn_status &= ~KN_LOCKED;

	if (kqlocking == KNOTE_KQ_UNLOCK ||
	    kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
		kqunlock(kq);
	}
	if (!TAILQ_EMPTY(&knlc->knlc_head)) {
		thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART);
	}
#if DEBUG || DEVELOPMENT
	knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
#endif
}
/*
 * Call the f_event hook of a given filter.
 *
 * Takes a use count to protect against concurrent drops.
 */
static void
knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
{
	int result, dropping = 0;

	kqlock_held(kq);

	if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
		return;

	kn->kn_inuse++;
	kqunlock(kq);
	result = filter_call(knote_fops(kn), f_event(kn, hint));
	kqlock(kq);

	dropping = (kn->kn_status & KN_DROPPING);

	if (!dropping && (result & FILTER_ACTIVE)) {
		if (result & FILTER_ADJUST_EVENT_QOS_BIT)
			knote_adjust_qos(kq, kn, result);
		knote_activate(kn);
	}

	if (--kn->kn_inuse == 0) {
		if ((kn->kn_status & KN_LOCKED) == 0) {
			/*
			 * We're the last f_event() call and there's no other f_* call in
			 * flight, we can leave QoS "Merge" mode.
			 *
			 * See knote_should_apply_qos_override()
			 */
			kn->kn_status &= ~KN_MERGE_QOS;
		}
		if (dropping) {
			waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
			    CAST_EVENT64_T(&kn->kn_inuse),
			    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
		}
	}
}
/*
 * Called by knote_drop() to wait for the last f_event() caller to be done.
 *
 *	- kq locked at entry
 *	- kq unlocked at exit
 */
static void
knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
{
	wait_result_t wr = THREAD_NOT_WAITING;

	kqlock_held(kq);

	assert(kn->kn_status & KN_DROPPING);

	if (kn->kn_inuse) {
		wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
		    CAST_EVENT64_T(&kn->kn_inuse),
		    THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
	}
	kqunlock(kq);

	if (wr == THREAD_WAITING) {
		thread_block(THREAD_CONTINUE_NULL);
	}
}
#pragma mark file_filtops

static int
filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
{
	return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current());
}

SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
#pragma mark kqread_filtops

#define f_flag f_fglob->fg_flag
#define f_ops f_fglob->fg_ops
#define f_data f_fglob->fg_data

static void
filt_kqdetach(struct knote *kn)
{
	struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
	struct kqueue *kq = &kqf->kqf_kqueue;

	kqlock(kq);
	KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
	kqunlock(kq);
}

static int
filt_kqueue(struct knote *kn, __unused long hint)
{
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;

	return (kq->kq_count > 0);
}

static int
filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
{
#pragma unused(kev)
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
	int res;

	kqlock(kq);
	kn->kn_data = kq->kq_count;
	res = (kn->kn_data > 0);
	kqunlock(kq);

	return res;
}

static int
filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
	int res;

	kqlock(kq);
	kn->kn_data = kq->kq_count;
	res = (kn->kn_data > 0);
	if (res) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR)
			kn->kn_data = 0;
	}
	kqunlock(kq);

	return res;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
	.f_touch = filt_kqtouch,
	.f_process = filt_kqprocess,
};
#pragma mark proc_filtops

static int
filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct proc *p;

	assert(PID_MAX < NOTE_PDATAMASK);

	if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
		knote_set_error(kn, ENOTSUP);
		return 0;
	}

	p = proc_find(kn->kn_id);
	if (p == NULL) {
		knote_set_error(kn, ESRCH);
		return 0;
	}

	const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;

	if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
		do {
			pid_t selfpid = proc_selfpid();

			if (p->p_ppid == selfpid)
				break;	/* parent => ok */

			if ((p->p_lflag & P_LTRACED) != 0 &&
			    (p->p_oppid == selfpid))
				break;	/* parent-in-waiting => ok */

			proc_rele(p);
			knote_set_error(kn, EACCES);
			return 0;
		} while (0);

	proc_klist_lock();

	kn->kn_ptr.p_proc = p;	/* store the proc handle */

	KNOTE_ATTACH(&p->p_klist, kn);

	proc_klist_unlock();

	proc_rele(p);

	/*
	 * only captures edge-triggered events after this point
	 * so it can't already be fired.
	 */
	return (0);
}
/*
 * The knote may be attached to a different process, which may exit,
 * leaving nothing for the knote to be attached to.  In that case,
 * the pointer to the process will have already been nulled out.
 */
static void
filt_procdetach(struct knote *kn)
{
	struct proc *p;

	proc_klist_lock();

	p = kn->kn_ptr.p_proc;
	if (p != PROC_NULL) {
		kn->kn_ptr.p_proc = PROC_NULL;
		KNOTE_DETACH(&p->p_klist, kn);
	}

	proc_klist_unlock();
}
static int
filt_proc(struct knote *kn, long hint)
{
	u_int event;

	/* ALWAYS CALLED WITH proc_klist_lock */

	/*
	 * Note: a lot of bits in hint may be obtained from the knote
	 * To free some of those bits, see <rdar://problem/12592988> Freeing up
	 * bits in hint for filt_proc
	 *
	 * mask off extra data
	 */
	event = (u_int)hint & NOTE_PCTRLMASK;

	/*
	 * termination lifecycle events can happen while a debugger
	 * has reparented a process, in which case notifications
	 * should be quashed except to the tracing parent. When
	 * the debugger reaps the child (either via wait4(2) or
	 * process exit), the child will be reparented to the original
	 * parent and these knotes re-fired.
	 */
	if (event & NOTE_EXIT) {
		if ((kn->kn_ptr.p_proc->p_oppid != 0)
		    && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
			/*
			 * This knote is not for the current ptrace(2) parent, ignore.
			 */
			return 0;
		}
	}

	/*
	 * if the user is interested in this event, record it.
	 */
	if (kn->kn_sfflags & event)
		kn->kn_fflags |= event;

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
	}
#pragma clang diagnostic pop

	/*
	 * The kernel has a wrapper in place that returns the same data
	 * as is collected here, in kn_data.  Any changes to how
	 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
	 * should also be reflected in the proc_pidnoteexit() wrapper.
	 */
	if (event == NOTE_EXIT) {
		kn->kn_data = 0;
		if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
			kn->kn_fflags |= NOTE_EXITSTATUS;
			kn->kn_data |= (hint & NOTE_PDATAMASK);
		}
		if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
			kn->kn_fflags |= NOTE_EXIT_DETAIL;
			if ((kn->kn_ptr.p_proc->p_lflag &
			    P_LTERM_DECRYPTFAIL) != 0) {
				kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
			}
			if ((kn->kn_ptr.p_proc->p_lflag &
			    P_LTERM_JETSAM) != 0) {
				kn->kn_data |= NOTE_EXIT_MEMORY;
				switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
				case P_JETSAM_VMPAGESHORTAGE:
					kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
					break;
				case P_JETSAM_VMTHRASHING:
					kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
					break;
				case P_JETSAM_FCTHRASHING:
					kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
					break;
				case P_JETSAM_VNODE:
					kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
					break;
				case P_JETSAM_HIWAT:
					kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
					break;
				case P_JETSAM_PID:
					kn->kn_data |= NOTE_EXIT_MEMORY_PID;
					break;
				case P_JETSAM_IDLEEXIT:
					kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
					break;
				}
			}
			if ((kn->kn_ptr.p_proc->p_csflags &
			    CS_KILLED) != 0) {
				kn->kn_data |= NOTE_EXIT_CSERROR;
			}
		}
	}

	/* if we have any matching state, activate the knote */
	return (kn->kn_fflags != 0);
}
static int
filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
{
	int res;

	proc_klist_lock();

	/* accept new filter flags and mask off output events no long interesting */
	kn->kn_sfflags = kev->fflags;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kn->kn_sfflags;

	res = (kn->kn_fflags != 0);

	proc_klist_unlock();

	return res;
}
static int
filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
	int res;

	proc_klist_lock();
	res = (kn->kn_fflags != 0);
	if (res) {
		*kev = kn->kn_kevent;
		kn->kn_flags |= EV_CLEAR;	/* automatically set */
		kn->kn_fflags = 0;
		kn->kn_data = 0;
	}
	proc_klist_unlock();

	return res;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
	.f_touch = filt_proctouch,
	.f_process = filt_procprocess,
};
#pragma mark timer_filtops

struct filt_timer_params {
	uint64_t deadline; /* deadline in abs/cont time
	                      (or 0 if NOTE_ABSOLUTE and deadline is in past) */
	uint64_t leeway;   /* leeway in abstime, or 0 if none */
	uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
};

/*
 * Values stored in the knote at rest (using Mach absolute time units)
 *
 * kn->kn_hook          where the thread_call object is stored
 * kn->kn_ext[0]        next deadline or 0 if immediate expiration
 * kn->kn_ext[1]        leeway value
 * kn->kn_sdata         interval timer: the interval
 *                      absolute/deadline timer: 0
 * kn->kn_hookid        timer state
 *
 * TIMER_IDLE:
 *   The timer has either never been scheduled or been cancelled.
 *   It is safe to schedule a new one in this state.
 *
 * TIMER_ARMED:
 *   The timer has been scheduled
 *
 * TIMER_FIRED:
 *   The timer has fired and an event needs to be delivered.
 *   When in this state, the callout may still be running.
 *
 * TIMER_IMMEDIATE:
 *   The timer has fired at registration time, and the callout was never
 *   dispatched.
 */
#define TIMER_IDLE       0x0
#define TIMER_ARMED      0x1
#define TIMER_FIRED      0x2
#define TIMER_IMMEDIATE  0x3
static void
filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
{
	kn->kn_ext[0] = params->deadline;
	kn->kn_ext[1] = params->leeway;
	kn->kn_sdata  = params->interval;
}
/*
 * filt_timervalidate - process data from user
 *
 * Sets up the deadline, interval, and leeway from the provided user data
 *
 * Input:
 *	kn_sdata        timer deadline or interval time
 *	kn_sfflags      style of timer, unit of measurement
 *
 * Output:
 *	struct filter_timer_params to apply to the filter with
 *	filt_timer_set_params when changes are ready to be commited.
 *
 * Returns:
 *	EINVAL          Invalid user data parameters
 *	ERANGE          Various overflows with the parameters
 *
 * Called with timer filter lock held.
 */
static int
filt_timervalidate(const struct kevent_internal_s *kev,
    struct filt_timer_params *params)
{
	/*
	 * There are 5 knobs that need to be chosen for a timer registration:
	 *
	 * A) Units of time (what is the time duration of the specified number)
	 *	Absolute and interval take:
	 *		NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
	 *	Defaults to milliseconds if not specified
	 *
	 * B) Clock epoch (what is the zero point of the specified number)
	 *	For interval, there is none
	 *	For absolute, defaults to the gettimeofday/calendar epoch
	 *	With NOTE_MACHTIME, uses mach_absolute_time()
	 *	With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
	 *
	 * C) The knote's behavior on delivery
	 *	Interval timer causes the knote to arm for the next interval unless one-shot is set
	 *	Absolute is a forced one-shot timer which deletes on delivery
	 *	TODO: Add a way for absolute to be not forced one-shot
	 *
	 * D) Whether the time duration is relative to now or absolute
	 *	Interval fires at now + duration when it is set up
	 *	Absolute fires at now + difference between now walltime and passed in walltime
	 *	With NOTE_MACHTIME it fires at an absolute MAT or MCT.
	 *
	 * E) Whether the timer continues to tick across sleep
	 *	By default all three do not.
	 *	For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
	 *	With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
	 *		expires when mach_continuous_time() is > the passed in value.
	 */

	uint64_t multiplier;

	boolean_t use_abstime = FALSE;

	switch (kev->fflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) {
	case NOTE_SECONDS:
		multiplier = NSEC_PER_SEC;
		break;
	case NOTE_USECONDS:
		multiplier = NSEC_PER_USEC;
		break;
	case NOTE_NSECONDS:
		multiplier = 1;
		break;
	case NOTE_MACHTIME:
		multiplier = 0;
		use_abstime = TRUE;
		break;
	case 0: /* milliseconds (default) */
		multiplier = NSEC_PER_SEC / 1000;
		break;
	default:
		return (EINVAL);
	}

	/* transform the leeway in kn_ext[1] to same time scale */
	if (kev->fflags & NOTE_LEEWAY) {
		uint64_t leeway_abs;

		if (use_abstime) {
			leeway_abs = (uint64_t)kev->ext[1];
		} else {
			uint64_t leeway_ns;
			if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns))
				return (ERANGE);

			nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
		}

		params->leeway = leeway_abs;
	} else {
		params->leeway = 0;
	}

	if (kev->fflags & NOTE_ABSOLUTE) {
		uint64_t deadline_abs;

		if (use_abstime) {
			deadline_abs = (uint64_t)kev->data;
		} else {
			uint64_t calendar_deadline_ns;

			if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns))
				return (ERANGE);

			/* calendar_deadline_ns is in nanoseconds since the epoch */

			clock_sec_t seconds;
			clock_nsec_t nanoseconds;

			/*
			 * Note that the conversion through wall-time is only done once.
			 *
			 * If the relationship between MAT and gettimeofday changes,
			 * the underlying timer does not update.
			 *
			 * TODO: build a wall-time denominated timer_call queue
			 * and a flag to request DTRTing with wall-time timers
			 */
			clock_get_calendar_nanotime(&seconds, &nanoseconds);

			uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;

			/* if deadline is in the future */
			if (calendar_now_ns < calendar_deadline_ns) {
				uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
				uint64_t interval_abs;

				nanoseconds_to_absolutetime(interval_ns, &interval_abs);

				/*
				 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
				 * causes the timer to keep ticking across sleep, but
				 * it does not change the calendar timebase.
				 */

				if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
					clock_continuoustime_interval_to_deadline(interval_abs,
					    &deadline_abs);
				else
					clock_absolutetime_interval_to_deadline(interval_abs,
					    &deadline_abs);
			} else {
				deadline_abs = 0; /* cause immediate expiration */
			}
		}

		params->deadline = deadline_abs;
		params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
	} else if (kev->data < 0) {
		/*
		 * Negative interval timers fire immediately, once.
		 *
		 * Ideally a negative interval would be an error, but certain clients
		 * pass negative values on accident, and expect an event back.
		 *
		 * In the old implementation the timer would repeat with no delay
		 * N times until mach_absolute_time() + (N * interval) underflowed,
		 * then it would wait ~forever by accidentally arming a timer for the far future.
		 *
		 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
		 */

		params->deadline = 0; /* expire immediately */
		params->interval = 0; /* non-repeating */
	} else {
		uint64_t interval_abs = 0;

		if (use_abstime) {
			interval_abs = (uint64_t)kev->data;
		} else {
			uint64_t interval_ns;
			if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns))
				return (ERANGE);

			nanoseconds_to_absolutetime(interval_ns, &interval_abs);
		}

		uint64_t deadline = 0;

		if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
			clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
		else
			clock_absolutetime_interval_to_deadline(interval_abs, &deadline);

		params->deadline = deadline;
		params->interval = interval_abs;
	}

	return (0);
}
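
/*
 * Illustrative userspace sketch (not part of this file): the knobs described
 * above correspond to flags passed in from user space via kevent64(2).
 * A plain 5-second repeating interval timer (knob A = NOTE_SECONDS, no epoch,
 * re-arms on delivery, relative to now, does not tick across sleep) would be
 * registered roughly like this:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent64_s ev;
 *	EV_SET64(&ev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, 0, 0, 0);
 *	kevent64(kq, &ev, 1, NULL, 0, 0, NULL);
 *
 * Adding NOTE_ABSOLUTE (with a calendar- or NOTE_MACHTIME-denominated data
 * value) turns it into the forced one-shot deadline form handled above, and
 * NOTE_MACH_CONTINUOUS_TIME makes it tick across sleep.
 */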
/*
 * filt_timerexpire - the timer callout routine
 */
static void
filt_timerexpire(void *knx, __unused void *spare)
{
	struct knote *kn = knx;
	int v;

	if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED,
	    &v, relaxed)) {
		// our f_event always would say FILTER_ACTIVE,
		// so be leaner and just do it.
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_activate(kn);
		kqunlock(kq);
	} else {
		/*
		 * From TIMER_ARMED, the only allowed transition are:
		 * - to TIMER_FIRED through the timer callout just above
		 * - to TIMER_IDLE due to filt_timercancel() which will wait for the
		 *   timer callout (and any possible invocation of filt_timerexpire) to
		 *   have finished before the state is changed again.
		 */
		assert(v == TIMER_IDLE);
	}
}

static void
filt_timercancel(struct knote *kn)
{
	if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) {
		/* cancel the thread call and wait for any filt_timerexpire in flight */
		thread_call_cancel_wait((thread_call_t)kn->kn_hook);
	}
}
/*
 * Does this deadline need a timer armed for it, or has it expired?
 */
static bool
filt_timer_is_ready(struct knote *kn)
{
	uint64_t now, deadline = kn->kn_ext[0];

	if (deadline == 0) {
		return true;
	}

	if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
		now = mach_continuous_time();
	} else {
		now = mach_absolute_time();
	}
	return deadline <= now;
}
/*
 * Arm a timer
 *
 * It is the responsibility of the caller to make sure the timer call
 * has completed or been cancelled properly prior to arming it.
 */
static void
filt_timerarm(struct knote *kn)
{
	uint64_t deadline = kn->kn_ext[0];
	uint64_t leeway   = kn->kn_ext[1];

	int filter_flags = kn->kn_sfflags;
	unsigned int timer_flags = 0;

	assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE);

	if (filter_flags & NOTE_CRITICAL)
		timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
	else if (filter_flags & NOTE_BACKGROUND)
		timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
	else
		timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;

	if (filter_flags & NOTE_LEEWAY)
		timer_flags |= THREAD_CALL_DELAY_LEEWAY;

	if (filter_flags & NOTE_MACH_CONTINUOUS_TIME)
		timer_flags |= THREAD_CALL_CONTINUOUS;

	os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed);
	thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL,
	    deadline, leeway, timer_flags);
}
/*
 * Allocate a thread call for the knote's lifetime, and kick off the timer.
 */
static int
filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
{
	thread_call_t callout;
	struct filt_timer_params params;
	int error;

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		knote_set_error(kn, error);
		return 0;
	}

	callout = thread_call_allocate_with_options(filt_timerexpire,
	    (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
	    THREAD_CALL_OPTIONS_ONCE);

	if (NULL == callout) {
		knote_set_error(kn, ENOMEM);
		return 0;
	}

	filt_timer_set_params(kn, &params);
	kn->kn_hook = callout;
	kn->kn_flags |= EV_CLEAR;
	os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);

	/* NOTE_ABSOLUTE implies EV_ONESHOT */
	if (kn->kn_sfflags & NOTE_ABSOLUTE)
		kn->kn_flags |= EV_ONESHOT;

	if (filt_timer_is_ready(kn)) {
		os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE;
	} else {
		filt_timerarm(kn);
		return 0;
	}
}
/*
 * Shut down the timer if it's running, and free the callout.
 */
static void
filt_timerdetach(struct knote *kn)
{
	__assert_only boolean_t freed;

	/*
	 * Unconditionally cancel to make sure there can't be any filt_timerexpire()
	 * running anymore.
	 */
	thread_call_cancel_wait((thread_call_t)kn->kn_hook);
	freed = thread_call_free((thread_call_t)kn->kn_hook);
	assert(freed);
}
/*
 * filt_timertouch - update timer knote with new user input
 *
 * Cancel and restart the timer based on new user data. When
 * the user picks up a knote, clear the count of how many timer
 * pops have gone off (in kn_data).
 */
static int
filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct filt_timer_params params;
	uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
	int error;

	if (changed_flags & NOTE_ABSOLUTE) {
		kev->flags |= EV_ERROR;
		kev->data = EINVAL;
		return 0;
	}

	if ((error = filt_timervalidate(kev, &params)) != 0) {
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	/* capture the new values used to compute deadline */
	filt_timercancel(kn);
	filt_timer_set_params(kn, &params);
	kn->kn_sfflags = kev->fflags;

	if (filt_timer_is_ready(kn)) {
		os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
		return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
	} else {
		filt_timerarm(kn);
		return FILTER_UPDATE_REQ_QOS;
	}
}
/*
 * filt_timerprocess - query state of knote and snapshot event data
 *
 * Determine if the timer has fired in the past, snapshot the state
 * of the kevent for returning to user-space, and clear pending event
 * counters for the next time.
 */
static int
filt_timerprocess(
	struct knote *kn,
	__unused struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
	/*
	 * filt_timerprocess is serialized with any filter routine except for
	 * filt_timerexpire which atomically does a TIMER_ARMED -> TIMER_FIRED
	 * transition, and on success, activates the knote.
	 *
	 * Hence, we don't need atomic modifications of the state, only to peek at
	 * whether we see any of the "FIRED" state, and if we do, it is safe to
	 * do simple state machine transitions.
	 */
	switch (os_atomic_load(&kn->kn_hookid, relaxed)) {
	case TIMER_IDLE:
	case TIMER_ARMED:
		/*
		 * This can happen if a touch resets a timer that had fired
		 * without being processed
		 */
		return 0;
	}

	os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);

	/*
	 * Copy out the interesting kevent state,
	 * but don't leak out the raw time calculations.
	 *
	 * TODO: potential enhancements - tell the user about:
	 *      - deadline to which this timer thought it was expiring
	 *      - return kn_sfflags in the fflags field so the client can know
	 *        under what flags the timer fired
	 */
	*kev = kn->kn_kevent;
	kev->ext[0] = 0;
	/* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */

	if (kn->kn_sdata == 0) {
		kev->data = 1;
	} else {
		/*
		 * This is a 'repeating' timer, so we have to emit
		 * how many intervals expired between the arm
		 * and the process.
		 *
		 * A very strange style of interface, because
		 * this could easily be done in the client...
		 */

		uint64_t now;

		if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
			now = mach_continuous_time();
		else
			now = mach_absolute_time();

		uint64_t first_deadline = kn->kn_ext[0];
		uint64_t interval_abs   = kn->kn_sdata;
		uint64_t orig_arm_time  = first_deadline - interval_abs;

		assert(now > orig_arm_time);
		assert(now > first_deadline);

		uint64_t elapsed = now - orig_arm_time;

		uint64_t num_fired = elapsed / interval_abs;

		/*
		 * To reach this code, we must have seen the timer pop
		 * and be in repeating mode, so therefore it must have been
		 * more than 'interval' time since the attach or last
		 * successful touch.
		 */
		assert(num_fired > 0);

		/* report how many intervals have elapsed to the user */
		kev->data = (int64_t)num_fired;

		/* We only need to re-arm the timer if it's not about to be destroyed */
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			/* fire at the end of the next interval */
			uint64_t new_deadline = first_deadline + num_fired * interval_abs;

			assert(new_deadline > now);

			kn->kn_ext[0] = new_deadline;

			/*
			 * This can't shortcut setting up the thread call, because
			 * knote_process deactivates EV_CLEAR knotes unconditionally.
			 */
			filt_timerarm(kn);
		}
	}

	return FILTER_ACTIVE;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
	.f_extended_codes = true,
	.f_attach   = filt_timerattach,
	.f_detach   = filt_timerdetach,
	.f_event    = filt_badevent,
	.f_touch    = filt_timertouch,
	.f_process  = filt_timerprocess,
};
#pragma mark user_filtops

static int
filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	if (kn->kn_sfflags & NOTE_TRIGGER) {
		kn->kn_hookid = FILTER_ACTIVE;
	} else {
		kn->kn_hookid = 0;
	}
	return (kn->kn_hookid);
}

static void
filt_userdetach(__unused struct knote *kn)
{
	/* EVFILT_USER knotes are not attached to anything in the kernel */
}

static int
filt_usertouch(struct knote *kn, struct kevent_internal_s *kev)
{
	uint32_t ffctrl;
	int fflags;

	ffctrl = kev->fflags & NOTE_FFCTRLMASK;
	fflags = kev->fflags & NOTE_FFLAGSMASK;
	switch (ffctrl) {
	case NOTE_FFNOP:
		break;
	case NOTE_FFAND:
		kn->kn_sfflags &= fflags;
		break;
	case NOTE_FFOR:
		kn->kn_sfflags |= fflags;
		break;
	case NOTE_FFCOPY:
		kn->kn_sfflags = fflags;
		break;
	}
	kn->kn_sdata = kev->data;

	if (kev->fflags & NOTE_TRIGGER) {
		kn->kn_hookid = FILTER_ACTIVE;
	}
	return (int)kn->kn_hookid;
}

static int
filt_userprocess(
	struct knote *kn,
	__unused struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
	int result = (int)kn->kn_hookid;

	if (result) {
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
	}

	return result;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
	.f_extended_codes = true,
	.f_attach  = filt_userattach,
	.f_detach  = filt_userdetach,
	.f_event   = filt_badevent,
	.f_touch   = filt_usertouch,
	.f_process = filt_userprocess,
};
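
/*
 * Illustrative userspace sketch (not part of this file): EVFILT_USER events
 * are registered once and later fired with NOTE_TRIGGER, e.g.:
 *
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);               // register
 *
 *	EV_SET(&ev, 42, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);               // fire it
 *
 * The NOTE_FFAND/NOTE_FFOR/NOTE_FFCOPY control bits handled in
 * filt_usertouch() above modify the saved fflags on each touch.
 */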
#pragma mark workloop_filtops

static inline void
filt_wllock(struct kqworkloop *kqwl)
{
	lck_mtx_lock(&kqwl->kqwl_statelock);
}

static inline void
filt_wlunlock(struct kqworkloop *kqwl)
{
	lck_mtx_unlock(&kqwl->kqwl_statelock);
}

/*
 * Returns true when the interlock for the turnstile is the workqueue lock
 *
 * When this is the case, all turnstiles operations are delegated
 * to the workqueue subsystem.
 *
 * This is required because kqueue_threadreq_bind_prepost only holds the
 * workqueue lock but needs to move the inheritor from the workloop turnstile
 * away from the creator thread, so that this now fulfilled request cannot be
 * picked anymore by other threads.
 */
static bool
filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	return (kqr->kqr_state & KQR_THREQUESTED) &&
	    (kqr->kqr_thread == THREAD_NULL);
}

static void
filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
    turnstile_update_flags_t flags)
{
	turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
	struct kqrequest *kqr = &kqwl->kqwl_request;

	/*
	 * binding to the workq should always happen through
	 * workq_kern_threadreq_update_inheritor()
	 */
	assert(!filt_wlturnstile_interlock_is_workq(kqwl));

	if ((inheritor = kqwl->kqwl_owner)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	} else if ((inheritor = kqr->kqr_thread)) {
		flags |= TURNSTILE_INHERITOR_THREAD;
	}

	turnstile_update_inheritor(ts, inheritor, flags);
}
#define FILT_WLATTACH 0
#define FILT_WLTOUCH  1
#define FILT_WLDROP   2

__result_use_check
static int
filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
    struct kevent_internal_s *kev, kq_index_t qos_index, int op)
{
	user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
	struct kqrequest *kqr = &kqwl->kqwl_request;
	thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
	kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED;
	int action = KQWL_UTQ_NONE, error = 0;
	bool needs_wake = false, needs_wllock = false;
	uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
	uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
	uint64_t udata = 0;

	if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) {
		/*
		 * If we're maybe going to change the kqwl_owner,
		 * then we need to hold the filt_wllock().
		 */
		needs_wllock = true;
	} else if (kqr->kqr_thread == current_thread()) {
		/*
		 * <rdar://problem/41531764> Servicer updates need to be serialized with
		 * any ownership change too, as the kqr_thread value influences the
		 * outcome of handling NOTE_WL_DISCOVER_OWNER.
		 */
		needs_wllock = true;
	}

	if (needs_wllock) {
		filt_wllock(kqwl);
		/*
		 * The kqwl owner is set under both the req and filter lock,
		 * meaning it's fine to look at it under any.
		 */
		new_owner = cur_owner = kqwl->kqwl_owner;
	} else {
		new_owner = cur_owner = THREAD_NULL;
	}

	/*
	 * Phase 1:
	 *
	 * If asked, load the uint64 value at the user provided address and compare
	 * it against the passed in mask and expected value.
	 *
	 * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
	 * a thread reference.
	 *
	 * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
	 * the current thread, then end ownership.
	 *
	 * Lastly decide whether we need to perform a QoS update.
	 */
	if (uaddr) {
		error = copyin_word(uaddr, &udata, sizeof(udata));
		if (error) {
			goto out;
		}

		/* Update state as copied in.  */
		kev->ext[EV_EXTIDX_WL_VALUE] = udata;

		if ((udata & mask) != (kdata & mask)) {
			error = ESTALE;
		} else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
			/*
			 * Decipher the owner port name, and translate accordingly.
			 * The low 2 bits were borrowed for other flags, so mask them off.
			 *
			 * Then attempt translation to a thread reference or fail.
			 */
			mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
			if (name != MACH_PORT_NULL) {
				name = ipc_entry_name_mask(name);
				extra_thread_ref = port_name_to_thread(name);
				if (extra_thread_ref == THREAD_NULL) {
					error = EOWNERDEAD;
					goto out;
				}
				new_owner = extra_thread_ref;
			}
		}
	}

	if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
		new_owner = THREAD_NULL;
	}

	if (error == 0) {
		if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		} else if (qos_index && kqr->kqr_qos_index != qos_index) {
			action = KQWL_UTQ_SET_QOS_INDEX;
		}

		if (op == FILT_WLTOUCH) {
			/*
			 * Save off any additional fflags/data we just accepted
			 * But only keep the last round of "update" bits we acted on which helps
			 * debugging a lot.
			 */
			kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
			kn->kn_sfflags |= kev->fflags;
			kn->kn_sdata = kev->data;
			if (kev->fflags & NOTE_WL_SYNC_WAKE) {
				needs_wake = (kn->kn_hook != THREAD_NULL);
			}
		} else if (op == FILT_WLDROP) {
			if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
			    NOTE_WL_SYNC_WAIT) {
				/*
				 * When deleting a SYNC_WAIT knote that hasn't been woken up
				 * explicitly, issue a wake up.
				 */
				kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
				needs_wake = (kn->kn_hook != THREAD_NULL);
			}
		}
	}

	/*
	 * Phase 2:
	 *
	 * Commit ownership and QoS changes if any, possibly wake up waiters
	 */
	if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
		goto out;
	}

	kq_req_lock(kqwl);

	/* If already tracked as servicer, don't track as owner */
	if (new_owner == kqr->kqr_thread) {
		new_owner = THREAD_NULL;
	}

	if (cur_owner != new_owner) {
		kqwl->kqwl_owner = new_owner;
		if (new_owner == extra_thread_ref) {
			/* we just transfered this ref to kqwl_owner */
			extra_thread_ref = THREAD_NULL;
		}
		cur_owner_override = kqworkloop_owner_override(kqwl);

		if (cur_owner) {
			thread_ends_owning_workloop(cur_owner);
		}

		if (new_owner) {
			/* override it before we drop the old */
			if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
				thread_add_ipc_override(new_owner, cur_owner_override);
			}
			thread_starts_owning_workloop(new_owner);
			if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		} else {
			if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
				if (action == KQWL_UTQ_NONE) {
					action = KQWL_UTQ_REDRIVE_EVENTS;
				}
			}
		}
	}

	struct turnstile *ts = kqwl->kqwl_turnstile;
	bool wl_inheritor_updated = false;

	if (action != KQWL_UTQ_NONE) {
		kqworkloop_update_threads_qos(kqwl, action, qos_index);
	}

	if (cur_owner != new_owner && ts) {
		if (action == KQWL_UTQ_REDRIVE_EVENTS) {
			/*
			 * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
			 * the code went through workq_kern_threadreq_initiate()
			 * and the workqueue has set the inheritor already
			 */
			assert(filt_wlturnstile_interlock_is_workq(kqwl));
		} else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
			workq_kern_threadreq_lock(kqwl->kqwl_p);
			workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
			    ts, TURNSTILE_IMMEDIATE_UPDATE);
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
				/*
				 * If the workq is no longer the interlock, then
				 * workq_kern_threadreq_update_inheritor() has finished a bind
				 * and we need to fallback to the regular path.
				 */
				filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			}
			wl_inheritor_updated = true;
		} else {
			filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
			wl_inheritor_updated = true;
		}

		/*
		 * We need a turnstile reference because we are dropping the interlock
		 * and the caller has not called turnstile_prepare.
		 */
		if (wl_inheritor_updated) {
			turnstile_reference(ts);
		}
	}

	if (needs_wake && ts) {
		waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn),
		    (thread_t)kn->kn_hook, THREAD_AWAKENED);
	}

	kq_req_unlock(kqwl);

	if (wl_inheritor_updated) {
		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
		turnstile_deallocate(ts);
	}

out:
	/*
	 * Phase 3:
	 *
	 * Unlock and cleanup various lingering references and things.
	 */
	if (needs_wllock) {
		filt_wlunlock(kqwl);
	}

#if CONFIG_WORKLOOP_DEBUG
	KQWL_HISTORY_WRITE_ENTRY(kqwl, {
		.updater = current_thread(),
		.servicer = kqr->kqr_thread, /* Note: racy */
		.old_owner = cur_owner,
		.new_owner = new_owner,

		.kev_ident  = kev->ident,
		.error      = (int16_t)error,
		.kev_flags  = kev->flags,
		.kev_fflags = kev->fflags,
	});
#endif // CONFIG_WORKLOOP_DEBUG

	if (cur_owner && new_owner != cur_owner) {
		if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
			thread_drop_ipc_override(cur_owner);
		}
		thread_deallocate(cur_owner);
	}

	if (extra_thread_ref) {
		thread_deallocate(extra_thread_ref);
	}
	return error;
}
/*
 * Remembers the last update that came in from userspace for debugging reasons.
 * - fflags is mirrored from the userspace kevent
 * - ext[i, i != VALUE] is mirrored from the userspace kevent
 * - ext[VALUE] is set to what the kernel loaded atomically
 * - data is set to the error if any
 */
static inline void
filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev,
    int error)
{
	kn->kn_fflags = kev->fflags;
	kn->kn_data = error;
	memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
}
static int
filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	int error = 0;
	kq_index_t qos_index = 0;

	if ((kq->kq_state & KQ_WORKLOOP) == 0) {
		/* ... */
	}

#if DEVELOPMENT || DEBUG
	if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) {
		struct kqrequest *kqr = &kqwl->kqwl_request;

		kq_req_lock(kqwl);
		if (kqr->kqr_dsync_waiters) {
			kev->fflags |= NOTE_WL_SYNC_WAIT;
		}
		if (kqr->kqr_qos_index) {
			kev->fflags |= NOTE_WL_THREAD_REQUEST;
		}
		kev->ext[0] = thread_tid(kqwl->kqwl_owner);
		kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread);
		kev->ext[2] = thread_owned_workloops_count(current_thread());
		kev->ext[3] = kn->kn_kevent.ext[3];
		kq_req_unlock(kqwl);
		/* ... */
	}
#endif

	int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
	switch (command) {
	case NOTE_WL_THREAD_REQUEST:
		if (kn->kn_id != kqwl->kqwl_dynamicid) {
			/* ... */
		}
		qos_index = _pthread_priority_thread_qos(kn->kn_qos);
		if (qos_index == THREAD_QOS_UNSPECIFIED) {
			/* ... */
		}
		if (kqwl->kqwl_request.kqr_qos_index) {
			/*
			 * There already is a thread request, and well, you're only allowed
			 * one per workloop, so fail the attach.
			 */
			/* ... */
		}
		break;
	case NOTE_WL_SYNC_WAIT:
	case NOTE_WL_SYNC_WAKE:
		if (kn->kn_id == kqwl->kqwl_dynamicid) {
			/* ... */
		}
		if ((kn->kn_flags & EV_DISABLE) == 0) {
			/* ... */
		}
		if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
			/* ... */
		}
		break;
	}

	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);

out:
	if (error) {
		/* If userland wants ESTALE to be hidden, fail the attach anyway */
		if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
			/* ... */
		}
		knote_set_error(kn, error);
		return 0;
	}
	if (command == NOTE_WL_SYNC_WAIT) {
		return kevent_register_wait_prepare(kn, kev);
	}
	/* Just attaching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		/*
		 * Thread Request knotes need an explicit touch to be active again,
		 * so delivering an event needs to also consume it.
		 */
		kn->kn_flags |= EV_CLEAR;
		return FILTER_ACTIVE;
	}
	return 0;
}
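
/*
 * Summary of the attach-time rules above: an EVFILT_WORKLOOP knote carries
 * exactly one command in NOTE_WL_COMMANDS_MASK.  NOTE_WL_THREAD_REQUEST
 * requires ident to be the workloop id, a valid QoS, and at most one such
 * knote per workloop.  NOTE_WL_SYNC_WAIT / NOTE_WL_SYNC_WAKE require ident
 * to differ from the workloop id and the knote to be created disabled.
 * A successful NOTE_WL_SYNC_WAIT attach parks the calling thread through
 * kevent_register_wait_prepare().
 */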
static void
filt_wlwait_continue(void *parameter, wait_result_t wr)
{
	struct _kevent_register *cont_args = parameter;
	struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
	struct kqrequest *kqr = &kqwl->kqwl_request;

	kq_req_lock(kqwl);
	kqr->kqr_dsync_waiters--;
	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	} else {
		turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
	}
	kq_req_unlock(kqwl);

	turnstile_cleanup();

	if (wr == THREAD_INTERRUPTED) {
		cont_args->kev.flags |= EV_ERROR;
		cont_args->kev.data = EINTR;
	} else if (wr != THREAD_AWAKENED) {
		panic("Unexpected wait result: %d", wr);
	}

	kevent_register_wait_return(cont_args);
}
/*
 * Called with the workloop mutex held, most of the time never returns as it
 * calls filt_wlwait_continue through a continuation.
 */
static void
filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc,
		struct _kevent_register *cont_args)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
	struct kqrequest *kqr = &kqwl->kqwl_request;
	struct turnstile *ts;
	bool workq_locked = false;

	kq_req_lock(kqwl);

	kqr->kqr_dsync_waiters++;

	if (filt_wlturnstile_interlock_is_workq(kqwl)) {
		workq_kern_threadreq_lock(kqwl->kqwl_p);
		workq_locked = true;
	}

	ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
			TURNSTILE_NULL, TURNSTILE_WORKLOOPS);

	if (workq_locked) {
		workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
				&kqwl->kqwl_request, kqwl->kqwl_owner, ts,
				TURNSTILE_DELAYED_UPDATE);
		if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
			/*
			 * if the interlock is no longer the workqueue lock,
			 * then we don't need to hold it anymore.
			 */
			workq_kern_threadreq_unlock(kqwl->kqwl_p);
			workq_locked = false;
		}
	}
	if (!workq_locked) {
		/*
		 * If the interlock is the workloop's, then it's our responsibility to
		 * call update_inheritor, so just do it.
		 */
		filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
	}

	thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
	waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote),
			THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);

	if (workq_locked) {
		workq_kern_threadreq_unlock(kqwl->kqwl_p);
	}

	thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread;
	if (thread) {
		thread_reference(thread);
	}

	kq_req_unlock(kqwl);

	kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args);
}
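
/*
 * The waiter above parks on the workloop turnstile (waitq_assert_wait64) and
 * hands itself off to the current owner or servicer thread, so that thread
 * inherits the waiter's push; it resumes in filt_wlwait_continue() once a
 * NOTE_WL_SYNC_WAKE (or an abort) ends the wait.
 */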
/* called in stackshot context to report the thread responsible for blocking this thread */
void
kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
		event64_t event, thread_waitinfo_t *waitinfo)
{
	struct knote *kn = (struct knote *)event;
	assert(kdp_is_in_zone(kn, "knote zone"));

	assert(kn->kn_hook == thread);

	struct kqueue *kq = knote_get_kq(kn);
	assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
	assert(kq->kq_state & KQ_WORKLOOP);

	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	struct kqrequest *kqr = &kqwl->kqwl_request;

	thread_t kqwl_owner = kqwl->kqwl_owner;
	thread_t servicer = kqr->kqr_thread;

	if (kqwl_owner != THREAD_NULL) {
		assert(kdp_is_in_zone(kqwl_owner, "threads"));

		waitinfo->owner = thread_tid(kqwl->kqwl_owner);
	} else if (servicer != THREAD_NULL) {
		assert(kdp_is_in_zone(servicer, "threads"));

		waitinfo->owner = thread_tid(servicer);
	} else if (kqr->kqr_state & KQR_THREQUESTED) {
		waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
	} else {
		waitinfo->owner = 0;
	}

	waitinfo->context = kqwl->kqwl_dynamicid;
}
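
/*
 * The owner reported to stackshot above is chosen with decreasing precision:
 * the workloop owner if there is one, otherwise the bound servicer thread,
 * otherwise STACKSHOT_WAITOWNER_THREQUESTED while a thread request is
 * pending, and 0 when nothing is known.
 */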
static void
filt_wldetach(__assert_only struct knote *kn)
{
	assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
	if (kn->kn_hook) {
		kevent_register_wait_cleanup(kn);
	}
}
2305 filt_wlvalidate_kev_flags(struct knote
*kn
, struct kevent_internal_s
*kev
,
2306 thread_qos_t
*qos_index
)
2308 int new_commands
= kev
->fflags
& NOTE_WL_COMMANDS_MASK
;
2309 int sav_commands
= kn
->kn_sfflags
& NOTE_WL_COMMANDS_MASK
;
2311 if ((kev
->fflags
& NOTE_WL_DISCOVER_OWNER
) && (kev
->flags
& EV_DELETE
)) {
2314 if (kev
->fflags
& NOTE_WL_UPDATE_QOS
) {
2315 if (kev
->flags
& EV_DELETE
) {
2318 if (sav_commands
!= NOTE_WL_THREAD_REQUEST
) {
2321 if (!(*qos_index
= _pthread_priority_thread_qos(kev
->qos
))) {
2326 switch (new_commands
) {
2327 case NOTE_WL_THREAD_REQUEST
:
2328 /* thread requests can only update themselves */
2329 if (sav_commands
!= NOTE_WL_THREAD_REQUEST
)
2333 case NOTE_WL_SYNC_WAIT
:
2334 if (kev
->fflags
& NOTE_WL_END_OWNERSHIP
)
2338 case NOTE_WL_SYNC_WAKE
:
2340 if (!(sav_commands
& (NOTE_WL_SYNC_WAIT
| NOTE_WL_SYNC_WAKE
)))
2342 if ((kev
->flags
& (EV_ENABLE
| EV_DELETE
)) == EV_ENABLE
)
static int
filt_wltouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;

	int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
	if (error) {
		goto out;
	}

	error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
	filt_wlremember_last_update(kn, kev, error);
	if (error) {
		goto out;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			/* If userland wants ESTALE to be hidden, do not activate */
			return 0;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return 0;
	}

	int command = kev->fflags & NOTE_WL_COMMANDS_MASK;
	if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
		return kevent_register_wait_prepare(kn, kev);
	}
	/* Just touching the thread request successfully will fire it */
	if (command == NOTE_WL_THREAD_REQUEST) {
		if (kev->fflags & NOTE_WL_UPDATE_QOS) {
			return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
		}
		return FILTER_ACTIVE;
	}
	return 0;
}
static bool
filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);

	int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
	if (error) {
		goto out;
	}

	error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
	filt_wlremember_last_update(kn, kev, error);
	if (error) {
		goto out;
	}

out:
	if (error) {
		if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
			return false;
		}
		kev->flags |= EV_ERROR;
		kev->data = error;
		return false;
	}
	return true;
}
static int
filt_wlprocess(
	struct knote *kn,
	__unused struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
	int rc = 0;

	assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);

	filt_wllock(kqwl);

	if (kqwl->kqwl_owner) {
		/*
		 * <rdar://problem/33584321> userspace sometimes due to events being
		 * delivered but not triggering a drain session can cause a process
		 * of the thread request knote.
		 *
		 * When that happens, the automatic deactivation due to process
		 * would swallow the event, so we have to activate the knote again.
		 */
		/* ... */
	} else {
#if DEBUG || DEVELOPMENT
		if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
			/*
			 * see src/queue_internal.h in libdispatch
			 */
#define DISPATCH_QUEUE_ENQUEUED 0x1ull
			user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
			task_t t = current_task();
			uint64_t val;
			if (addr && task_is_active(t) && !task_is_halting(t) &&
					copyin_word(addr, &val, sizeof(val)) == 0 &&
					val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
					(val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
				panic("kevent: workloop %#016llx is not enqueued "
						"(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
						kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
			}
		}
#endif
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		kev->qos = kn->kn_qos;
		rc |= FILTER_ACTIVE;
	}

	filt_wlunlock(kqwl);

	if (rc & FILTER_ACTIVE) {
		workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
	}
	return rc;
}
SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
	.f_extended_codes = true,
	.f_attach = filt_wlattach,
	.f_detach = filt_wldetach,
	.f_event = filt_badevent,
	.f_touch = filt_wltouch,
	.f_process = filt_wlprocess,
	.f_allow_drop = filt_wlallow_drop,
	.f_post_register_wait = filt_wlpost_register_wait,
};
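
/*
 * f_extended_codes = true means the routines above return FILTER_* action
 * bits (FILTER_ACTIVE, FILTER_REGISTER_WAIT, FILTER_UPDATE_REQ_QOS, ...)
 * instead of the traditional 0/1 "activate" result, and that the extended
 * hooks f_allow_drop and f_post_register_wait are honored.
 */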
#pragma mark kevent / knotes

/*
 * JMM - placeholder for not-yet-implemented filters
 */
static int
filt_badevent(struct knote *kn, long hint)
{
	panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
	return 0;
}

static int
filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
{
	knote_set_error(kn, ENOTSUP);
	return 0;
}
2512 kqueue_alloc(struct proc
*p
, unsigned int flags
)
2514 struct filedesc
*fdp
= p
->p_fd
;
2515 struct kqueue
*kq
= NULL
;
2519 if (flags
& KEVENT_FLAG_WORKQ
) {
2520 struct kqworkq
*kqwq
;
2523 kqwq
= (struct kqworkq
*)zalloc(kqworkq_zone
);
2527 kq
= &kqwq
->kqwq_kqueue
;
2528 bzero(kqwq
, sizeof (struct kqworkq
));
2530 kqwq
->kqwq_state
= KQ_WORKQ
;
2532 for (i
= 0; i
< KQWQ_NBUCKETS
; i
++) {
2533 TAILQ_INIT(&kqwq
->kqwq_queue
[i
]);
2535 for (i
= 0; i
< KQWQ_NBUCKETS
; i
++) {
2536 if (i
!= KQWQ_QOS_MANAGER
) {
2538 * Because of how the bucketized system works, we mix overcommit
2539 * sources with not overcommit: each time we move a knote from
2540 * one bucket to the next due to overrides, we'd had to track
2541 * overcommitness, and it's really not worth it in the workloop
2542 * enabled world that track this faithfully.
2544 * Incidentally, this behaves like the original manager-based
2545 * kqwq where event delivery always happened (hence is
2548 kqwq
->kqwq_request
[i
].kqr_state
|= KQR_THOVERCOMMIT
;
2550 kqwq
->kqwq_request
[i
].kqr_qos_index
= i
;
2551 TAILQ_INIT(&kqwq
->kqwq_request
[i
].kqr_suppressed
);
2554 policy
= SYNC_POLICY_FIFO
;
2555 hook
= (void *)kqwq
;
2556 } else if (flags
& KEVENT_FLAG_WORKLOOP
) {
2557 struct kqworkloop
*kqwl
;
2560 kqwl
= (struct kqworkloop
*)zalloc(kqworkloop_zone
);
2564 bzero(kqwl
, sizeof (struct kqworkloop
));
2566 kqwl
->kqwl_state
= KQ_WORKLOOP
| KQ_DYNAMIC
;
2567 kqwl
->kqwl_retains
= 1; /* donate a retain to creator */
2568 kqwl
->kqwl_request
.kqr_state
= KQR_WORKLOOP
;
2570 kq
= &kqwl
->kqwl_kqueue
;
2571 for (i
= 0; i
< KQWL_NBUCKETS
; i
++) {
2572 TAILQ_INIT(&kqwl
->kqwl_queue
[i
]);
2574 TAILQ_INIT(&kqwl
->kqwl_request
.kqr_suppressed
);
2576 lck_mtx_init(&kqwl
->kqwl_statelock
, kq_lck_grp
, kq_lck_attr
);
2578 policy
= SYNC_POLICY_FIFO
;
2579 hook
= (void *)kqwl
;
2583 kqf
= (struct kqfile
*)zalloc(kqfile_zone
);
2587 kq
= &kqf
->kqf_kqueue
;
2588 bzero(kqf
, sizeof (struct kqfile
));
2589 TAILQ_INIT(&kqf
->kqf_queue
);
2590 TAILQ_INIT(&kqf
->kqf_suppressed
);
2592 policy
= SYNC_POLICY_FIFO
| SYNC_POLICY_PREPOST
;
2595 waitq_set_init(&kq
->kq_wqs
, policy
, NULL
, hook
);
2596 lck_spin_init(&kq
->kq_lock
, kq_lck_grp
, kq_lck_attr
);
2597 lck_spin_init(&kq
->kq_reqlock
, kq_lck_grp
, kq_lck_attr
);
2600 if (fdp
->fd_knlistsize
< 0) {
2602 if (fdp
->fd_knlistsize
< 0)
2603 fdp
->fd_knlistsize
= 0; /* this process has had a kq */
/*
 * knotes_dealloc - detach all knotes for the process and drop them
 *
 * 	Called with proc_fdlock held.
 * 	Returns with it locked.
 * 	May drop it temporarily.
 * 	Process is in such a state that it will not try to allocate
 * 	any more knotes during this process (stopped for exit or exec).
 */
2620 knotes_dealloc(proc_t p
)
2622 struct filedesc
*fdp
= p
->p_fd
;
2625 struct klist
*kn_hash
= NULL
;
2628 /* Close all the fd-indexed knotes up front */
2629 if (fdp
->fd_knlistsize
> 0) {
2630 for (i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
2631 while ((kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
])) != NULL
) {
2632 kq
= knote_get_kq(kn
);
2635 knote_drop(kq
, kn
, NULL
);
2639 /* free the table */
2640 FREE(fdp
->fd_knlist
, M_KQUEUE
);
2641 fdp
->fd_knlist
= NULL
;
2643 fdp
->fd_knlistsize
= -1;
2648 /* Clean out all the hashed knotes as well */
2649 if (fdp
->fd_knhashmask
!= 0) {
2650 for (i
= 0; i
<= (int)fdp
->fd_knhashmask
; i
++) {
2651 while ((kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
])) != NULL
) {
2652 kq
= knote_get_kq(kn
);
2655 knote_drop(kq
, kn
, NULL
);
2659 kn_hash
= fdp
->fd_knhash
;
2660 fdp
->fd_knhashmask
= 0;
2661 fdp
->fd_knhash
= NULL
;
2666 /* free the kn_hash table */
2668 FREE(kn_hash
, M_KQUEUE
);
/*
 * kqworkloop_invalidate
 *
 * Invalidate ownership of a workloop.
 *
 * This is meant to be used so that any remnant of overrides and ownership
 * information is dropped before a kqworkloop can no longer be found in the
 * global hash table and have ghost workloop ownership left over.
 *
 * Possibly returns a thread to deallocate in a safe context.
 */
static thread_t
kqworkloop_invalidate(struct kqworkloop *kqwl)
{
	thread_t cur_owner = kqwl->kqwl_owner;

	assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
	if (cur_owner) {
		/*
		 * If the kqueue had an owner that prevented the thread request to
		 * go through, then no unbind happened, and we may have lingering
		 * overrides to drop.
		 */
		if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
			thread_drop_ipc_override(cur_owner);
		}
		thread_ends_owning_workloop(cur_owner);
		kqwl->kqwl_owner = THREAD_NULL;
	}

	return cur_owner;
}
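
/*
 * The owner thread returned above still holds a reference: callers such as
 * kqueue_dealloc() and kqueue_release_last() thread_deallocate() it once
 * their locks are dropped.
 */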
/*
 * kqueue_dealloc - detach all knotes from a kqueue and free it
 *
 * 	We walk each list looking for knotes referencing this
 * 	kqueue. If we find one, we try to drop it. But
 * 	if we fail to get a drop reference, that will wait
 * 	until it is dropped. So, we can just restart again
 * 	safe in the assumption that the list will eventually
 * 	not contain any more references to this kqueue (either
 * 	we dropped them all, or someone else did).
 *
 * 	Assumes no new events are being added to the kqueue.
 * 	Nothing locked on entry or exit.
 *
 * 	Workloop kqueues can't get here unless all the knotes
 * 	are already gone and all requested threads have come
 * 	and gone (cancelled or arrived).
 */
2725 kqueue_dealloc(struct kqueue
*kq
)
2728 struct filedesc
*fdp
;
2739 * Workloops are refcounted by their knotes, so there's no point
2740 * spending a lot of time under these locks just to deallocate one.
2742 if ((kq
->kq_state
& KQ_WORKLOOP
) == 0) {
2743 KNOTE_LOCK_CTX(knlc
);
2746 for (i
= 0; i
< fdp
->fd_knlistsize
; i
++) {
2747 kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
]);
2748 while (kn
!= NULL
) {
2749 if (kq
== knote_get_kq(kn
)) {
2752 if (knote_lock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
2753 knote_drop(kq
, kn
, &knlc
);
2756 /* start over at beginning of list */
2757 kn
= SLIST_FIRST(&fdp
->fd_knlist
[i
]);
2760 kn
= SLIST_NEXT(kn
, kn_link
);
2767 if (fdp
->fd_knhashmask
!= 0) {
2768 for (i
= 0; i
< (int)fdp
->fd_knhashmask
+ 1; i
++) {
2769 kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
]);
2770 while (kn
!= NULL
) {
2771 if (kq
== knote_get_kq(kn
)) {
2774 if (knote_lock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
2775 knote_drop(kq
, kn
, &knlc
);
2778 /* start over at beginning of list */
2779 kn
= SLIST_FIRST(&fdp
->fd_knhash
[i
]);
2782 kn
= SLIST_NEXT(kn
, kn_link
);
2789 if (kq
->kq_state
& KQ_WORKLOOP
) {
2790 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
2791 thread_t cur_owner
= kqworkloop_invalidate(kqwl
);
2793 if (cur_owner
) thread_deallocate(cur_owner
);
2795 if (kqwl
->kqwl_request
.kqr_state
& KQR_ALLOCATED_TURNSTILE
) {
2796 struct turnstile
*ts
;
2797 turnstile_complete((uintptr_t)kqwl
, &kqwl
->kqwl_turnstile
, &ts
);
2798 turnstile_cleanup();
2799 turnstile_deallocate(ts
);
2801 assert(kqwl
->kqwl_turnstile
== NULL
);
2806 * waitq_set_deinit() remove the KQ's waitq set from
2807 * any select sets to which it may belong.
2809 waitq_set_deinit(&kq
->kq_wqs
);
2810 lck_spin_destroy(&kq
->kq_lock
, kq_lck_grp
);
2811 lck_spin_destroy(&kq
->kq_reqlock
, kq_lck_grp
);
2813 if (kq
->kq_state
& KQ_WORKQ
) {
2814 zfree(kqworkq_zone
, (struct kqworkq
*)kq
);
2815 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
2816 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
2818 assert(kqwl
->kqwl_retains
== 0);
2819 lck_mtx_destroy(&kqwl
->kqwl_statelock
, kq_lck_grp
);
2820 zfree(kqworkloop_zone
, kqwl
);
2822 zfree(kqfile_zone
, (struct kqfile
*)kq
);
void
kqueue_retain(struct kqueue *kq)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	uint32_t previous;

	if ((kq->kq_state & KQ_DYNAMIC) == 0)
		return;

	previous = OSIncrementAtomic(&kqwl->kqwl_retains);
	if (previous == KQ_WORKLOOP_RETAINS_MAX)
		panic("kq(%p) retain overflow", kq);

	if (previous == 0)
		panic("kq(%p) resurrection", kq);
}

#define KQUEUE_CANT_BE_LAST_REF  0
#define KQUEUE_MIGHT_BE_LAST_REF 1

static int
kqueue_release(kqueue_t kqu, __assert_only int possibly_last)
{
	if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) {
		return 0;
	}

	assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */
	uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains);
	if (__improbable(refs == 0)) {
		panic("kq(%p) over-release", kqu.kq);
	}
	if (refs == 1) {
		assert(possibly_last);
	}
	return refs == 1;
}
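
/*
 * Only dynamic (workloop) kqueues are refcounted through kqwl_retains; file
 * and workq kqueues ignore kqueue_retain/kqueue_release.  Callers pass
 * KQUEUE_MIGHT_BE_LAST_REF when they can handle dropping the last reference
 * (see kqueue_release_last), and KQUEUE_CANT_BE_LAST_REF when another
 * reference is known to be held.
 */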
2865 kqueue_body(struct proc
*p
, fp_allocfn_t fp_zalloc
, void *cra
, int32_t *retval
)
2868 struct fileproc
*fp
;
2871 error
= falloc_withalloc(p
,
2872 &fp
, &fd
, vfs_context_current(), fp_zalloc
, cra
);
2877 kq
= kqueue_alloc(p
, 0);
2883 fp
->f_flag
= FREAD
| FWRITE
;
2884 fp
->f_ops
= &kqueueops
;
2888 *fdflags(p
, fd
) |= UF_EXCLOSE
;
2889 procfdtbl_releasefd(p
, fd
, NULL
);
2890 fp_drop(p
, fd
, fp
, 1);
int
kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
{
	return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
}
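
/*
 * Illustrative userspace sketch (not part of this file or of the kernel
 * build): how the descriptor created by the kqueue() syscall above is
 * typically used with the public kevent(2) interface.  Error handling is
 * omitted and the file descriptor is a placeholder.
 */
#if 0
#include <sys/event.h>
#include <unistd.h>

static void
wait_until_readable(int fd)
{
	int kq = kqueue();
	struct kevent change, event;

	/* register interest: "fd became readable" */
	EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &change, 1, NULL, 0, NULL);

	/* block until the event fires, then pick it up */
	(void)kevent(kq, NULL, 0, &event, 1, NULL);
	close(kq);
}
#endif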
2904 kevent_copyin(user_addr_t
*addrp
, struct kevent_internal_s
*kevp
, struct proc
*p
,
2910 if (flags
& KEVENT_FLAG_LEGACY32
) {
2911 bzero(kevp
, sizeof (*kevp
));
2913 if (IS_64BIT_PROCESS(p
)) {
2914 struct user64_kevent kev64
;
2916 advance
= sizeof (kev64
);
2917 error
= copyin(*addrp
, (caddr_t
)&kev64
, advance
);
2920 kevp
->ident
= kev64
.ident
;
2921 kevp
->filter
= kev64
.filter
;
2922 kevp
->flags
= kev64
.flags
;
2923 kevp
->udata
= kev64
.udata
;
2924 kevp
->fflags
= kev64
.fflags
;
2925 kevp
->data
= kev64
.data
;
2927 struct user32_kevent kev32
;
2929 advance
= sizeof (kev32
);
2930 error
= copyin(*addrp
, (caddr_t
)&kev32
, advance
);
2933 kevp
->ident
= (uintptr_t)kev32
.ident
;
2934 kevp
->filter
= kev32
.filter
;
2935 kevp
->flags
= kev32
.flags
;
2936 kevp
->udata
= CAST_USER_ADDR_T(kev32
.udata
);
2937 kevp
->fflags
= kev32
.fflags
;
2938 kevp
->data
= (intptr_t)kev32
.data
;
2940 } else if (flags
& KEVENT_FLAG_LEGACY64
) {
2941 struct kevent64_s kev64
;
2943 bzero(kevp
, sizeof (*kevp
));
2945 advance
= sizeof (struct kevent64_s
);
2946 error
= copyin(*addrp
, (caddr_t
)&kev64
, advance
);
2949 kevp
->ident
= kev64
.ident
;
2950 kevp
->filter
= kev64
.filter
;
2951 kevp
->flags
= kev64
.flags
;
2952 kevp
->udata
= kev64
.udata
;
2953 kevp
->fflags
= kev64
.fflags
;
2954 kevp
->data
= kev64
.data
;
2955 kevp
->ext
[0] = kev64
.ext
[0];
2956 kevp
->ext
[1] = kev64
.ext
[1];
2959 struct kevent_qos_s kevqos
;
2961 bzero(kevp
, sizeof (*kevp
));
2963 advance
= sizeof (struct kevent_qos_s
);
2964 error
= copyin(*addrp
, (caddr_t
)&kevqos
, advance
);
2967 kevp
->ident
= kevqos
.ident
;
2968 kevp
->filter
= kevqos
.filter
;
2969 kevp
->flags
= kevqos
.flags
;
2970 kevp
->qos
= kevqos
.qos
;
2971 // kevp->xflags = kevqos.xflags;
2972 kevp
->udata
= kevqos
.udata
;
2973 kevp
->fflags
= kevqos
.fflags
;
2974 kevp
->data
= kevqos
.data
;
2975 kevp
->ext
[0] = kevqos
.ext
[0];
2976 kevp
->ext
[1] = kevqos
.ext
[1];
2977 kevp
->ext
[2] = kevqos
.ext
[2];
2978 kevp
->ext
[3] = kevqos
.ext
[3];
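
/*
 * Illustrative userspace sketch (not part of this file or of the kernel
 * build): the KEVENT_FLAG_LEGACY64 layout copied in above corresponds to the
 * public kevent64(2) call and struct kevent64_s, which carries the ext[0]
 * and ext[1] words.  The kq, identifier and timeout are placeholders.
 */
#if 0
#include <sys/event.h>

static void
arm_oneshot_timer(int kq, uintptr_t ident, int64_t milliseconds)
{
	struct kevent64_s kev;

	/* EVFILT_TIMER data is interpreted as milliseconds by default */
	EV_SET64(&kev, ident, EVFILT_TIMER, EV_ADD | EV_ONESHOT, 0,
	    milliseconds, 0, 0, 0);
	(void)kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
}
#endif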
2986 kevent_copyout(struct kevent_internal_s
*kevp
, user_addr_t
*addrp
, struct proc
*p
,
2989 user_addr_t addr
= *addrp
;
2994 * fully initialize the differnt output event structure
2995 * types from the internal kevent (and some universal
2996 * defaults for fields not represented in the internal
2999 if (flags
& KEVENT_FLAG_LEGACY32
) {
3000 assert((flags
& KEVENT_FLAG_STACK_EVENTS
) == 0);
3002 if (IS_64BIT_PROCESS(p
)) {
3003 struct user64_kevent kev64
;
3005 advance
= sizeof (kev64
);
3006 bzero(&kev64
, advance
);
3009 * deal with the special case of a user-supplied
3010 * value of (uintptr_t)-1.
3012 kev64
.ident
= (kevp
->ident
== (uintptr_t)-1) ?
3013 (uint64_t)-1LL : (uint64_t)kevp
->ident
;
3015 kev64
.filter
= kevp
->filter
;
3016 kev64
.flags
= kevp
->flags
;
3017 kev64
.fflags
= kevp
->fflags
;
3018 kev64
.data
= (int64_t) kevp
->data
;
3019 kev64
.udata
= kevp
->udata
;
3020 error
= copyout((caddr_t
)&kev64
, addr
, advance
);
3022 struct user32_kevent kev32
;
3024 advance
= sizeof (kev32
);
3025 bzero(&kev32
, advance
);
3026 kev32
.ident
= (uint32_t)kevp
->ident
;
3027 kev32
.filter
= kevp
->filter
;
3028 kev32
.flags
= kevp
->flags
;
3029 kev32
.fflags
= kevp
->fflags
;
3030 kev32
.data
= (int32_t)kevp
->data
;
3031 kev32
.udata
= kevp
->udata
;
3032 error
= copyout((caddr_t
)&kev32
, addr
, advance
);
3034 } else if (flags
& KEVENT_FLAG_LEGACY64
) {
3035 struct kevent64_s kev64
;
3037 advance
= sizeof (struct kevent64_s
);
3038 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3041 bzero(&kev64
, advance
);
3042 kev64
.ident
= kevp
->ident
;
3043 kev64
.filter
= kevp
->filter
;
3044 kev64
.flags
= kevp
->flags
;
3045 kev64
.fflags
= kevp
->fflags
;
3046 kev64
.data
= (int64_t) kevp
->data
;
3047 kev64
.udata
= kevp
->udata
;
3048 kev64
.ext
[0] = kevp
->ext
[0];
3049 kev64
.ext
[1] = kevp
->ext
[1];
3050 error
= copyout((caddr_t
)&kev64
, addr
, advance
);
3052 struct kevent_qos_s kevqos
;
3054 advance
= sizeof (struct kevent_qos_s
);
3055 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3058 bzero(&kevqos
, advance
);
3059 kevqos
.ident
= kevp
->ident
;
3060 kevqos
.filter
= kevp
->filter
;
3061 kevqos
.flags
= kevp
->flags
;
3062 kevqos
.qos
= kevp
->qos
;
3063 kevqos
.udata
= kevp
->udata
;
3064 kevqos
.fflags
= kevp
->fflags
;
3066 kevqos
.data
= (int64_t) kevp
->data
;
3067 kevqos
.ext
[0] = kevp
->ext
[0];
3068 kevqos
.ext
[1] = kevp
->ext
[1];
3069 kevqos
.ext
[2] = kevp
->ext
[2];
3070 kevqos
.ext
[3] = kevp
->ext
[3];
3071 error
= copyout((caddr_t
)&kevqos
, addr
, advance
);
3074 if (flags
& KEVENT_FLAG_STACK_EVENTS
)
3077 *addrp
= addr
+ advance
;
3083 kevent_get_data_size(
3085 uint64_t data_available
,
3087 user_size_t
*residp
)
3092 if (data_available
!= USER_ADDR_NULL
) {
3093 if (flags
& KEVENT_FLAG_KERNEL
) {
3094 resid
= *(user_size_t
*)(uintptr_t)data_available
;
3095 } else if (IS_64BIT_PROCESS(p
)) {
3096 user64_size_t usize
;
3097 error
= copyin((user_addr_t
)data_available
, &usize
, sizeof(usize
));
3098 resid
= (user_size_t
)usize
;
3100 user32_size_t usize
;
3101 error
= copyin((user_addr_t
)data_available
, &usize
, sizeof(usize
));
3102 resid
= (user_size_t
)usize
;
3114 kevent_put_data_size(
3116 uint64_t data_available
,
3122 if (data_available
) {
3123 if (flags
& KEVENT_FLAG_KERNEL
) {
3124 *(user_size_t
*)(uintptr_t)data_available
= resid
;
3125 } else if (IS_64BIT_PROCESS(p
)) {
3126 user64_size_t usize
= (user64_size_t
)resid
;
3127 error
= copyout(&usize
, (user_addr_t
)data_available
, sizeof(usize
));
3129 user32_size_t usize
= (user32_size_t
)resid
;
3130 error
= copyout(&usize
, (user_addr_t
)data_available
, sizeof(usize
));
3137 * kevent_continue - continue a kevent syscall after blocking
3139 * assume we inherit a use count on the kq fileglob.
3141 __attribute__((noreturn
))
3143 kevent_continue(__unused
struct kqueue
*kq
, void *data
, int error
)
3145 struct _kevent
*cont_args
;
3146 struct fileproc
*fp
;
3147 uint64_t data_available
;
3148 user_size_t data_size
;
3149 user_size_t data_resid
;
3154 struct proc
*p
= current_proc();
3156 cont_args
= (struct _kevent
*)data
;
3157 data_available
= cont_args
->data_available
;
3158 flags
= cont_args
->process_data
.fp_flags
;
3159 data_size
= cont_args
->process_data
.fp_data_size
;
3160 data_resid
= cont_args
->process_data
.fp_data_resid
;
3161 noutputs
= cont_args
->eventout
;
3162 retval
= cont_args
->retval
;
3166 kevent_put_kq(p
, fd
, fp
, kq
);
3168 /* don't abandon other output just because of residual copyout failures */
3169 if (error
== 0 && data_available
&& data_resid
!= data_size
) {
3170 (void)kevent_put_data_size(p
, data_available
, flags
, data_resid
);
3173 /* don't restart after signals... */
3174 if (error
== ERESTART
)
3176 else if (error
== EWOULDBLOCK
)
3180 unix_syscall_return(error
);
3184 * kevent - [syscall] register and wait for kernel events
3188 kevent(struct proc
*p
, struct kevent_args
*uap
, int32_t *retval
)
3190 unsigned int flags
= KEVENT_FLAG_LEGACY32
;
3192 return kevent_internal(p
,
3193 (kqueue_id_t
)uap
->fd
, NULL
,
3194 uap
->changelist
, uap
->nchanges
,
3195 uap
->eventlist
, uap
->nevents
,
3204 kevent64(struct proc
*p
, struct kevent64_args
*uap
, int32_t *retval
)
3208 /* restrict to user flags and set legacy64 */
3209 flags
= uap
->flags
& KEVENT_FLAG_USER
;
3210 flags
|= KEVENT_FLAG_LEGACY64
;
3212 return kevent_internal(p
,
3213 (kqueue_id_t
)uap
->fd
, NULL
,
3214 uap
->changelist
, uap
->nchanges
,
3215 uap
->eventlist
, uap
->nevents
,
3224 kevent_qos(struct proc
*p
, struct kevent_qos_args
*uap
, int32_t *retval
)
3226 /* restrict to user flags */
3227 uap
->flags
&= KEVENT_FLAG_USER
;
3229 return kevent_internal(p
,
3230 (kqueue_id_t
)uap
->fd
, NULL
,
3231 uap
->changelist
, uap
->nchanges
,
3232 uap
->eventlist
, uap
->nevents
,
3233 uap
->data_out
, (uint64_t)uap
->data_available
,
3241 kevent_qos_internal(struct proc
*p
, int fd
,
3242 user_addr_t changelist
, int nchanges
,
3243 user_addr_t eventlist
, int nevents
,
3244 user_addr_t data_out
, user_size_t
*data_available
,
3248 return kevent_internal(p
,
3249 (kqueue_id_t
)fd
, NULL
,
3250 changelist
, nchanges
,
3252 data_out
, (uint64_t)data_available
,
3253 (flags
| KEVENT_FLAG_KERNEL
),
3260 kevent_id(struct proc
*p
, struct kevent_id_args
*uap
, int32_t *retval
)
3262 /* restrict to user flags */
3263 uap
->flags
&= KEVENT_FLAG_USER
;
3265 return kevent_internal(p
,
3266 (kqueue_id_t
)uap
->id
, NULL
,
3267 uap
->changelist
, uap
->nchanges
,
3268 uap
->eventlist
, uap
->nevents
,
3269 uap
->data_out
, (uint64_t)uap
->data_available
,
3270 (uap
->flags
| KEVENT_FLAG_DYNAMIC_KQUEUE
),
3277 kevent_id_internal(struct proc
*p
, kqueue_id_t
*id
,
3278 user_addr_t changelist
, int nchanges
,
3279 user_addr_t eventlist
, int nevents
,
3280 user_addr_t data_out
, user_size_t
*data_available
,
3284 return kevent_internal(p
,
3286 changelist
, nchanges
,
3288 data_out
, (uint64_t)data_available
,
3289 (flags
| KEVENT_FLAG_KERNEL
| KEVENT_FLAG_DYNAMIC_KQUEUE
),
3296 kevent_get_timeout(struct proc
*p
,
3297 user_addr_t utimeout
,
3299 struct timeval
*atvp
)
3304 if (flags
& KEVENT_FLAG_IMMEDIATE
) {
3305 getmicrouptime(&atv
);
3306 } else if (utimeout
!= USER_ADDR_NULL
) {
3308 if (flags
& KEVENT_FLAG_KERNEL
) {
3309 struct timespec
*tsp
= (struct timespec
*)utimeout
;
3310 TIMESPEC_TO_TIMEVAL(&rtv
, tsp
);
3311 } else if (IS_64BIT_PROCESS(p
)) {
3312 struct user64_timespec ts
;
3313 error
= copyin(utimeout
, &ts
, sizeof(ts
));
3314 if ((ts
.tv_sec
& 0xFFFFFFFF00000000ull
) != 0)
3317 TIMESPEC_TO_TIMEVAL(&rtv
, &ts
);
3319 struct user32_timespec ts
;
3320 error
= copyin(utimeout
, &ts
, sizeof(ts
));
3321 TIMESPEC_TO_TIMEVAL(&rtv
, &ts
);
3325 if (itimerfix(&rtv
))
3327 getmicrouptime(&atv
);
3328 timevaladd(&atv
, &rtv
);
3330 /* wait forever value */
3339 kevent_set_kq_mode(struct kqueue
*kq
, unsigned int flags
)
3341 /* each kq should only be used for events of one type */
3343 if (kq
->kq_state
& (KQ_KEV32
| KQ_KEV64
| KQ_KEV_QOS
)) {
3344 if (flags
& KEVENT_FLAG_LEGACY32
) {
3345 if ((kq
->kq_state
& KQ_KEV32
) == 0) {
3349 } else if (kq
->kq_state
& KQ_KEV32
) {
3353 } else if (flags
& KEVENT_FLAG_LEGACY32
) {
3354 kq
->kq_state
|= KQ_KEV32
;
3355 } else if (flags
& KEVENT_FLAG_LEGACY64
) {
3356 kq
->kq_state
|= KQ_KEV64
;
3358 kq
->kq_state
|= KQ_KEV_QOS
;
#define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
#define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
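
/*
 * KQ_HASH folds bits 8..15 of a dynamic kqueue id onto its low bits before
 * masking, so ids that differ only in their second byte do not all collide
 * into the same bucket.
 */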
static void
kqhash_lock(proc_t p)
{
	lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
}

static void
kqhash_lock_held(__assert_only proc_t p)
{
	LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
}

static void
kqhash_unlock(proc_t p)
{
	lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
}

static void
kqueue_hash_init_if_needed(proc_t p)
{
	struct filedesc *fdp = p->p_fd;

	kqhash_lock_held(p);

	if (__improbable(fdp->fd_kqhash == NULL)) {
		struct kqlist *alloc_hash;
		u_long alloc_mask;

		kqhash_unlock(p);
		alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
		kqhash_lock(p);

		/* See if we won the race */
		if (fdp->fd_kqhashmask == 0) {
			fdp->fd_kqhash = alloc_hash;
			fdp->fd_kqhashmask = alloc_mask;
		} else {
			kqhash_unlock(p);
			FREE(alloc_hash, M_KQUEUE);
			kqhash_lock(p);
		}
	}
}
3413 * Called with the kqhash_lock() held
3421 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3422 struct filedesc
*fdp
= p
->p_fd
;
3423 struct kqlist
*list
;
3425 /* should hold the kq hash lock */
3426 kqhash_lock_held(p
);
3428 if ((kq
->kq_state
& KQ_DYNAMIC
) == 0) {
3429 assert(kq
->kq_state
& KQ_DYNAMIC
);
3433 /* only dynamically allocate workloop kqs for now */
3434 assert(kq
->kq_state
& KQ_WORKLOOP
);
3435 assert(fdp
->fd_kqhash
);
3437 kqwl
->kqwl_dynamicid
= id
;
3439 list
= &fdp
->fd_kqhash
[KQ_HASH(id
, fdp
->fd_kqhashmask
)];
3440 SLIST_INSERT_HEAD(list
, kqwl
, kqwl_hashlink
);
3443 /* Called with kqhash_lock held */
3449 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3450 struct filedesc
*fdp
= p
->p_fd
;
3451 struct kqlist
*list
;
3453 /* should hold the kq hash lock */
3454 kqhash_lock_held(p
);
3456 if ((kq
->kq_state
& KQ_DYNAMIC
) == 0) {
3457 assert(kq
->kq_state
& KQ_DYNAMIC
);
3460 assert(kq
->kq_state
& KQ_WORKLOOP
); /* for now */
3461 list
= &fdp
->fd_kqhash
[KQ_HASH(kqwl
->kqwl_dynamicid
, fdp
->fd_kqhashmask
)];
3462 SLIST_REMOVE(list
, kqwl
, kqworkloop
, kqwl_hashlink
);
/* Called with kqhash_lock held */
static struct kqueue *
kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
{
	struct filedesc *fdp = p->p_fd;
	struct kqlist *list;
	struct kqworkloop *kqwl;

	/* should hold the kq hash lock */
	kqhash_lock_held(p);

	if (fdp->fd_kqhashmask == 0) return NULL;

	list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
	SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
		if (kqwl->kqwl_dynamicid == id) {
			struct kqueue *kq = (struct kqueue *)kqwl;

			assert(kq->kq_state & KQ_DYNAMIC);
			assert(kq->kq_state & KQ_WORKLOOP); /* for now */
			return kq;
		}
	}
	return NULL;
}
static void
kqueue_release_last(struct proc *p, kqueue_t kqu)
{
	struct kqueue *kq = kqu.kq;
	if (kq->kq_state & KQ_DYNAMIC) {
		kqhash_lock(p);
		if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
			thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl);
			kqueue_hash_remove(p, kq);
			kqhash_unlock(p);
			if (cur_owner) thread_deallocate(cur_owner);
			kqueue_dealloc(kq);
		} else {
			kqhash_unlock(p);
		}
	}
}
/*
 * kqworkloops_dealloc - rebalance retains on kqworkloops created with
 * scheduling parameters
 *
 * 	Called with proc_fdlock held.
 * 	Returns with it locked.
 * 	Process is in such a state that it will not try to allocate
 * 	any more knotes during this process (stopped for exit or exec).
 */
3519 kqworkloops_dealloc(proc_t p
)
3521 struct filedesc
*fdp
= p
->p_fd
;
3522 struct kqlist
*list
;
3523 struct kqworkloop
*kqwl
, *kqwln
;
3524 struct kqlist tofree
;
3527 if (!(fdp
->fd_flags
& FD_WORKLOOP
)) {
3531 SLIST_INIT(&tofree
);
3534 assert(fdp
->fd_kqhashmask
!= 0);
3536 for (i
= 0; i
<= (int)fdp
->fd_kqhashmask
; i
++) {
3537 list
= &fdp
->fd_kqhash
[i
];
3538 SLIST_FOREACH_SAFE(kqwl
, list
, kqwl_hashlink
, kqwln
) {
3540 * kqworkloops that have scheduling parameters have an
3541 * implicit retain from kqueue_workloop_ctl that needs
3542 * to be balanced on process exit.
3544 assert(kqwl
->kqwl_params
);
3545 SLIST_REMOVE(list
, kqwl
, kqworkloop
, kqwl_hashlink
);
3546 SLIST_INSERT_HEAD(&tofree
, kqwl
, kqwl_hashlink
);
3552 SLIST_FOREACH_SAFE(kqwl
, &tofree
, kqwl_hashlink
, kqwln
) {
3553 struct kqueue
*kq
= (struct kqueue
*)kqwl
;
3554 __assert_only
bool released
;
3555 released
= kqueue_release(kq
, KQUEUE_MIGHT_BE_LAST_REF
);
static struct kqueue *
kevent_get_bound_kqworkloop(thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	struct kqrequest *kqr = ut->uu_kqr_bound;

	return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL;
}
3571 kevent_get_kq(struct proc
*p
, kqueue_id_t id
, workq_threadreq_param_t
*trp
,
3572 unsigned int flags
, struct fileproc
**fpp
, int *fdp
,
3573 struct kqueue
**kqp
)
3575 struct filedesc
*descp
= p
->p_fd
;
3576 struct fileproc
*fp
= NULL
;
3577 struct kqueue
*kq
= NULL
;
3580 thread_t th
= current_thread();
3582 assert(!trp
|| (flags
& KEVENT_FLAG_WORKLOOP
));
3584 /* Was the workloop flag passed? Then it is for sure only a workloop */
3585 if (flags
& KEVENT_FLAG_DYNAMIC_KQUEUE
) {
3586 assert(flags
& KEVENT_FLAG_WORKLOOP
);
3587 assert(!trp
|| (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
));
3588 kq
= kevent_get_bound_kqworkloop(th
);
3591 * when kevent_id_internal is called from within the
3592 * kernel, and the passed 'id' value is '-1' then we
3593 * look for the currently bound workloop kq.
3595 if (id
== (kqueue_id_t
)-1 &&
3596 (flags
& KEVENT_FLAG_KERNEL
) &&
3597 (flags
& KEVENT_FLAG_WORKLOOP
)) {
3599 if (!is_workqueue_thread(th
) || !kq
) {
3607 if (id
== 0 || id
== (kqueue_id_t
)-1) {
3611 /* try shortcut on kq lookup for bound threads */
3612 if (kq
!= NULL
&& ((struct kqworkloop
*)kq
)->kqwl_dynamicid
== id
) {
3614 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3618 /* retain a reference while working with this kq. */
3619 assert(kq
->kq_state
& KQ_DYNAMIC
);
3624 /* look for the kq on the hash table */
3626 kq
= kqueue_hash_lookup(p
, id
);
3630 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST
) {
3634 struct kqueue
*alloc_kq
;
3635 alloc_kq
= kqueue_alloc(p
, flags
);
3641 kqueue_hash_init_if_needed(p
);
3642 kq
= kqueue_hash_lookup(p
, id
);
3644 /* insert our new one */
3647 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3648 kqwl
->kqwl_params
= trp
->trp_value
;
3650 kqueue_hash_insert(p
, id
, kq
);
3652 } else if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3653 /* lost race and caller wants an error */
3655 kqueue_release(alloc_kq
, KQUEUE_MIGHT_BE_LAST_REF
);
3656 kqueue_dealloc(alloc_kq
);
3659 /* lost race, retain existing workloop */
3662 kqueue_release(alloc_kq
, KQUEUE_MIGHT_BE_LAST_REF
);
3663 kqueue_dealloc(alloc_kq
);
3667 if (flags
& KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
) {
3672 /* retain a reference while working with this kq. */
3673 assert(kq
->kq_state
& KQ_DYNAMIC
);
3678 } else if (flags
& KEVENT_FLAG_WORKQ
) {
3679 /* must already exist for bound threads. */
3680 if (flags
& KEVENT_FLAG_KERNEL
) {
3681 assert(descp
->fd_wqkqueue
!= NULL
);
3685 * use the private kq associated with the proc workq.
3686 * Just being a thread within the process (and not
3687 * being the exit/exec thread) is enough to hold a
3688 * reference on this special kq.
3690 kq
= descp
->fd_wqkqueue
;
3692 struct kqueue
*alloc_kq
= kqueue_alloc(p
, KEVENT_FLAG_WORKQ
);
3693 if (alloc_kq
== NULL
) {
3698 if (descp
->fd_wqkqueue
== NULL
) {
3699 kq
= descp
->fd_wqkqueue
= alloc_kq
;
3703 kq
= descp
->fd_wqkqueue
;
3704 kqueue_dealloc(alloc_kq
);
3708 /* get a usecount for the kq itself */
3710 if ((error
= fp_getfkq(p
, fd
, &fp
, &kq
)) != 0)
3713 if ((error
= kevent_set_kq_mode(kq
, flags
)) != 0) {
3714 /* drop the usecount */
3716 fp_drop(p
, fd
, fp
, 0);
3732 struct fileproc
*fp
,
3735 kqueue_release_last(p
, kq
);
3737 assert((kq
->kq_state
& KQ_WORKQ
) == 0);
3738 fp_drop(p
, (int)id
, fp
, 0);
static uint64_t
kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id)
{
	uint64_t serial_no = 0;
	user_addr_t addr;
	int rc;

	if (workloop_id == 0 || p->p_dispatchqueue_serialno_offset == 0) {
		return serial_no;
	}
	addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset);

	if (proc_is64bit(p)) {
		rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no));
	} else {
		uint32_t serial_no32 = 0;
		rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32));
		serial_no = serial_no32;
	}

	return rc == 0 ? serial_no : 0;
}
3765 kevent_exit_on_workloop_ownership_leak(thread_t thread
)
3767 proc_t p
= current_proc();
3768 struct filedesc
*fdp
= p
->p_fd
;
3769 kqueue_id_t workloop_id
= 0;
3770 os_reason_t reason
= OS_REASON_NULL
;
3771 mach_vm_address_t addr
;
3772 uint32_t reason_size
;
3775 if (fdp
->fd_kqhashmask
> 0) {
3776 for (uint32_t i
= 0; i
< fdp
->fd_kqhashmask
+ 1; i
++) {
3777 struct kqworkloop
*kqwl
;
3779 SLIST_FOREACH(kqwl
, &fdp
->fd_kqhash
[i
], kqwl_hashlink
) {
3780 struct kqueue
*kq
= &kqwl
->kqwl_kqueue
;
3781 if ((kq
->kq_state
& KQ_DYNAMIC
) && kqwl
->kqwl_owner
== thread
) {
3782 workloop_id
= kqwl
->kqwl_dynamicid
;
3790 reason
= os_reason_create(OS_REASON_LIBSYSTEM
,
3791 OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK
);
3792 if (reason
== OS_REASON_NULL
) {
3796 reason
->osr_flags
|= OS_REASON_FLAG_GENERATE_CRASH_REPORT
;
3797 reason_size
= 2 * sizeof(uint64_t);
3798 reason_size
= kcdata_estimate_required_buffer_size(2, reason_size
);
3799 if (os_reason_alloc_buffer(reason
, reason_size
) != 0) {
3804 struct kcdata_descriptor
*kcd
= &reason
->osr_kcd_descriptor
;
3806 if (kcdata_get_memory_addr(kcd
, EXIT_REASON_WORKLOOP_ID
,
3807 sizeof(workloop_id
), &addr
) == KERN_SUCCESS
) {
3808 kcdata_memcpy(kcd
, addr
, &workloop_id
, sizeof(workloop_id
));
3811 uint64_t serial_no
= kevent_workloop_serial_no_copyin(p
, workloop_id
);
3812 if (serial_no
&& kcdata_get_memory_addr(kcd
, EXIT_REASON_DISPATCH_QUEUE_NO
,
3813 sizeof(serial_no
), &addr
) == KERN_SUCCESS
) {
3814 kcdata_memcpy(kcd
, addr
, &serial_no
, sizeof(serial_no
));
3818 #if DEVELOPMENT || DEBUG
3819 if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK
) {
3820 panic("thread %p in task %p is leaked workloop 0x%016llx ownership",
3821 thread
, p
->task
, workloop_id
);
3823 psignal_try_thread_with_reason(p
, thread
, SIGABRT
, reason
);
3826 return exit_with_reason(p
, W_EXITCODE(0, SIGKILL
), (int *)NULL
,
3827 FALSE
, FALSE
, 0, reason
);
static inline boolean_t
kevent_args_requesting_events(unsigned int flags, int nevents)
{
	return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0);
}
3838 kevent_internal(struct proc
*p
,
3839 kqueue_id_t id
, kqueue_id_t
*id_out
,
3840 user_addr_t changelist
, int nchanges
,
3841 user_addr_t ueventlist
, int nevents
,
3842 user_addr_t data_out
, uint64_t data_available
,
3844 user_addr_t utimeout
,
3845 kqueue_continue_t continuation
,
3850 struct fileproc
*fp
= NULL
;
3852 struct kevent_internal_s kev
;
3853 int error
, noutputs
, register_rc
;
3854 bool needs_end_processing
= false;
3856 user_size_t data_size
;
3857 user_size_t data_resid
;
3858 thread_t thread
= current_thread();
3859 KNOTE_LOCK_CTX(knlc
);
3861 /* Don't allow user-space threads to process output events from the workq kqs */
3862 if (((flags
& (KEVENT_FLAG_WORKQ
| KEVENT_FLAG_KERNEL
)) == KEVENT_FLAG_WORKQ
) &&
3863 kevent_args_requesting_events(flags
, nevents
))
3866 if (flags
& KEVENT_FLAG_PARKING
) {
3867 if (!kevent_args_requesting_events(flags
, nevents
) || id
!= (kqueue_id_t
)-1)
3871 /* restrict dynamic kqueue allocation to workloops (for now) */
3872 if ((flags
& (KEVENT_FLAG_DYNAMIC_KQUEUE
| KEVENT_FLAG_WORKLOOP
)) == KEVENT_FLAG_DYNAMIC_KQUEUE
)
3875 if ((flags
& (KEVENT_FLAG_WORKLOOP
)) && (flags
& (KEVENT_FLAG_WORKQ
)))
3878 if (flags
& (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST
| KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
)) {
3880 /* allowed only on workloops when calling kevent_id from user-space */
3881 if (!(flags
& KEVENT_FLAG_WORKLOOP
) || (flags
& KEVENT_FLAG_KERNEL
) || !(flags
& KEVENT_FLAG_DYNAMIC_KQUEUE
))
3885 /* prepare to deal with stack-wise allocation of out events */
3886 if (flags
& KEVENT_FLAG_STACK_EVENTS
) {
3887 int scale
= ((flags
& KEVENT_FLAG_LEGACY32
) ?
3888 (IS_64BIT_PROCESS(p
) ? sizeof(struct user64_kevent
) :
3889 sizeof(struct user32_kevent
)) :
3890 ((flags
& KEVENT_FLAG_LEGACY64
) ? sizeof(struct kevent64_s
) :
3891 sizeof(struct kevent_qos_s
)));
3892 ueventlist
+= nevents
* scale
;
3895 /* convert timeout to absolute - if we have one (and not immediate) */
3896 error
= kevent_get_timeout(p
, utimeout
, flags
, &atv
);
3900 /* copyin initial value of data residual from data_available */
3901 error
= kevent_get_data_size(p
, data_available
, flags
, &data_size
);
3905 /* get the kq we are going to be working on */
3906 error
= kevent_get_kq(p
, id
, NULL
, flags
, &fp
, &fd
, &kq
);
3907 #if CONFIG_WORKLOOP_DEBUG
3908 ut
= (uthread_t
)get_bsdthread_info(thread
);
3909 UU_KEVENT_HISTORY_WRITE_ENTRY(ut
, {
3911 .uu_kq
= error
? NULL
: kq
,
3913 .uu_nchanges
= nchanges
,
3914 .uu_nevents
= nevents
,
3917 #endif // CONFIG_WORKLOOP_DEBUG
3921 /* only bound threads can receive events on workloops */
3922 if (flags
& KEVENT_FLAG_WORKLOOP
) {
3923 struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
3924 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
3926 assert(kq
->kq_state
& KQ_WORKLOOP
);
3928 if (kevent_args_requesting_events(flags
, nevents
)) {
3929 if (kq
!= kevent_get_bound_kqworkloop(thread
)) {
3936 * Disable the R2K notification while doing a register, if the
3937 * caller wants events too, we don't want the AST to be set if we
3938 * will process these events soon.
3940 kqr
->kqr_state
&= ~KQR_R2K_NOTIF_ARMED
;
3941 needs_end_processing
= true;
3946 *id_out
= kqwl
->kqwl_dynamicid
;
3951 /* register all the change requests the user provided... */
3953 while (nchanges
> 0 && error
== 0) {
3954 error
= kevent_copyin(&changelist
, &kev
, p
, flags
);
3958 /* Make sure user doesn't pass in any system flags */
3959 kev
.flags
&= ~EV_SYSFLAGS
;
3961 register_rc
= kevent_register(kq
, &kev
, &knlc
);
3962 if (register_rc
& FILTER_REGISTER_WAIT
) {
3965 // f_post_register_wait is meant to call a continuation and not to
3966 // return, which is why we don't support FILTER_REGISTER_WAIT if
3967 // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
3968 // waits isn't the last.
3970 // It is implementable, but not used by any userspace code at the
3971 // moment, so for now return ENOTSUP if someone tries to do it.
3972 if (nchanges
== 1 && nevents
>= 1 && (flags
& KEVENT_FLAG_ERROR_EVENTS
)) {
3973 struct _kevent_register
*cont_args
;
3974 /* store the continuation/completion data in the uthread */
3975 ut
= (uthread_t
)get_bsdthread_info(thread
);
3976 cont_args
= &ut
->uu_save
.uus_kevent_register
;
3977 cont_args
->kev
= kev
;
3981 cont_args
->ueventlist
= ueventlist
;
3982 cont_args
->flags
= flags
;
3983 cont_args
->retval
= retval
;
3984 cont_args
->eventcount
= nevents
;
3985 cont_args
->eventout
= noutputs
;
3986 knote_fops(cont_args
->knote
)->f_post_register_wait(ut
, &knlc
, cont_args
);
3987 panic("f_post_register_wait returned (kev: %p)", &kev
);
3990 kev
.flags
|= EV_ERROR
;
3992 knote_unlock(kq
, knlc
.knlc_knote
, &knlc
, KNOTE_KQ_UNLOCK
);
3995 // keep in sync with kevent_register_wait_return()
3996 if (nevents
> 0 && (kev
.flags
& (EV_ERROR
|EV_RECEIPT
))) {
3997 if ((kev
.flags
& EV_ERROR
) == 0) {
3998 kev
.flags
|= EV_ERROR
;
4001 error
= kevent_copyout(&kev
, &ueventlist
, p
, flags
);
4006 } else if (kev
.flags
& EV_ERROR
) {
4012 /* short-circuit the scan if we only want error events */
4013 if (flags
& KEVENT_FLAG_ERROR_EVENTS
)
4016 /* process pending events */
4017 if (nevents
> 0 && noutputs
== 0 && error
== 0) {
4018 struct _kevent
*cont_args
;
4019 /* store the continuation/completion data in the uthread */
4020 ut
= (uthread_t
)get_bsdthread_info(thread
);
4021 cont_args
= &ut
->uu_save
.uus_kevent
;
4024 cont_args
->retval
= retval
;
4025 cont_args
->eventlist
= ueventlist
;
4026 cont_args
->eventcount
= nevents
;
4027 cont_args
->eventout
= noutputs
;
4028 cont_args
->data_available
= data_available
;
4029 cont_args
->process_data
.fp_fd
= (int)id
;
4030 cont_args
->process_data
.fp_flags
= flags
;
4031 cont_args
->process_data
.fp_data_out
= data_out
;
4032 cont_args
->process_data
.fp_data_size
= data_size
;
4033 cont_args
->process_data
.fp_data_resid
= data_size
;
4036 * kqworkloop_end_processing() will happen at the end of kqueue_scan()
4038 needs_end_processing
= false;
4040 error
= kqueue_scan(kq
, kevent_callback
,
4041 continuation
, cont_args
,
4042 &cont_args
->process_data
,
4045 /* process remaining outputs */
4046 noutputs
= cont_args
->eventout
;
4047 data_resid
= cont_args
->process_data
.fp_data_resid
;
4049 /* copyout residual data size value (if it needs to be copied out) */
4050 /* don't abandon other output just because of residual copyout failures */
4051 if (error
== 0 && data_available
&& data_resid
!= data_size
) {
4052 (void)kevent_put_data_size(p
, data_available
, flags
, data_resid
);
4057 if (__improbable(needs_end_processing
)) {
4059 * If we didn't through kqworkloop_end_processing(),
4060 * we need to do it here.
4063 kqworkloop_end_processing((struct kqworkloop
*)kq
, 0, 0);
4066 kevent_put_kq(p
, id
, fp
, kq
);
4068 /* don't restart after signals... */
4069 if (error
== ERESTART
)
4071 else if (error
== EWOULDBLOCK
)
4080 * kevent_callback - callback for each individual event
4082 * called with nothing locked
4083 * caller holds a reference on the kqueue
4086 kevent_callback(__unused
struct kqueue
*kq
, struct kevent_internal_s
*kevp
,
4089 struct _kevent
*cont_args
;
4092 cont_args
= (struct _kevent
*)data
;
4093 assert(cont_args
->eventout
< cont_args
->eventcount
);
4096 * Copy out the appropriate amount of event data for this user.
4098 error
= kevent_copyout(kevp
, &cont_args
->eventlist
, current_proc(),
4099 cont_args
->process_data
.fp_flags
);
4102 * If there isn't space for additional events, return
4103 * a harmless error to stop the processing here
4105 if (error
== 0 && ++cont_args
->eventout
== cont_args
->eventcount
)
4106 error
= EWOULDBLOCK
;
4111 * kevent_description - format a description of a kevent for diagnostic output
4113 * called with a 256-byte string buffer
4117 kevent_description(struct kevent_internal_s
*kevp
, char *s
, size_t n
)
4121 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
4135 kevent_register_validate_priority(struct kqueue
*kq
, struct knote
*kn
,
4136 struct kevent_internal_s
*kev
)
4138 /* We don't care about the priority of a disabled or deleted knote */
4139 if (kev
->flags
& (EV_DISABLE
| EV_DELETE
)) {
4143 if (kq
->kq_state
& KQ_WORKLOOP
) {
4145 * Workloops need valid priorities with a QOS (excluding manager) for
4146 * any enabled knote.
4148 * When it is pre-existing, just make sure it has a valid QoS as
4149 * kevent_register() will not use the incoming priority (filters who do
4150 * have the responsibility to validate it again, see filt_wltouch).
4152 * If the knote is being made, validate the incoming priority.
4154 if (!_pthread_priority_thread_qos(kn
? kn
->kn_qos
: kev
->qos
)) {
4163 * Prepare a filter for waiting after register.
4165 * The f_post_register_wait hook will be called later by kevent_register()
4166 * and should call kevent_register_wait_block()
4169 kevent_register_wait_prepare(struct knote
*kn
, struct kevent_internal_s
*kev
)
4171 thread_t thread
= current_thread();
4172 struct uthread
*uth
= get_bsdthread_info(thread
);
4174 assert(knote_fops(kn
)->f_extended_codes
);
4176 if (kn
->kn_hook
== NULL
) {
4177 thread_reference(thread
);
4178 kn
->kn_hook
= thread
;
4179 } else if (kn
->kn_hook
!= thread
) {
4181 * kn_hook may be set from a previous aborted wait
4182 * However, it has to be from the same thread.
4184 kev
->flags
|= EV_ERROR
;
4189 uth
->uu_save
.uus_kevent_register
.knote
= kn
;
4190 return FILTER_REGISTER_WAIT
;
4194 * Cleanup a kevent_register_wait_prepare() effect for threads that have been
4195 * aborted instead of properly woken up with thread_wakeup_thread().
4198 kevent_register_wait_cleanup(struct knote
*kn
)
4200 thread_t thread
= kn
->kn_hook
;
4202 thread_deallocate(thread
);
4206 * Must be called at the end of a f_post_register_wait call from a filter.
4209 kevent_register_wait_block(struct turnstile
*ts
, thread_t thread
,
4210 struct knote_lock_ctx
*knlc
, thread_continue_t cont
,
4211 struct _kevent_register
*cont_args
)
4213 knote_unlock(cont_args
->kq
, cont_args
->knote
, knlc
, KNOTE_KQ_UNLOCK
);
4214 turnstile_update_inheritor_complete(ts
, TURNSTILE_INTERLOCK_NOT_HELD
);
4215 cont_args
->handoff_thread
= thread
;
4216 thread_handoff_parameter(thread
, cont
, cont_args
);
4220 * Called by Filters using a f_post_register_wait to return from their wait.
4223 kevent_register_wait_return(struct _kevent_register
*cont_args
)
4225 struct kqueue
*kq
= cont_args
->kq
;
4226 proc_t p
= kq
->kq_p
;
4227 struct kevent_internal_s
*kev
= &cont_args
->kev
;
4230 if (cont_args
->handoff_thread
) {
4231 thread_deallocate(cont_args
->handoff_thread
);
4234 if (kev
->flags
& (EV_ERROR
|EV_RECEIPT
)) {
4235 if ((kev
->flags
& EV_ERROR
) == 0) {
4236 kev
->flags
|= EV_ERROR
;
4239 error
= kevent_copyout(kev
, &cont_args
->ueventlist
, p
, cont_args
->flags
);
4240 if (error
== 0) cont_args
->eventout
++;
4243 kevent_put_kq(p
, cont_args
->fd
, cont_args
->fp
, kq
);
4245 *cont_args
->retval
= cont_args
->eventout
;
4247 unix_syscall_return(error
);
4251 * kevent_register - add a new event to a kqueue
4253 * Creates a mapping between the event source and
4254 * the kqueue via a knote data structure.
4256 * Because many/most the event sources are file
4257 * descriptor related, the knote is linked off
4258 * the filedescriptor table for quick access.
4260 * called with nothing locked
4261 * caller holds a reference on the kqueue
4265 kevent_register(struct kqueue
*kq
, struct kevent_internal_s
*kev
,
4266 struct knote_lock_ctx
*knlc
)
4268 struct proc
*p
= kq
->kq_p
;
4269 const struct filterops
*fops
;
4270 struct knote
*kn
= NULL
;
4271 int result
= 0, error
= 0;
4272 unsigned short kev_flags
= kev
->flags
;
4274 if (kev
->filter
< 0) {
4275 if (kev
->filter
+ EVFILT_SYSCOUNT
< 0) {
4279 fops
= sysfilt_ops
[~kev
->filter
]; /* to 0-base index */
4285 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4286 if ((kev
->flags
& EV_VANISHED
) &&
4287 (kev
->flags
& (EV_ADD
| EV_DISPATCH2
)) != (EV_ADD
| EV_DISPATCH2
)) {
4292 /* Simplify the flags - delete and disable overrule */
4293 if (kev
->flags
& EV_DELETE
)
4294 kev
->flags
&= ~EV_ADD
;
4295 if (kev
->flags
& EV_DISABLE
)
4296 kev
->flags
&= ~EV_ENABLE
;
4298 if (kq
->kq_state
& KQ_WORKLOOP
) {
4299 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER
),
4300 ((struct kqworkloop
*)kq
)->kqwl_dynamicid
,
4301 kev
->udata
, kev
->flags
, kev
->filter
);
4302 } else if (kq
->kq_state
& KQ_WORKQ
) {
4303 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER
),
4304 0, kev
->udata
, kev
->flags
, kev
->filter
);
4306 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER
),
4307 VM_KERNEL_UNSLIDE_OR_PERM(kq
),
4308 kev
->udata
, kev
->flags
, kev
->filter
);
4312 /* find the matching knote from the fd tables/hashes */
4313 kn
= kq_find_knote_and_kq_lock(kq
, kev
, fops
->f_isfd
, p
);
4314 error
= kevent_register_validate_priority(kq
, kn
, kev
);
4320 if (kn
== NULL
&& (kev
->flags
& EV_ADD
) == 0) {
4322 * No knote found, EV_ADD wasn't specified
4325 if ((kev_flags
& EV_ADD
) && (kev_flags
& EV_DELETE
) &&
4326 (kq
->kq_state
& KQ_WORKLOOP
)) {
4328 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
4329 * that doesn't care about ENOENT, so just pretend the deletion
4337 } else if (kn
== NULL
) {
4339 * No knote found, need to attach a new one (attach)
4342 struct fileproc
*knote_fp
= NULL
;
4344 /* grab a file reference for the new knote */
4346 if ((error
= fp_lookup(p
, kev
->ident
, &knote_fp
, 0)) != 0) {
4354 if (knote_fp
!= NULL
)
4355 fp_drop(p
, kev
->ident
, knote_fp
, 0);
4359 kn
->kn_fp
= knote_fp
;
4360 kn
->kn_kq_packed
= (intptr_t)(struct kqueue
*)kq
;
4361 kqueue_retain(kq
); /* retain a kq ref */
4362 kn
->kn_filtid
= ~kev
->filter
;
4363 kn
->kn_status
= KN_ATTACHING
| KN_ATTACHED
;
4365 /* was vanish support requested */
4366 if (kev
->flags
& EV_VANISHED
) {
4367 kev
->flags
&= ~EV_VANISHED
;
4368 kn
->kn_status
|= KN_REQVANISH
;
4371 /* snapshot matching/dispatching protcol flags into knote */
4372 if (kev
->flags
& EV_DISPATCH
)
4373 kn
->kn_status
|= KN_DISPATCH
;
4374 if (kev
->flags
& EV_UDATA_SPECIFIC
)
4375 kn
->kn_status
|= KN_UDATA_SPECIFIC
;
4376 if (kev
->flags
& EV_DISABLE
)
4377 kn
->kn_status
|= KN_DISABLED
;
4380 * copy the kevent state into knote
4381 * protocol is that fflags and data
4382 * are saved off, and cleared before
4383 * calling the attach routine.
4385 kn
->kn_kevent
= *kev
;
4386 kn
->kn_sfflags
= kev
->fflags
;
4387 kn
->kn_sdata
= kev
->data
;
4390 knote_reset_priority(kn
, kev
->qos
);
4392 /* Add the knote for lookup thru the fd table */
4393 error
= kq_add_knote(kq
, kn
, knlc
, p
);
4395 (void)kqueue_release(kq
, KQUEUE_CANT_BE_LAST_REF
);
4397 if (knote_fp
!= NULL
)
4398 fp_drop(p
, kev
->ident
, knote_fp
, 0);
4400 if (error
== ERESTART
) {
4406 /* fp reference count now applies to knote */
4409 * we can't use filter_call() because f_attach can change the filter ops
4410 * for a filter that supports f_extended_codes, so we need to reload
4411 * knote_fops() and not use `fops`.
4413 result
= fops
->f_attach(kn
, kev
);
4414 if (result
&& !knote_fops(kn
)->f_extended_codes
) {
4415 result
= FILTER_ACTIVE
;
4420 if (kn
->kn_flags
& EV_ERROR
) {
4422 * Failed to attach correctly, so drop.
4424 kn
->kn_status
&= ~(KN_ATTACHED
| KN_ATTACHING
);
4425 error
= kn
->kn_data
;
4426 knote_drop(kq
, kn
, knlc
);
4432 * end "attaching" phase - now just attached
4434 * Mark the thread request overcommit, if appropos
4436 * If the attach routine indicated that an
4437 * event is already fired, activate the knote.
4439 kn
->kn_status
&= ~KN_ATTACHING
;
4440 knote_set_qos_overcommit(kn
);
4442 if (result
& FILTER_ACTIVE
) {
4443 if (result
& FILTER_ADJUST_EVENT_QOS_BIT
)
4444 knote_adjust_qos(kq
, kn
, result
);
4448 } else if (!knote_lock(kq
, kn
, knlc
, KNOTE_KQ_LOCK_ON_SUCCESS
)) {
4451 * The knote was dropped while we were waiting for the lock,
4452 * we need to re-evaluate entirely
4457 } else if (kev
->flags
& EV_DELETE
) {
4459 * Deletion of a knote (drop)
4461 * If the filter wants to filter drop events, let it do so.
4463 * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
4464 * we must wait for the knote to be re-enabled (unless it is being
4465 * re-enabled atomically here).
4468 if (knote_fops(kn
)->f_allow_drop
) {
4472 drop
= knote_fops(kn
)->f_allow_drop(kn
, kev
);
4475 if (!drop
) goto out_unlock
;
4478 if ((kev
->flags
& EV_ENABLE
) == 0 &&
4479 (kn
->kn_status
& (KN_DISPATCH2
| KN_DISABLED
)) ==
4480 (KN_DISPATCH2
| KN_DISABLED
)) {
4481 kn
->kn_status
|= KN_DEFERDELETE
;
4482 error
= EINPROGRESS
;
4486 knote_drop(kq
, kn
, knlc
);
4491 * Regular update of a knote (touch)
4493 * Call touch routine to notify filter of changes in filter values
4494 * (and to re-determine if any events are fired).
4496 * If the knote is in defer-delete, avoid calling the filter touch
4497 * routine (it has delivered its last event already).
4499 * If the touch routine had no failure,
4500 * apply the requested side effects to the knote.
4503 if (kn
->kn_status
& (KN_DEFERDELETE
| KN_VANISHED
)) {
4504 if (kev
->flags
& EV_ENABLE
) {
4505 result
= FILTER_ACTIVE
;
4509 result
= filter_call(knote_fops(kn
), f_touch(kn
, kev
));
4513 if (kev
->flags
& EV_ERROR
) {
4516 /* accept new kevent state */
4517 if ((kn
->kn_status
& KN_UDATA_SPECIFIC
) == 0)
4518 kn
->kn_udata
= kev
->udata
;
4519 if (kev
->flags
& EV_DISABLE
)
4521 if (result
& (FILTER_UPDATE_REQ_QOS
| FILTER_ADJUST_EVENT_QOS_BIT
))
4523 if ((result
& FILTER_UPDATE_REQ_QOS
) &&
4524 kev
->qos
&& kev
->qos
!= kn
->kn_qos
) {
4525 knote_reset_priority(kn
, kev
->qos
);
4527 if (result
& FILTER_ACTIVE
) {
4529 if (result
& FILTER_ADJUST_EVENT_QOS_BIT
) {
4530 if (knote_should_apply_qos_override(kq
, kn
, result
, &qos
)) {
4531 knote_apply_qos_override(kn
, qos
);
4536 if (result
& (FILTER_UPDATE_REQ_QOS
| FILTER_ADJUST_EVENT_QOS_BIT
)) {
4537 if (knote_enqueue(kn
) && (kn
->kn_status
& KN_ACTIVE
)) {
4541 if (kev
->flags
& EV_ENABLE
)
4547 if ((result
& FILTER_REGISTER_WAIT
) == 0) {
4549 * When the filter asked for a post-register wait,
4550 * we leave the knote and kqueue locked for kevent_register()
4551 * to call the filter's f_post_register_wait hook.
4553 knote_unlock(kq
, kn
, knlc
, KNOTE_KQ_UNLOCK
);
4557 /* output local errors through the kevent */
4559 kev
->flags
|= EV_ERROR
;
/*
 * knote_process - process a triggered event
 *
 * Validate that it is really still a triggered event
 * by calling the filter routines (if necessary).  Hold
 * a use reference on the knote to avoid it being detached.
 *
 * If it is still considered triggered, we will have taken
 * a copy of the state under the filter lock.  We use that
 * snapshot to dispatch the knote for future processing (or
 * not, if this was a lost event).
 *
 * Our caller assures us that nobody else can be processing
 * events from this knote during the whole operation.  But
 * others can be touching or posting events to the knote
 * interspersed with our processing it.
 *
 * caller holds a reference on the kqueue.
 * kqueue locked on entry and exit - but may be dropped
 */
static int
knote_process(struct knote *kn,
	kevent_callback_t callback,
	void *callback_data,
	struct filt_process_s *process_data)
{
	struct kevent_internal_s kev;
	struct kqueue *kq = knote_get_kq(kn);
	KNOTE_LOCK_CTX(knlc);
	int result = FILTER_ACTIVE;
	int error = 0;
	bool drop = false;

	bzero(&kev, sizeof(kev));

	/*
	 * Must be active or stayactive
	 * Must be queued and not disabled/suppressed
	 */
	assert(kn->kn_status & KN_QUEUED);
	assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE));
	assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING)));

	if (kq->kq_state & KQ_WORKLOOP) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
				((struct kqworkloop *)kq)->kqwl_dynamicid,
				kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
				kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKQ) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
				0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
				kn->kn_filtid);
	} else {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
				VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
				kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
	}

	if ((kn->kn_status & KN_DROPPING) ||
			!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
		/*
		 * When the knote is dropping or has dropped,
		 * then there's nothing we want to process.
		 */
		return EJUSTRETURN;
	}

	/*
	 * For deferred-drop or vanished events, we just create a fake
	 * event to acknowledge end-of-life.  Otherwise, we call the
	 * filter's process routine to snapshot the kevent state under
	 * the filter's locking protocol.
	 *
	 * suppress knotes to avoid returning the same event multiple times in
	 * a single call.
	 */
	knote_suppress(kn);

	if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
		/* create fake event */
		kev.filter = kn->kn_filter;
		kev.ident = kn->kn_id;
		kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED;
		kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
		kev.udata = kn->kn_udata;
	} else {
		/* deactivate - so new activations indicate a wakeup */
		knote_deactivate(kn);

		kqunlock(kq);
		result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev));
		kqlock(kq);
	}

	/*
	 * Determine how to dispatch the knote for future event handling.
	 * not-fired: just return (do not callout, leave deactivated).
	 * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
	 *            is the deferred delete event delivery itself).  Otherwise,
	 *            drop it.
	 * Dispatch:  don't clear state, just mark it disabled.
	 * Cleared:   just leave it deactivated.
	 * Others:    re-activate as there may be more events to handle.
	 *            This will not wake up more handlers right now, but
	 *            at the completion of handling events it may trigger
	 *            more handler threads (TODO: optimize based on more than
	 *            just this one event being detected by the filter).
	 */
	if ((result & FILTER_ACTIVE) == 0) {
		if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
			/*
			 * Stay active knotes should not be unsuppressed or we'd create an
			 * infinite loop.
			 *
			 * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
			 * within f_process() but that doesn't necessarily make them
			 * ready to process, so we should leave them be.
			 *
			 * For other knotes, since we will not return an event,
			 * there's no point keeping the knote suppressed.
			 */
			knote_unsuppress(kn);
		}
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
		return EJUSTRETURN;
	}

	if (result & FILTER_ADJUST_EVENT_QOS_BIT)
		knote_adjust_qos(kq, kn, result);
	kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);

	if (kev.flags & EV_ONESHOT) {
		if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
			/* defer dropping non-delete oneshot dispatch2 events */
			kn->kn_status |= KN_DEFERDELETE;
			knote_disable(kn);
		} else {
			drop = true;
		}
	} else if (kn->kn_status & KN_DISPATCH) {
		/* disable all dispatch knotes */
		knote_disable(kn);
	} else if ((kev.flags & EV_CLEAR) == 0) {
		/* re-activate in case there are more events */
		knote_activate(kn);
	}

	/*
	 * callback to handle each event as we find it.
	 * If we have to detach and drop the knote, do
	 * it while we have the kq unlocked.
	 */
	if (drop) {
		knote_drop(kq, kn, &knlc);
	} else {
		knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
	}

	if (kev.flags & EV_VANISHED) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
				kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
				kn->kn_filtid);
	}

	error = (callback)(kq, &kev, callback_data);
	kqlock(kq);
	return error;
}
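/*
 * Illustrative sketch (not part of the kernel build): the dispatch logic above
 * is what userspace observes through the public kevent(2) flags.  A minimal
 * userspace program, assuming only the public <sys/event.h> API, showing that
 * an EV_DISPATCH knote is disabled after each delivery until re-enabled with
 * EV_ENABLE -- the "disable all dispatch knotes" branch above.
 */
#if 0 /* userspace example only */
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
	int kq = kqueue();
	int p[2];
	struct kevent kev;
	struct timespec zero = { 0, 0 };

	pipe(p);
	write(p[1], "x", 1);

	/* register the read end with EV_DISPATCH */
	EV_SET(&kev, p[0], EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	/* first scan delivers the event ... */
	printf("first scan:  %d\n", kevent(kq, NULL, 0, &kev, 1, &zero));  /* 1 */

	/* ... after which knote_process() left the knote disabled */
	printf("second scan: %d\n", kevent(kq, NULL, 0, &kev, 1, &zero));  /* 0 */

	/* EV_ENABLE re-arms it and the still-pending data fires again */
	EV_SET(&kev, p[0], EVFILT_READ, EV_ENABLE, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);
	printf("re-enabled:  %d\n", kevent(kq, NULL, 0, &kev, 1, &zero));  /* 1 */
	return 0;
}
#endif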
/*
 * Returns -1 if the kqueue was unbound and processing should not happen
 */
#define KQWQAE_BEGIN_PROCESSING 1
#define KQWQAE_END_PROCESSING   2
#define KQWQAE_UNBIND           3
static int
kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr,
		int kevent_flags, int kqwqae_op)
{
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
	thread_t thread = kqr->kqr_thread;
	struct knote *kn;
	int rc = 0;
	bool seen_stayactive = false, unbind;

	kqlock_held(&kqwq->kqwq_kqueue);

	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
		/*
		 * Return suppressed knotes to their original state.
		 * For workq kqueues, suppressed ones that are still
		 * truly active (not just forced into the queue) will
		 * set flags we check below to see if anything got
		 * woken up.
		 */
		while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) {
			assert(kn->kn_status & KN_SUPPRESSED);
			knote_unsuppress(kn);
			if (kn->kn_status & KN_STAYACTIVE) {
				seen_stayactive = true;
			}
		}
	}

	kq_req_lock(kqwq);

#if DEBUG || DEVELOPMENT
	thread_t self = current_thread();
	struct uthread *ut = get_bsdthread_info(self);

	assert(kqr->kqr_state & KQR_THREQUESTED);
	assert(kqr->kqr_thread == self);
	assert(ut->uu_kqr_bound == kqr);
#endif // DEBUG || DEVELOPMENT

	if (kqwqae_op == KQWQAE_UNBIND) {
		unbind = true;
	} else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
		unbind = false;
	} else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) {
		/*
		 * When we unsuppress stayactive knotes, for the kind that are hooked
		 * through select, we need to process once before we can assert there's
		 * no event pending.  Hence we can't unbind during BEGIN PROCESSING.
		 */
		unbind = false;
	} else {
		unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0);
	}
	if (unbind) {
		old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
		rc = -1;
		/*
		 * request a new thread if we didn't process the whole queue or real events
		 * have happened (not just putting stay-active events back).
		 */
		if (kqr->kqr_state & KQR_WAKEUP) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
					kqr->kqr_qos_index, 0);
		}
	}
	if (rc == 0) {
		/*
		 * Reset wakeup bit to notice events firing while we are processing,
		 * as we cannot rely on the bucket queue emptiness because of stay
		 * active knotes.
		 */
		kqr->kqr_state &= ~KQR_WAKEUP;
	}

	kq_req_unlock(kqwq);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

	return rc;
}
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
static int
kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
		int kevent_flags)
{
	int rc = 0;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
			0, kqr->kqr_qos_index);

	rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
			KQWQAE_BEGIN_PROCESSING);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
			thread_tid(kqr->kqr_thread), kqr->kqr_state);

	return rc;
}
static bool
kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;

	kqlock_held(kq);

	if (kq->kq_state & KQ_PROCESSING) {
		/*
		 * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
		 * never modified while KQ_PROCESSING is set, meaning that peeking at
		 * its value is safe from this context.
		 */
		return kqwl->kqwl_request.kqr_thread == current_thread();
	}

	return false;
}
static kq_index_t
kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	kq_index_t qos = THREAD_QOS_UNSPECIFIED;
	struct knote *kn, *tmp;

	kqlock_held(&kqwl->kqwl_kqueue);

	TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
		/*
		 * If a knote that can adjust QoS is disabled because of the automatic
		 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
		 * further overrides keep pushing.
		 */
		if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
				(kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
				(kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
			qos = MAX(qos, knote_get_qos_override_index(kn));
			continue;
		}
		knote_unsuppress(kn);
	}

	return qos;
}
static int
kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
	thread_t thread = kqr->kqr_thread;
	int rc = 0, op = KQWL_UTQ_NONE;

	kqlock_held(kq);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
			kqwl->kqwl_dynamicid, 0, 0);

	/* nobody else should still be processing */
	assert((kq->kq_state & KQ_PROCESSING) == 0);

	kq->kq_state |= KQ_PROCESSING;

	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
		op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
	}

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * When "parking" we want to process events and if no events are found
		 * unbind.
		 *
		 * However, non overcommit threads sometimes park even when they have
		 * more work so that the pool can narrow.  For these, we need to unbind
		 * early, so that calling kqworkloop_update_threads_qos() can ask the
		 * workqueue subsystem whether the thread should park despite having
		 * pending events.
		 */
		if (kqr->kqr_state & KQR_THOVERCOMMIT) {
			op = KQWL_UTQ_PARKING;
		} else {
			op = KQWL_UTQ_UNBINDING;
		}
	}
	if (op == KQWL_UTQ_NONE) {
		goto done;
	}

	qos_override = kqworkloop_acknowledge_events(kqwl);

	kq_req_lock(kqwl);

	if (op == KQWL_UTQ_UNBINDING) {
		old_override = kqworkloop_unbind_locked(kqwl, thread);
		(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
	}
	kqworkloop_update_threads_qos(kqwl, op, qos_override);
	if (op == KQWL_UTQ_PARKING) {
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			/*
			 * We cannot trust KQR_WAKEUP when looking at stay active knotes.
			 * We need to process once, and kqworkloop_end_processing will
			 * handle the unbind.
			 */
		} else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) {
			old_override = kqworkloop_unbind_locked(kqwl, thread);
			(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
			rc = -1;
		}
	} else if (op == KQWL_UTQ_UNBINDING) {
		if (kqr->kqr_thread == thread) {
			/*
			 * The thread request fired again, passed the admission check and
			 * got bound to the current thread again.
			 */
		} else {
			rc = -1;
		}
	}

	if (rc == 0) {
		/*
		 * Reset wakeup bit to notice stay active events firing while we are
		 * processing, as we cannot rely on the stayactive bucket emptiness.
		 */
		kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
	} else {
		kq->kq_state &= ~KQ_PROCESSING;
	}

	kq_req_unlock(kqwl);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

done:
	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
			kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
/*
 * Return 0 to indicate that processing should proceed,
 * -1 if there is nothing to process.
 *
 * Called with kqueue locked and returns the same way,
 * but may drop lock temporarily.
 */
static int
kqfile_begin_processing(struct kqueue *kq)
{
	struct kqtailq *suppressq;

	kqlock_held(kq);

	assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
			VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/* wait to become the exclusive processing thread */
	for (;;) {
		if (kq->kq_state & KQ_DRAIN) {
			KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
					VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
			return -1;
		}

		if ((kq->kq_state & KQ_PROCESSING) == 0)
			break;

		/* if someone else is processing the queue, wait */
		kq->kq_state |= KQ_PROCWAIT;
		suppressq = kqueue_get_suppressed_queue(kq, NULL);
		waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
				CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
				TIMEOUT_WAIT_FOREVER);

		kqunlock(kq);
		thread_block(THREAD_CONTINUE_NULL);
		kqlock(kq);
	}

	/* Nobody else processing */

	/* clear pre-posts and KQ_WAKEUP now, in case we bail early */
	waitq_set_clear_preposts(&kq->kq_wqs);
	kq->kq_state &= ~KQ_WAKEUP;

	/* anything left to process? */
	if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) {
		KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
				VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
		return -1;
	}

	/* convert to processing mode */
	kq->kq_state |= KQ_PROCESSING;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
			VM_KERNEL_UNSLIDE_OR_PERM(kq));

	return 0;
}
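/*
 * Hedged illustration (not kernel code): the KQ_PROCESSING/KQ_PROCWAIT dance
 * above is the classic "single exclusive processor, late arrivals wait" shape.
 * The sketch below shows the same shape with pthread primitives; the names
 * (processing_gate, gate_begin, gate_end) are made up for the example and the
 * condition variable stands in for the suppression-queue wait event.
 */
#if 0 /* userspace illustration only */
#include <pthread.h>
#include <stdbool.h>

struct processing_gate {
	pthread_mutex_t lock;
	pthread_cond_t  wake;	/* analogous to the wait on the suppression queue */
	bool            busy;	/* analogous to KQ_PROCESSING */
};

static void
gate_begin(struct processing_gate *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->busy) {
		/* someone else is processing: wait and re-check (cf. KQ_PROCWAIT) */
		pthread_cond_wait(&g->wake, &g->lock);
	}
	g->busy = true;		/* we are now the exclusive processor */
	pthread_mutex_unlock(&g->lock);
}

static void
gate_end(struct processing_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->busy = false;
	pthread_cond_broadcast(&g->wake);	/* wake any thread parked in gate_begin() */
	pthread_mutex_unlock(&g->lock);
}
#endif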
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is setup again so that it is ready to be
 * processed.
 */
static int
kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
		int kevent_flags)
{
	if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) {
		/* remember we didn't process everything */
		kq_req_lock(kqwq);
		kqr->kqr_state |= KQR_WAKEUP;
		kq_req_unlock(kqwq);
	}

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		/*
		 * if acknowledge events "succeeds" it means there are events,
		 * which is a failure condition for end_processing.
		 */
		int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
				KQWQAE_END_PROCESSING);
		if (rc == 0) {
			return -1;
		}
	}

	return 0;
}
/*
 * Try to end the processing, only called when a workq thread is attempting to
 * park (KEVENT_FLAG_PARKING is set).
 *
 * When returning -1, the kqworkq is setup again so that it is ready to be
 * processed (as if kqworkloop_begin_processing had just been called).
 *
 * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
 * the kqworkloop is unbound from its servicer as a side effect.
 */
static int
kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	struct kqrequest *kqr = &kqwl->kqwl_request;
	thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
	thread_t thread = kqr->kqr_thread;
	int rc = 0;

	kqlock_held(kq);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
			kqwl->kqwl_dynamicid, 0, 0);

	if (flags & KQ_PROCESSING) {
		assert(kq->kq_state & KQ_PROCESSING);

		/*
		 * If we still have queued stayactive knotes, remember we didn't finish
		 * processing all of them.  This should be extremely rare and would
		 * require to have a lot of them registered and fired.
		 */
		if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
			kq_req_lock(kqwl);
			kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
					KQWL_BUCKET_STAYACTIVE);
			kq_req_unlock(kqwl);
		}

		/*
		 * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
		 * still under the lock.
		 *
		 * So we do everything kqworkloop_unbind() would do, but because we're
		 * inside kqueue_process(), if the workloop actually received events
		 * while our locks were dropped, we have the opportunity to fail the end
		 * processing and loop again.
		 *
		 * This avoids going through the process-wide workqueue lock hence
		 * scales better.
		 */
		if (kevent_flags & KEVENT_FLAG_PARKING) {
			qos_override = kqworkloop_acknowledge_events(kqwl);
		}
	}

	kq_req_lock(kqwl);

	if (kevent_flags & KEVENT_FLAG_PARKING) {
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
		if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) {
			/*
			 * Reset wakeup bit to notice stay active events firing while we are
			 * processing, as we cannot rely on the stayactive bucket emptiness.
			 */
			kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
			rc = -1;
		} else {
			old_override = kqworkloop_unbind_locked(kqwl, thread);
			(void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
			kq->kq_state &= ~flags;
		}
	} else {
		kq->kq_state &= ~flags;
		kqr->kqr_state |= KQR_R2K_NOTIF_ARMED;
		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
	}

	kq_req_unlock(kqwl);

	if (old_override) {
		thread_drop_ipc_override(thread);
	}

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
			kqwl->kqwl_dynamicid, 0, 0);

	return rc;
}
/*
 * Called with kqueue lock held.
 */
static void
kqfile_end_processing(struct kqueue *kq)
{
	struct knote *kn;
	struct kqtailq *suppressq;
	int procwait;

	kqlock_held(kq);

	assert((kq->kq_state & (KQ_WORKQ|KQ_WORKLOOP)) == 0);

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
			VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);

	/*
	 * Return suppressed knotes to their original state.
	 */
	suppressq = kqueue_get_suppressed_queue(kq, NULL);
	while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
		assert(kn->kn_status & KN_SUPPRESSED);
		knote_unsuppress(kn);
	}

	procwait = (kq->kq_state & KQ_PROCWAIT);
	kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);

	if (procwait) {
		/* first wake up any thread already waiting to process */
		waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
				CAST_EVENT64_T(suppressq),
				THREAD_AWAKENED,
				WAITQ_ALL_PRIORITIES);
	}
}
5220 kqueue_workloop_ctl_internal(proc_t p
, uintptr_t cmd
, uint64_t __unused options
,
5221 struct kqueue_workloop_params
*params
, int *retval
)
5225 struct fileproc
*fp
;
5227 struct kqworkloop
*kqwl
;
5228 struct filedesc
*fdp
= p
->p_fd
;
5229 workq_threadreq_param_t trp
= { };
5232 case KQ_WORKLOOP_CREATE
:
5233 if (!params
->kqwlp_flags
) {
5238 if ((params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_SCHED_PRI
) &&
5239 (params
->kqwlp_sched_pri
< 1 ||
5240 params
->kqwlp_sched_pri
> 63 /* MAXPRI_USER */)) {
5245 if ((params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_SCHED_POL
) &&
5246 invalid_policy(params
->kqwlp_sched_pol
)) {
5251 if ((params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_CPU_PERCENT
) &&
5252 (params
->kqwlp_cpu_percent
<= 0 ||
5253 params
->kqwlp_cpu_percent
> 100 ||
5254 params
->kqwlp_cpu_refillms
<= 0 ||
5255 params
->kqwlp_cpu_refillms
> 0x00ffffff)) {
5260 if (params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_SCHED_PRI
) {
5261 trp
.trp_flags
|= TRP_PRIORITY
;
5262 trp
.trp_pri
= params
->kqwlp_sched_pri
;
5264 if (params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_SCHED_POL
) {
5265 trp
.trp_flags
|= TRP_POLICY
;
5266 trp
.trp_pol
= params
->kqwlp_sched_pol
;
5268 if (params
->kqwlp_flags
& KQ_WORKLOOP_CREATE_CPU_PERCENT
) {
5269 trp
.trp_flags
|= TRP_CPUPERCENT
;
5270 trp
.trp_cpupercent
= (uint8_t)params
->kqwlp_cpu_percent
;
5271 trp
.trp_refillms
= params
->kqwlp_cpu_refillms
;
5274 error
= kevent_get_kq(p
, params
->kqwlp_id
, &trp
,
5275 KEVENT_FLAG_DYNAMIC_KQUEUE
| KEVENT_FLAG_WORKLOOP
|
5276 KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST
, &fp
, &fd
, &kq
);
5281 if (!(fdp
->fd_flags
& FD_WORKLOOP
)) {
5282 /* FD_WORKLOOP indicates we've ever created a workloop
5283 * via this syscall but its only ever added to a process, never
5287 fdp
->fd_flags
|= FD_WORKLOOP
;
5291 case KQ_WORKLOOP_DESTROY
:
5292 error
= kevent_get_kq(p
, params
->kqwlp_id
, NULL
,
5293 KEVENT_FLAG_DYNAMIC_KQUEUE
| KEVENT_FLAG_WORKLOOP
|
5294 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST
, &fp
, &fd
, &kq
);
5299 kqwl
= (struct kqworkloop
*)kq
;
5300 trp
.trp_value
= kqwl
->kqwl_params
;
5301 if (trp
.trp_flags
&& !(trp
.trp_flags
& TRP_RELEASED
)) {
5302 trp
.trp_flags
|= TRP_RELEASED
;
5303 kqueue_release(kq
, KQUEUE_CANT_BE_LAST_REF
);
5308 kqueue_release_last(p
, kq
);
int
kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
{
	struct kqueue_workloop_params params = {
		.kqwlp_id = 0,
	};
	if (uap->sz < sizeof(params.kqwlp_version)) {
		return EINVAL;
	}

	size_t copyin_sz = MIN(sizeof(params), uap->sz);
	int rv = copyin(uap->addr, &params, copyin_sz);
	if (rv) {
		return rv;
	}

	if (params.kqwlp_version != (int)uap->sz) {
		return EINVAL;
	}

	return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
			retval);
}
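/*
 * Hedged sketch (not kernel code): kqueue_workloop_ctl() above uses a
 * size-versioned argument structure -- copy in no more than both sides
 * understand, then require the embedded version field to equal the size the
 * caller passed.  The struct and function names below (versioned_args,
 * example_ctl) are invented for the illustration.
 */
#if 0 /* userspace illustration only */
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include <errno.h>

struct versioned_args {
	int      version;	/* caller sets this to sizeof() the struct it built */
	uint64_t id;
	uint32_t flags;
};

static int
example_ctl(const void *uaddr, size_t usz)
{
	struct versioned_args args = { 0 };

	if (usz < sizeof(args.version)) {
		return EINVAL;
	}

	/* copy only what both sides agree on (copyin() plays this role above) */
	size_t copy_sz = usz < sizeof(args) ? usz : sizeof(args);
	memcpy(&args, uaddr, copy_sz);

	/* reject callers whose declared version doesn't match what they passed */
	if (args.version != (int)usz) {
		return EINVAL;
	}
	return 0;
}
#endif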
5340 * kqueue_process - process the triggered events in a kqueue
5342 * Walk the queued knotes and validate that they are really still triggered
5343 * events by calling the filter routines (if necessary).
5345 * For each event that is still considered triggered, invoke the callback
5348 * caller holds a reference on the kqueue.
5349 * kqueue locked on entry and exit - but may be dropped
5350 * kqueue list locked (held for duration of call)
5353 kqueue_process(struct kqueue
*kq
,
5354 kevent_callback_t callback
,
5355 void *callback_data
,
5356 struct filt_process_s
*process_data
,
5359 struct uthread
*ut
= get_bsdthread_info(current_thread());
5360 struct kqrequest
*kqr
= ut
->uu_kqr_bound
;
5362 unsigned int flags
= process_data
? process_data
->fp_flags
: 0;
5363 int nevents
= 0, error
= 0, rc
= 0;
5364 struct kqtailq
*base_queue
, *queue
;
5365 kqueue_t kqu
= { .kq
= kq
};
5366 #if DEBUG || DEVELOPMENT
5370 if (kq
->kq_state
& KQ_WORKQ
) {
5371 if (kqr
== NULL
|| (kqr
->kqr_state
& KQR_WORKLOOP
)) {
5374 rc
= kqworkq_begin_processing(kqu
.kqwq
, kqr
, flags
);
5375 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
5376 if (ut
->uu_kqr_bound
!= &kqu
.kqwl
->kqwl_request
) {
5379 rc
= kqworkloop_begin_processing(kqu
.kqwl
, flags
);
5381 rc
= kqfile_begin_processing(kq
);
5385 /* Nothing to process */
5391 * loop through the enqueued knotes associated with this request,
5392 * processing each one. Each request may have several queues
5393 * of knotes to process (depending on the type of kqueue) so we
5394 * have to loop through all the queues as long as we have additional
5399 if (kq
->kq_state
& KQ_WORKQ
) {
5400 base_queue
= queue
= &kqu
.kqwq
->kqwq_queue
[kqr
->kqr_qos_index
];
5401 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
5402 base_queue
= &kqu
.kqwl
->kqwl_queue
[0];
5403 queue
= &kqu
.kqwl
->kqwl_queue
[KQWL_NBUCKETS
- 1];
5405 base_queue
= queue
= &kq
->kq_queue
[QOS_INDEX_KQFILE
];
5409 while (error
== 0 && (kn
= TAILQ_FIRST(queue
)) != NULL
) {
5410 error
= knote_process(kn
, callback
, callback_data
, process_data
);
5411 if (error
== EJUSTRETURN
) {
5416 /* error is EWOULDBLOCK when the out event array is full */
5419 if (error
== EWOULDBLOCK
) {
5420 /* break out if no more space for additional events */
5424 } while (queue
-- > base_queue
);
5429 * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
5430 * we want to unbind the kqrequest from the thread.
5432 * However, because the kq locks are dropped several times during process,
5433 * new knotes may have fired again, in which case, we want to fail the end
5434 * processing and process again, until it converges.
5436 * If we returned events however, end processing never fails.
5438 if (error
|| nevents
) flags
&= ~KEVENT_FLAG_PARKING
;
5439 if (kq
->kq_state
& KQ_WORKQ
) {
5440 rc
= kqworkq_end_processing(kqu
.kqwq
, kqr
, flags
);
5441 } else if (kq
->kq_state
& KQ_WORKLOOP
) {
5442 rc
= kqworkloop_end_processing(kqu
.kqwl
, KQ_PROCESSING
, flags
);
5444 kqfile_end_processing(kq
);
5448 assert(flags
& KEVENT_FLAG_PARKING
);
5449 #if DEBUG || DEVELOPMENT
5450 if (retries
-- == 0) {
5451 panic("kevent: way too many knote_process retries, kq: %p (0x%02x)",
5461 kqueue_scan_continue(void *data
, wait_result_t wait_result
)
5463 thread_t self
= current_thread();
5464 uthread_t ut
= (uthread_t
)get_bsdthread_info(self
);
5465 struct _kqueue_scan
* cont_args
= &ut
->uu_save
.uus_kqueue_scan
;
5466 struct kqueue
*kq
= (struct kqueue
*)data
;
5467 struct filt_process_s
*process_data
= cont_args
->process_data
;
5471 /* convert the (previous) wait_result to a proper error */
5472 switch (wait_result
) {
5473 case THREAD_AWAKENED
: {
5476 error
= kqueue_process(kq
, cont_args
->call
, cont_args
->data
,
5477 process_data
, &count
);
5478 if (error
== 0 && count
== 0) {
5479 if (kq
->kq_state
& KQ_DRAIN
) {
5484 if (kq
->kq_state
& KQ_WAKEUP
)
5487 waitq_assert_wait64((struct waitq
*)&kq
->kq_wqs
,
5488 KQ_EVENT
, THREAD_ABORTSAFE
,
5489 cont_args
->deadline
);
5490 kq
->kq_state
|= KQ_SLEEP
;
5492 thread_block_parameter(kqueue_scan_continue
, kq
);
5497 case THREAD_TIMED_OUT
:
5498 error
= EWOULDBLOCK
;
5500 case THREAD_INTERRUPTED
:
5503 case THREAD_RESTART
:
5508 panic("%s: - invalid wait_result (%d)", __func__
,
5513 /* call the continuation with the results */
5514 assert(cont_args
->cont
!= NULL
);
5515 (cont_args
->cont
)(kq
, cont_args
->data
, error
);
5520 * kqueue_scan - scan and wait for events in a kqueue
5522 * Process the triggered events in a kqueue.
5524 * If there are no events triggered arrange to
5525 * wait for them. If the caller provided a
5526 * continuation routine, then kevent_scan will
5529 * The callback routine must be valid.
5530 * The caller must hold a use-count reference on the kq.
5533 kqueue_scan(struct kqueue
*kq
,
5534 kevent_callback_t callback
,
5535 kqueue_continue_t continuation
,
5536 void *callback_data
,
5537 struct filt_process_s
*process_data
,
5538 struct timeval
*atvp
,
5539 __unused
struct proc
*p
)
5541 thread_continue_t cont
= THREAD_CONTINUE_NULL
;
5548 assert(callback
!= NULL
);
5551 * Determine which QoS index we are servicing
5553 flags
= (process_data
) ? process_data
->fp_flags
: 0;
5554 fd
= (process_data
) ? process_data
->fp_fd
: -1;
5558 wait_result_t wait_result
;
5562 * Make a pass through the kq to find events already
5566 error
= kqueue_process(kq
, callback
, callback_data
,
5567 process_data
, &count
);
5569 break; /* lock still held */
5571 /* looks like we have to consider blocking */
5574 /* convert the timeout to a deadline once */
5575 if (atvp
->tv_sec
|| atvp
->tv_usec
) {
5578 clock_get_uptime(&now
);
5579 nanoseconds_to_absolutetime((uint64_t)atvp
->tv_sec
* NSEC_PER_SEC
+
5580 atvp
->tv_usec
* (long)NSEC_PER_USEC
,
5582 if (now
>= deadline
) {
5583 /* non-blocking call */
5584 error
= EWOULDBLOCK
;
5585 break; /* lock still held */
5588 clock_absolutetime_interval_to_deadline(deadline
, &deadline
);
5590 deadline
= 0; /* block forever */
5594 uthread_t ut
= (uthread_t
)get_bsdthread_info(current_thread());
5595 struct _kqueue_scan
*cont_args
= &ut
->uu_save
.uus_kqueue_scan
;
5597 cont_args
->call
= callback
;
5598 cont_args
->cont
= continuation
;
5599 cont_args
->deadline
= deadline
;
5600 cont_args
->data
= callback_data
;
5601 cont_args
->process_data
= process_data
;
5602 cont
= kqueue_scan_continue
;
5606 if (kq
->kq_state
& KQ_DRAIN
) {
5611 /* If awakened during processing, try again */
5612 if (kq
->kq_state
& KQ_WAKEUP
) {
5617 /* go ahead and wait */
5618 waitq_assert_wait64_leeway((struct waitq
*)&kq
->kq_wqs
,
5619 KQ_EVENT
, THREAD_ABORTSAFE
,
5620 TIMEOUT_URGENCY_USER_NORMAL
,
5621 deadline
, TIMEOUT_NO_LEEWAY
);
5622 kq
->kq_state
|= KQ_SLEEP
;
5624 wait_result
= thread_block_parameter(cont
, kq
);
5625 /* NOTREACHED if (continuation != NULL) */
5627 switch (wait_result
) {
5628 case THREAD_AWAKENED
:
5630 case THREAD_TIMED_OUT
:
5632 case THREAD_INTERRUPTED
:
5634 case THREAD_RESTART
:
5637 panic("%s: - bad wait_result (%d)", __func__
,
/*
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(__unused struct fileproc *fp,
	__unused struct uio *uio,
	__unused int flags,
	__unused vfs_context_t ctx)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_write(__unused struct fileproc *fp,
	__unused struct uio *uio,
	__unused int flags,
	__unused vfs_context_t ctx)
{
	return (ENXIO);
}

/*ARGSUSED*/
static int
kqueue_ioctl(__unused struct fileproc *fp,
	__unused u_long com,
	__unused caddr_t data,
	__unused vfs_context_t ctx)
{
	return (ENOTTY);
}
5683 kqueue_select(struct fileproc
*fp
, int which
, void *wq_link_id
,
5684 __unused vfs_context_t ctx
)
5686 struct kqueue
*kq
= (struct kqueue
*)fp
->f_data
;
5687 struct kqtailq
*queue
;
5688 struct kqtailq
*suppressq
;
5697 assert((kq
->kq_state
& KQ_WORKQ
) == 0);
5700 * If this is the first pass, link the wait queue associated with the
5701 * the kqueue onto the wait queue set for the select(). Normally we
5702 * use selrecord() for this, but it uses the wait queue within the
5703 * selinfo structure and we need to use the main one for the kqueue to
5704 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
5705 * (The select() call will unlink them when it ends).
5707 if (wq_link_id
!= NULL
) {
5708 thread_t cur_act
= current_thread();
5709 struct uthread
* ut
= get_bsdthread_info(cur_act
);
5711 kq
->kq_state
|= KQ_SEL
;
5712 waitq_link((struct waitq
*)&kq
->kq_wqs
, ut
->uu_wqset
,
5713 WAITQ_SHOULD_LOCK
, (uint64_t *)wq_link_id
);
5715 /* always consume the reserved link object */
5716 waitq_link_release(*(uint64_t *)wq_link_id
);
5717 *(uint64_t *)wq_link_id
= 0;
5720 * selprocess() is expecting that we send it back the waitq
5721 * that was just added to the thread's waitq set. In order
5722 * to not change the selrecord() API (which is exported to
5723 * kexts), we pass this value back through the
5724 * void *wq_link_id pointer we were passed. We need to use
5725 * memcpy here because the pointer may not be properly aligned
5726 * on 32-bit systems.
5728 void *wqptr
= &kq
->kq_wqs
;
5729 memcpy(wq_link_id
, (void *)&wqptr
, sizeof(void *));
5732 if (kqfile_begin_processing(kq
) == -1) {
5737 queue
= &kq
->kq_queue
[QOS_INDEX_KQFILE
];
5738 if (!TAILQ_EMPTY(queue
)) {
5740 * there is something queued - but it might be a
5741 * KN_STAYACTIVE knote, which may or may not have
5742 * any events pending. Otherwise, we have to walk
5743 * the list of knotes to see, and peek at the
5744 * (non-vanished) stay-active ones to be really sure.
5746 while ((kn
= (struct knote
*)TAILQ_FIRST(queue
)) != NULL
) {
5747 if (kn
->kn_status
& KN_ACTIVE
) {
5751 assert(kn
->kn_status
& KN_STAYACTIVE
);
5756 * There were no regular events on the queue, so take
5757 * a deeper look at the stay-queued ones we suppressed.
5759 suppressq
= kqueue_get_suppressed_queue(kq
, NULL
);
5760 while ((kn
= (struct knote
*)TAILQ_FIRST(suppressq
)) != NULL
) {
5761 KNOTE_LOCK_CTX(knlc
);
5764 /* If didn't vanish while suppressed - peek at it */
5765 if ((kn
->kn_status
& KN_DROPPING
) || !knote_lock(kq
, kn
, &knlc
,
5766 KNOTE_KQ_LOCK_ON_FAILURE
)) {
5770 result
= filter_call(knote_fops(kn
), f_peek(kn
));
5773 knote_unlock(kq
, kn
, &knlc
, KNOTE_KQ_LOCK_ALWAYS
);
5776 knote_unsuppress(kn
);
5778 /* has data or it has to report a vanish */
5779 if (result
& FILTER_ACTIVE
) {
5787 kqfile_end_processing(kq
);
/*ARGSUSED*/
static int
kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
{
	struct kqfile *kqf = (struct kqfile *)fg->fg_data;

	assert((kqf->kqf_state & KQ_WORKQ) == 0);
	kqueue_dealloc(&kqf->kqf_kqueue);
	fg->fg_data = NULL;
	return (0);
}

/*
 * Max depth of the nested kq path that can be created.
 * Note that this has to be less than the size of kq_level
 * to avoid wrapping around and mislabeling the level.
 */
#define MAX_NESTED_KQ 1000
5816 * The callers has taken a use-count reference on this kqueue and will donate it
5817 * to the kqueue we are being added to. This keeps the kqueue from closing until
5818 * that relationship is torn down.
5821 kqueue_kqfilter(__unused
struct fileproc
*fp
, struct knote
*kn
,
5822 __unused
struct kevent_internal_s
*kev
, __unused vfs_context_t ctx
)
5824 struct kqfile
*kqf
= (struct kqfile
*)kn
->kn_fp
->f_data
;
5825 struct kqueue
*kq
= &kqf
->kqf_kqueue
;
5826 struct kqueue
*parentkq
= knote_get_kq(kn
);
5827 uint16_t plevel
= 0;
5829 assert((kqf
->kqf_state
& KQ_WORKQ
) == 0);
5831 if (parentkq
== kq
|| kn
->kn_filter
!= EVFILT_READ
) {
5832 knote_set_error(kn
, EINVAL
);
5837 * We have to avoid creating a cycle when nesting kqueues
5838 * inside another. Rather than trying to walk the whole
5839 * potential DAG of nested kqueues, we just use a simple
5840 * ceiling protocol. When a kqueue is inserted into another,
5841 * we check that the (future) parent is not already nested
5842 * into another kqueue at a lower level than the potenial
5843 * child (because it could indicate a cycle). If that test
5844 * passes, we just mark the nesting levels accordingly.
5846 * Only up to MAX_NESTED_KQ can be nested.
5850 if (parentkq
->kq_level
> 0 &&
5851 parentkq
->kq_level
< kq
->kq_level
)
5854 knote_set_error(kn
, EINVAL
);
5857 /* set parent level appropriately */
5858 plevel
= (parentkq
->kq_level
== 0)? 2: parentkq
->kq_level
;
5859 if (plevel
< kq
->kq_level
+ 1) {
5860 if (kq
->kq_level
+ 1 > MAX_NESTED_KQ
) {
5862 knote_set_error(kn
, EINVAL
);
5865 plevel
= kq
->kq_level
+ 1;
5868 parentkq
->kq_level
= plevel
;
5871 kn
->kn_filtid
= EVFILTID_KQREAD
;
5873 KNOTE_ATTACH(&kqf
->kqf_sel
.si_note
, kn
);
5874 /* indicate nesting in child, if needed */
5875 if (kq
->kq_level
== 0)
5878 int count
= kq
->kq_count
;
/*
 * kqueue_drain - called when kq is closed
 */
/*ARGSUSED*/
static int
kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
{
	struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;

	assert((kq->kq_state & KQ_WORKQ) == 0);

	kqlock(kq);
	kq->kq_state |= KQ_DRAIN;
	kqueue_interrupt(kq);
	kqunlock(kq);
	return (0);
}
/*ARGSUSED*/
int
kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
{
	assert((kq->kq_state & KQ_WORKQ) == 0);

	kqlock(kq);
	if (isstat64 != 0) {
		struct stat64 *sb64 = (struct stat64 *)ub;

		bzero((void *)sb64, sizeof(*sb64));
		sb64->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS)
			sb64->st_blksize = sizeof(struct kevent_qos_s);
		else if (kq->kq_state & KQ_KEV64)
			sb64->st_blksize = sizeof(struct kevent64_s);
		else if (IS_64BIT_PROCESS(p))
			sb64->st_blksize = sizeof(struct user64_kevent);
		else
			sb64->st_blksize = sizeof(struct user32_kevent);
		sb64->st_mode = S_IFIFO;
	} else {
		struct stat *sb = (struct stat *)ub;

		bzero((void *)sb, sizeof(*sb));
		sb->st_size = kq->kq_count;
		if (kq->kq_state & KQ_KEV_QOS)
			sb->st_blksize = sizeof(struct kevent_qos_s);
		else if (kq->kq_state & KQ_KEV64)
			sb->st_blksize = sizeof(struct kevent64_s);
		else if (IS_64BIT_PROCESS(p))
			sb->st_blksize = sizeof(struct user64_kevent);
		else
			sb->st_blksize = sizeof(struct user32_kevent);
		sb->st_mode = S_IFIFO;
	}
	kqunlock(kq);
	return (0);
}
5943 * Interact with the pthread kext to request a servicing there at a specific QoS
5946 * - Caller holds the workq request lock
5948 * - May be called with the kqueue's wait queue set locked,
5949 * so cannot do anything that could recurse on that.
5952 kqueue_threadreq_initiate(struct kqueue
*kq
, struct kqrequest
*kqr
,
5953 kq_index_t qos
, int flags
)
5955 assert(kqr
->kqr_state
& KQR_WAKEUP
);
5956 assert(kqr
->kqr_thread
== THREAD_NULL
);
5957 assert((kqr
->kqr_state
& KQR_THREQUESTED
) == 0);
5958 struct turnstile
*ts
= TURNSTILE_NULL
;
5960 if (workq_is_exiting(kq
->kq_p
)) {
5964 /* Add a thread request reference on the kqueue. */
5969 if (kq
->kq_state
& KQ_WORKLOOP
) {
5970 __assert_only
struct kqworkloop
*kqwl
= (struct kqworkloop
*)kq
;
5972 assert(kqwl
->kqwl_owner
== THREAD_NULL
);
5973 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST
),
5974 kqwl
->kqwl_dynamicid
, 0, qos
, kqr
->kqr_state
);
5975 ts
= kqwl
->kqwl_turnstile
;
5977 assert(kq
->kq_state
& KQ_WORKQ
);
5978 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST
),
5979 -1, 0, qos
, kqr
->kqr_state
);
5982 kqr
->kqr_state
|= KQR_THREQUESTED
;
5985 * New-style thread request supported.
5986 * Provide the pthread kext a pointer to a workq_threadreq_s structure for
5987 * its use until a corresponding kqueue_threadreq_bind callback.
5990 if ((kq
->kq_state
& KQ_WORKLOOP
) && current_proc() == kq
->kq_p
) {
5991 flags
|= WORKQ_THREADREQ_SET_AST_ON_FAILURE
;
5994 if (qos
== KQWQ_QOS_MANAGER
) {
5995 qos
= WORKQ_THREAD_QOS_MANAGER
;
5997 if (!workq_kern_threadreq_initiate(kq
->kq_p
, kqr
, ts
, qos
, flags
)) {
5999 * Process is shutting down or exec'ing.
6000 * All the kqueues are going to be cleaned up
6001 * soon. Forget we even asked for a thread -
6002 * and make sure we don't ask for more.
6004 kqr
->kqr_state
&= ~(KQR_THREQUESTED
| KQR_R2K_NOTIF_ARMED
);
6005 kqueue_release(kq
, KQUEUE_CANT_BE_LAST_REF
);
6010 * kqueue_threadreq_bind_prepost - prepost the bind to kevent
6012 * This is used when kqueue_threadreq_bind may cause a lock inversion.
6015 kqueue_threadreq_bind_prepost(struct proc
*p __unused
, workq_threadreq_t req
,
6018 struct kqrequest
*kqr
= __container_of(req
, struct kqrequest
, kqr_req
);
6019 struct uthread
*ut
= get_bsdthread_info(thread
);
6021 req
->tr_binding_thread
= thread
;
6022 ut
->uu_kqr_bound
= kqr
;
6023 req
->tr_state
= TR_STATE_BINDING
;
6025 struct kqworkloop
*kqwl
= kqr_kqworkloop(kqr
);
6026 if (kqwl
&& kqwl
->kqwl_turnstile
) {
6027 struct turnstile
*ts
= kqwl
->kqwl_turnstile
;
6029 * While a thread request is in flight, the workqueue
6030 * is the interlock for the turnstile and can update the inheritor.
6032 turnstile_update_inheritor(ts
, thread
, TURNSTILE_IMMEDIATE_UPDATE
|
6033 TURNSTILE_INHERITOR_THREAD
);
6034 turnstile_update_inheritor_complete(ts
, TURNSTILE_INTERLOCK_HELD
);
6039 * kqueue_threadreq_bind_commit - commit a bind prepost
6041 * The workq code has to commit any binding prepost before the thread has
6042 * a chance to come back to userspace (and do kevent syscalls) or be aborted.
6045 kqueue_threadreq_bind_commit(struct proc
*p
, thread_t thread
)
6047 struct uthread
*ut
= get_bsdthread_info(thread
);
6048 struct kqrequest
*kqr
= ut
->uu_kqr_bound
;
6049 kqueue_t kqu
= kqr_kqueue(p
, kqr
);
6052 if (kqr
->kqr_req
.tr_state
== TR_STATE_BINDING
) {
6053 kqueue_threadreq_bind(p
, &kqr
->kqr_req
, thread
, 0);
6059 kqueue_threadreq_modify(struct kqueue
*kq
, struct kqrequest
*kqr
, kq_index_t qos
)
6061 assert(kqr
->kqr_state
& KQR_THREQUESTED
);
6062 assert(kqr
->kqr_thread
== THREAD_NULL
);
6068 if ((kq
->kq_state
& KQ_WORKLOOP
) && kq
->kq_p
== current_proc()) {
6069 flags
|= WORKQ_THREADREQ_SET_AST_ON_FAILURE
;
6072 workq_kern_threadreq_modify(kq
->kq_p
, kqr
, qos
, flags
);
6076 * kqueue_threadreq_bind - bind thread to processing kqrequest
6078 * The provided thread will be responsible for delivering events
6079 * associated with the given kqrequest. Bind it and get ready for
6080 * the thread to eventually arrive.
6083 kqueue_threadreq_bind(struct proc
*p
, workq_threadreq_t req
, thread_t thread
,
6086 struct kqrequest
*kqr
= __container_of(req
, struct kqrequest
, kqr_req
);
6087 kqueue_t kqu
= kqr_kqueue(p
, kqr
);
6088 struct uthread
*ut
= get_bsdthread_info(thread
);
6092 assert(kqr
->kqr_state
& KQR_THREQUESTED
);
6093 assert(kqr
->kqr_thread
== THREAD_NULL
);
6094 assert(ut
->uu_kqueue_override
== 0);
6096 if (kqr
->kqr_req
.tr_state
== TR_STATE_BINDING
) {
6097 assert(ut
->uu_kqr_bound
== kqr
);
6098 assert(kqr
->kqr_req
.tr_binding_thread
== thread
);
6099 kqr
->kqr_req
.tr_state
= TR_STATE_IDLE
;
6100 kqr
->kqr_req
.tr_binding_thread
= NULL
;
6102 assert(ut
->uu_kqr_bound
== NULL
);
6105 ut
->uu_kqr_bound
= kqr
;
6106 kqr
->kqr_thread
= thread
;
6108 if (kqu
.kq
->kq_state
& KQ_WORKLOOP
) {
6109 struct turnstile
*ts
= kqu
.kqwl
->kqwl_turnstile
;
6111 if (__improbable(thread
== kqu
.kqwl
->kqwl_owner
)) {
6113 * <rdar://problem/38626999> shows that asserting here is not ok.
6115 * This is not supposed to happen for correct use of the interface,
6116 * but it is sadly possible for userspace (with the help of memory
6117 * corruption, such as over-release of a dispatch queue) to make
6118 * the creator thread the "owner" of a workloop.
6120 * Once that happens, and that creator thread picks up the same
6121 * workloop as a servicer, we trip this codepath. We need to fixup
6122 * the state to forget about this thread being the owner, as the
6123 * entire workloop state machine expects servicers to never be
6124 * owners and everything would basically go downhill from here.
6126 kqu
.kqwl
->kqwl_owner
= THREAD_NULL
;
6127 if (kqworkloop_owner_override(kqu
.kqwl
)) {
6128 thread_drop_ipc_override(thread
);
6130 thread_ends_owning_workloop(thread
);
6133 if (ts
&& (flags
& KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE
) == 0) {
6135 * Past this point, the interlock is the kq req lock again,
6136 * so we can fix the inheritor for good.
6138 filt_wlupdate_inheritor(kqu
.kqwl
, ts
, TURNSTILE_IMMEDIATE_UPDATE
);
6139 turnstile_update_inheritor_complete(ts
, TURNSTILE_INTERLOCK_HELD
);
6142 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND
), kqu
.kqwl
->kqwl_dynamicid
,
6143 thread_tid(thread
), kqr
->kqr_qos_index
,
6144 (kqr
->kqr_override_index
<< 16) | kqr
->kqr_state
);
6146 ut
->uu_kqueue_override
= kqr
->kqr_override_index
;
6147 if (kqr
->kqr_override_index
) {
6148 thread_add_ipc_override(thread
, kqr
->kqr_override_index
);
6151 assert(kqr
->kqr_override_index
== 0);
6153 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND
), -1,
6154 thread_tid(thread
), kqr
->kqr_qos_index
,
6155 (kqr
->kqr_override_index
<< 16) | kqr
->kqr_state
);
/*
 * kqueue_threadreq_cancel - abort a pending thread request
 *
 * Called when exiting/exec'ing. Forget our pending request.
 */
void
kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	kqueue_t kqu = kqr_kqueue(p, kqr);

	kq_req_lock(kqu);

	assert(kqr->kqr_thread == THREAD_NULL);
	assert(kqr->kqr_state & KQR_THREQUESTED);
	kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);

	kq_req_unlock(kqu);

	kqueue_release_last(p, kqu); /* may dealloc kqu */
}
workq_threadreq_param_t
kqueue_threadreq_workloop_param(workq_threadreq_t req)
{
	struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
	struct kqworkloop *kqwl;
	workq_threadreq_param_t trp;

	assert(kqr->kqr_state & KQR_WORKLOOP);
	kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
	trp.trp_value = kqwl->kqwl_params;
	return trp;
}
/*
 *	kqueue_threadreq_unbind - unbind thread from processing kqueue
 *
 *	End processing the per-QoS bucket of events and allow other threads
 *	to be requested for future servicing.
 *
 *	caller holds a reference on the kqueue.
 */
void
kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr)
{
	if (kqr->kqr_state & KQR_WORKLOOP) {
		kqworkloop_unbind(p, kqr_kqworkloop(kqr));
	} else {
		kqworkq_unbind(p, kqr);
	}
}
/*
 * If we aren't already busy processing events [for this QoS],
 * request workq thread support as appropriate.
 *
 * TBD - for now, we don't segregate out processing by QoS.
 *
 * - May be called with the kqueue's wait queue set locked,
 *   so cannot do anything that could recurse on that.
 */
static void
kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index)
{
	struct kqrequest *kqr;

	/* convert to thread qos value */
	assert(qos_index < KQWQ_NBUCKETS);

	kq_req_lock(kqwq);
	kqr = kqworkq_get_request(kqwq, qos_index);

	if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
		kqr->kqr_state |= KQR_WAKEUP;
		if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
			kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
		}
	}
	kq_req_unlock(kqwq);
}
static kq_index_t
kqworkloop_owner_override(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;
	return MAX(kqr->kqr_qos_index, kqr->kqr_override_index);
}

static inline void
kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
{
	struct kqrequest *kqr = &kqwl->kqwl_request;

	kq_req_held(kqwl);

	if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
		assert(kqr->kqr_thread);
		kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
		act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
	}
}
6263 kqworkloop_update_threads_qos(struct kqworkloop
*kqwl
, int op
, kq_index_t qos
)
6265 struct kqrequest
*kqr
= &kqwl
->kqwl_request
;
6266 struct kqueue
*kq
= &kqwl
->kqwl_kqueue
;
6267 kq_index_t old_owner_override
= kqworkloop_owner_override(kqwl
);
6270 /* must hold the kqr lock */
6274 case KQWL_UTQ_UPDATE_WAKEUP_QOS
:
6275 if (qos
== KQWL_BUCKET_STAYACTIVE
) {
6277 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
6278 * a high watermark (kqr_stayactive_qos) of any stay active knote
6279 * that was ever registered with this workloop.
6281 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
6282 * knote, we use this high-watermark as a wakeup-index, and also set
6283 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
6284 * there is at least one stay active knote fired until the next full
6285 * processing of this bucket.
6287 kqr
->kqr_wakeup_indexes
|= KQWL_STAYACTIVE_FIRED_BIT
;
6288 qos
= kqr
->kqr_stayactive_qos
;
6291 if (kqr
->kqr_wakeup_indexes
& (1 << qos
)) {
6292 assert(kqr
->kqr_state
& KQR_WAKEUP
);
6296 kqr
->kqr_wakeup_indexes
|= (1 << qos
);
6297 kqr
->kqr_state
|= KQR_WAKEUP
;
6298 kqworkloop_request_fire_r2k_notification(kqwl
);
6301 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS
:
6303 if (kqr
->kqr_stayactive_qos
< qos
) {
6304 kqr
->kqr_stayactive_qos
= qos
;
6305 if (kqr
->kqr_wakeup_indexes
& KQWL_STAYACTIVE_FIRED_BIT
) {
6306 assert(kqr
->kqr_state
& KQR_WAKEUP
);
6307 kqr
->kqr_wakeup_indexes
|= (1 << qos
);
6313 case KQWL_UTQ_PARKING
:
6314 case KQWL_UTQ_UNBINDING
:
6315 kqr
->kqr_override_index
= qos
;
6317 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS
:
6318 if (op
== KQWL_UTQ_RECOMPUTE_WAKEUP_QOS
) {
6319 assert(qos
== THREAD_QOS_UNSPECIFIED
);
6321 kqlock_held(kqwl
); // to look at kq_queues
6322 i
= KQWL_BUCKET_STAYACTIVE
;
6323 if (TAILQ_EMPTY(&kqr
->kqr_suppressed
)) {
6324 kqr
->kqr_override_index
= THREAD_QOS_UNSPECIFIED
;
6326 if (!TAILQ_EMPTY(&kqwl
->kqwl_queue
[i
]) &&
6327 (kqr
->kqr_wakeup_indexes
& KQWL_STAYACTIVE_FIRED_BIT
)) {
6329 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
6330 * knote may have fired, so we need to merge in kqr_stayactive_qos.
6332 * Unlike other buckets, this one is never empty but could be idle.
6334 kqr
->kqr_wakeup_indexes
&= KQWL_STAYACTIVE_FIRED_BIT
;
6335 kqr
->kqr_wakeup_indexes
|= (1 << kqr
->kqr_stayactive_qos
);
6337 kqr
->kqr_wakeup_indexes
= 0;
6339 for (i
= THREAD_QOS_UNSPECIFIED
+ 1; i
< KQWL_BUCKET_STAYACTIVE
; i
++) {
6340 if (!TAILQ_EMPTY(&kqwl
->kqwl_queue
[i
])) {
6341 kqr
->kqr_wakeup_indexes
|= (1 << i
);
6344 if (kqr
->kqr_wakeup_indexes
) {
6345 kqr
->kqr_state
|= KQR_WAKEUP
;
6346 kqworkloop_request_fire_r2k_notification(kqwl
);
6348 kqr
->kqr_state
&= ~KQR_WAKEUP
;
6352 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE
:
6353 kqr
->kqr_override_index
= qos
;
6356 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE
:
6359 * When modifying the wakeup QoS or the override QoS, we always need to
6360 * maintain our invariant that kqr_override_index is at least as large
6361 * as the highest QoS for which an event is fired.
6363 * However this override index can be larger when there is an overriden
6364 * suppressed knote pushing on the kqueue.
6366 if (kqr
->kqr_wakeup_indexes
> (1 << qos
)) {
6367 qos
= fls(kqr
->kqr_wakeup_indexes
) - 1; /* fls is 1-based */
6369 if (kqr
->kqr_override_index
< qos
) {
6370 kqr
->kqr_override_index
= qos
;
6374 case KQWL_UTQ_REDRIVE_EVENTS
:
6377 case KQWL_UTQ_SET_QOS_INDEX
:
6378 kqr
->kqr_qos_index
= qos
;
6382 panic("unknown kqwl thread qos update operation: %d", op
);
6385 thread_t kqwl_owner
= kqwl
->kqwl_owner
;
6386 thread_t servicer
= kqr
->kqr_thread
;
6387 boolean_t qos_changed
= FALSE
;
6388 kq_index_t new_owner_override
= kqworkloop_owner_override(kqwl
);
6391 * Apply the diffs to the owner if applicable
6395 /* JMM - need new trace hooks for owner overrides */
6396 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST
),
6397 kqwl
->kqwl_dynamicid
, thread_tid(kqwl_owner
), kqr
->kqr_qos_index
,
6398 (kqr
->kqr_override_index
<< 16) | kqr
->kqr_state
);
6400 if (new_owner_override
== old_owner_override
) {
6402 } else if (old_owner_override
== THREAD_QOS_UNSPECIFIED
) {
6403 thread_add_ipc_override(kqwl_owner
, new_owner_override
);
6404 } else if (new_owner_override
== THREAD_QOS_UNSPECIFIED
) {
6405 thread_drop_ipc_override(kqwl_owner
);
6406 } else /* old_owner_override != new_owner_override */ {
6407 thread_update_ipc_override(kqwl_owner
, new_owner_override
);
6412 * apply the diffs to the servicer
6414 if ((kqr
->kqr_state
& KQR_THREQUESTED
) == 0) {
6416 * No servicer, nor thread-request
6418 * Make a new thread request, unless there is an owner (or the workloop
6419 * is suspended in userland) or if there is no asynchronous work in the
6423 if (kqwl_owner
== NULL
&& (kqr
->kqr_state
& KQR_WAKEUP
)) {
6424 int initiate_flags
= 0;
6425 if (op
== KQWL_UTQ_UNBINDING
) {
6426 initiate_flags
= WORKQ_THREADREQ_ATTEMPT_REBIND
;
6428 kqueue_threadreq_initiate(kq
, kqr
, new_owner_override
,
6431 } else if (servicer
) {
6433 * Servicer in flight
6435 * Just apply the diff to the servicer
6437 struct uthread
*ut
= get_bsdthread_info(servicer
);
6438 if (ut
->uu_kqueue_override
!= kqr
->kqr_override_index
) {
6439 if (ut
->uu_kqueue_override
== THREAD_QOS_UNSPECIFIED
) {
6440 thread_add_ipc_override(servicer
, kqr
->kqr_override_index
);
6441 } else if (kqr
->kqr_override_index
== THREAD_QOS_UNSPECIFIED
) {
6442 thread_drop_ipc_override(servicer
);
6443 } else /* ut->uu_kqueue_override != kqr->kqr_override_index */ {
6444 thread_update_ipc_override(servicer
, kqr
->kqr_override_index
);
6446 ut
->uu_kqueue_override
= kqr
->kqr_override_index
;
6449 } else if (new_owner_override
== THREAD_QOS_UNSPECIFIED
) {
6451 * No events to deliver anymore.
6453 * However canceling with turnstiles is challenging, so the fact that
6454 * the request isn't useful will be discovered by the servicer himself
6457 } else if (old_owner_override
!= new_owner_override
) {
6459 * Request is in flight
6461 * Apply the diff to the thread request
6463 kqueue_threadreq_modify(kq
, kqr
, new_owner_override
);
6468 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST
), kqwl
->kqwl_dynamicid
,
6469 thread_tid(kqr
->kqr_thread
), kqr
->kqr_qos_index
,
6470 (kqr
->kqr_override_index
<< 16) | kqr
->kqr_state
);
static void
kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
{
	/* convert to thread qos value */
	assert(qos_index < KQWL_NBUCKETS);

	kq_req_lock(kqwl);
	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
	kq_req_unlock(kqwl);
}
static struct kqtailq *
kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index)
{
	if (kq->kq_state & KQ_WORKQ) {
		assert(qos_index < KQWQ_NBUCKETS);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		assert(qos_index < KQWL_NBUCKETS);
	} else {
		assert(qos_index == QOS_INDEX_KQFILE);
	}
	static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
			"struct kqueue::kq_queue must be exactly at the end");
	return &kq->kq_queue[qos_index];
}

static int
kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
{
	return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index));
}

static struct kqtailq *
kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
{
	if (kq.kq->kq_state & KQ_WORKQ) {
		return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed;
	} else if (kq.kq->kq_state & KQ_WORKLOOP) {
		return &kq.kqwl->kqwl_request.kqr_suppressed;
	} else {
		return &kq.kqf->kqf_suppressed;
	}
}
*
6519 kqueue_get_turnstile(kqueue_t kqu
, bool can_alloc
)
6523 if ((kqu
.kq
->kq_state
& KQ_WORKLOOP
) == 0) {
6524 return TURNSTILE_NULL
;
6527 kqr_state
= os_atomic_load(&kqu
.kqwl
->kqwl_request
.kqr_state
, relaxed
);
6528 if (kqr_state
& KQR_ALLOCATED_TURNSTILE
) {
6529 /* force a dependency to pair with the atomic or with release below */
6530 return os_atomic_load_with_dependency_on(&kqu
.kqwl
->kqwl_turnstile
,
6535 return TURNSTILE_NULL
;
6538 struct turnstile
*ts
= turnstile_alloc(), *free_ts
= TURNSTILE_NULL
;
6541 if (filt_wlturnstile_interlock_is_workq(kqu
.kqwl
)) {
6542 workq_kern_threadreq_lock(kqu
.kqwl
->kqwl_p
);
6545 if (kqu
.kqwl
->kqwl_request
.kqr_state
& KQR_ALLOCATED_TURNSTILE
) {
6547 ts
= kqu
.kqwl
->kqwl_turnstile
;
6549 ts
= turnstile_prepare((uintptr_t)kqu
.kqwl
, &kqu
.kqwl
->kqwl_turnstile
,
6550 ts
, TURNSTILE_WORKLOOPS
);
6552 /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
6553 os_atomic_or(&kqu
.kqwl
->kqwl_request
.kqr_state
,
6554 KQR_ALLOCATED_TURNSTILE
, release
);
6557 if (filt_wlturnstile_interlock_is_workq(kqu
.kqwl
)) {
6558 workq_kern_threadreq_unlock(kqu
.kqwl
->kqwl_p
);
6560 kq_req_unlock(kqu
.kqwl
);
6563 turnstile_deallocate(free_ts
);
struct turnstile *
kqueue_turnstile(struct kqueue *kq)
{
	return kqueue_get_turnstile(kq, false);
}

struct turnstile *
kqueue_alloc_turnstile(struct kqueue *kq)
{
	return kqueue_get_turnstile(kq, true);
}

static struct kqtailq *
knote_get_queue(struct knote *kn)
{
	return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index);
}
static void
knote_reset_priority(struct knote *kn, pthread_priority_t pp)
{
	struct kqueue *kq = knote_get_kq(kn);
	kq_index_t qos = _pthread_priority_thread_qos(pp);

	assert((kn->kn_status & KN_QUEUED) == 0);

	if (kq->kq_state & KQ_WORKQ) {
		if (qos == THREAD_QOS_UNSPECIFIED) {
			/* On workqueues, outside of QoS means MANAGER */
			qos = KQWQ_QOS_MANAGER;
			pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
		} else {
			pp = _pthread_priority_normalize(pp);
		}
	} else if (kq->kq_state & KQ_WORKLOOP) {
		assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
		pp = _pthread_priority_normalize(pp);
	} else {
		pp = _pthread_unspecified_priority();
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_qos = pp;
	kn->kn_req_index = qos;

	if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
		/* Never lower QoS when in "Merge" mode */
		kn->kn_qos_override = qos;
	}

	/* only adjust in-use qos index when not suppressed */
	if ((kn->kn_status & KN_SUPPRESSED) == 0) {
		kn->kn_qos_index = qos;
	} else if (kq->kq_state & KQ_WORKQ) {
		kqworkq_update_override((struct kqworkq *)kq, kn, qos);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		kqworkloop_update_override((struct kqworkloop *)kq, qos);
	}
}
static void
knote_set_qos_overcommit(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);

	/* turn overcommit on for the appropriate thread request? */
	if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
			(kq->kq_state & KQ_WORKLOOP)) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;
		struct kqrequest *kqr = &kqwl->kqwl_request;

		/*
		 * This test is racy, but since we never remove this bit,
		 * it allows us to avoid taking a lock.
		 */
		if (kqr->kqr_state & KQR_THOVERCOMMIT) {
			return;
		}

		kq_req_lock(kqwl);
		kqr->kqr_state |= KQR_THOVERCOMMIT;
		if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) {
			kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos);
		}
		kq_req_unlock(kqwl);
	}
}

static kq_index_t
knote_get_qos_override_index(struct knote *kn)
{
	return kn->kn_qos_override;
}
static void
kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
		kq_index_t override_index)
{
	struct kqrequest *kqr;
	kq_index_t old_override_index;
	kq_index_t queue_index = kn->kn_qos_index;

	if (override_index <= queue_index) {
		return;
	}

	kqr = kqworkq_get_request(kqwq, queue_index);

	kq_req_lock(kqwq);
	old_override_index = kqr->kqr_override_index;
	if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
		kqr->kqr_override_index = override_index;

		/* apply the override to [incoming?] servicing thread */
		if (kqr->kqr_thread) {
			if (old_override_index)
				thread_update_ipc_override(kqr->kqr_thread, override_index);
			else
				thread_add_ipc_override(kqr->kqr_thread, override_index);
		}
	}
	kq_req_unlock(kqwq);
}

static void
kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index)
{
	kq_req_lock(kqwl);
	kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
			override_index);
	kq_req_unlock(kqwl);
}
static kq_index_t
kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	struct kqrequest *kqr = &kqwl->kqwl_request;
	kq_index_t ipc_override = ut->uu_kqueue_override;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
			thread_tid(thread), 0, 0);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;

	if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
		turnstile_update_inheritor(kqwl->kqwl_turnstile,
				TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
		turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
				TURNSTILE_INTERLOCK_HELD);
	}

	kqr->kqr_thread = NULL;
	kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
	return ipc_override;
}
/*
 * kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
 *
 * It will acknowledge events, and possibly request a new thread if:
 * - there were active events left
 * - we pended waitq hook callouts during processing
 * - we pended wakeups while processing (or unsuppressing)
 *
 * Called with kqueue lock held.
 */
static void
kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl)
{
	struct kqueue *kq = &kqwl->kqwl_kqueue;
	struct kqrequest *kqr = &kqwl->kqwl_request;
	thread_t thread = kqr->kqr_thread;
	int op = KQWL_UTQ_PARKING;
	kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED;

	assert(thread == current_thread());

	/*
	 * Forcing the KQ_PROCESSING flag allows for QoS updates because of
	 * unsuppressing knotes not to be applied until the eventual call to
	 * kqworkloop_update_threads_qos() below.
	 */
	assert((kq->kq_state & KQ_PROCESSING) == 0);
	if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
		kq->kq_state |= KQ_PROCESSING;
		qos_override = kqworkloop_acknowledge_events(kqwl);
		kq->kq_state &= ~KQ_PROCESSING;
	}

	ipc_override = kqworkloop_unbind_locked(kqwl, thread);
	kqworkloop_update_threads_qos(kqwl, op, qos_override);

	kq_req_unlock(kqwl);

	/*
	 * Drop the override on the current thread last, after the call to
	 * kqworkloop_update_threads_qos above.
	 */
	thread_drop_ipc_override(thread);

	/* If last reference, dealloc the workloop kq */
	kqueue_release_last(p, kqwl);
}
static kq_index_t
kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq,
		struct kqrequest *kqr, thread_t thread)
{
	struct uthread *ut = get_bsdthread_info(thread);
	kq_index_t old_override = kqr->kqr_override_index;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
			thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, 0);

	assert(ut->uu_kqr_bound == kqr);
	ut->uu_kqr_bound = NULL;
	kqr->kqr_thread = NULL;
	kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
	kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;

	return old_override;
}
/*
 * kqworkq_unbind - unbind of a workq kqueue from a thread
 *
 * We may have to request new threads.
 * This can happen when there are no waiting processing threads and:
 * - there were active events we never got to (count > 0)
 * - we pended waitq hook callouts during processing
 * - we pended wakeups while processing (or unsuppressing)
 */
static void
kqworkq_unbind(proc_t p, struct kqrequest *kqr)
{
	struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
	__assert_only int rc;

	rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
}

static struct kqrequest *
kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
{
	assert(qos_index < KQWQ_NBUCKETS);
	return &kqwq->kqwq_request[qos_index];
}
static void
knote_apply_qos_override(struct knote *kn, kq_index_t qos_index)
{
	assert((kn->kn_status & KN_QUEUED) == 0);

	kn->kn_qos_override = qos_index;

	if (kn->kn_status & KN_SUPPRESSED) {
		struct kqueue *kq = knote_get_kq(kn);
		/*
		 * For suppressed events, the kn_qos_index field cannot be touched as it
		 * allows us to know on which suppress queue the knote is for a kqworkq.
		 *
		 * Also, there's no natural push applied on the kqueues when this field
		 * changes anyway. We hence need to apply manual overrides in this case,
		 * which will be cleared when the events are later acknowledged.
		 */
		if (kq->kq_state & KQ_WORKQ) {
			kqworkq_update_override((struct kqworkq *)kq, kn, qos_index);
		} else {
			kqworkloop_update_override((struct kqworkloop *)kq, qos_index);
		}
	} else {
		kn->kn_qos_index = qos_index;
	}
}
static bool
knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, int result,
		thread_qos_t *qos_out)
{
	thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;

	assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
	assert(qos_index < THREAD_QOS_LAST);

	/*
	 * Early exit for knotes that should not change QoS
	 *
	 * It is safe to test kn_req_index against MANAGER / STAYACTIVE because
	 * knotes with such kn_req_index values never change for their entire
	 * lifetime.
	 */
	if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
		panic("filter %d cannot change QoS", kn->kn_filtid);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) {
			return false;
		}
	} else if (kq->kq_state & KQ_WORKQ) {
		if (kn->kn_req_index == KQWQ_QOS_MANAGER) {
			return false;
		}
	}

	/*
	 * knotes with the FALLBACK flag will only use their registration QoS if the
	 * incoming event has no QoS, else, the registration QoS acts as a floor.
	 */
	if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
		if (qos_index == THREAD_QOS_UNSPECIFIED)
			qos_index = kn->kn_req_index;
	} else {
		if (qos_index < kn->kn_req_index)
			qos_index = kn->kn_req_index;
	}

	if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
		/* Never lower QoS when in "Merge" mode */
		return false;
	}

	if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) {
		/*
		 * When we're trying to update the QoS override and both an
		 * f_event() and other f_* calls are running concurrently, any of these
		 * in flight calls may want to perform overrides that aren't properly
		 * serialized with each other.
		 *
		 * The first update that observes this racy situation enters a "Merge"
		 * mode which causes subsequent override requests to saturate the
		 * override instead of replacing its value.
		 *
		 * This mode is left when knote_unlock() or knote_call_filter_event()
		 * observe that no other f_* routine is in flight.
		 */
		kn->kn_status |= KN_MERGE_QOS;
	}

	if (kn->kn_qos_override == qos_index) {
		return false;
	}

	*qos_out = qos_index;
	return true;
}
static void
knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
{
	thread_qos_t qos;

	if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
		knote_dequeue(kn);
		knote_apply_qos_override(kn, qos);
		if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
			knote_wakeup(kn);
		}
	}
}
static void
knote_wakeup(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);

	if (kq->kq_state & KQ_WORKQ) {
		struct kqworkq *kqwq = (struct kqworkq *)kq;

		kqworkq_request_help(kqwq, kn->kn_qos_index);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		/*
		 * kqworkloop_end_processing() will perform the required QoS
		 * computations when it unsets the processing mode.
		 */
		if (!kqworkloop_is_processing_on_current_thread(kqwl)) {
			kqworkloop_request_help(kqwl, kn->kn_qos_index);
		}
	} else {
		struct kqfile *kqf = (struct kqfile *)kq;

		/* flag wakeups during processing */
		if (kq->kq_state & KQ_PROCESSING)
			kq->kq_state |= KQ_WAKEUP;

		/* wakeup a thread waiting on this queue */
		if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
			kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
			waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
					THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
		}

		/* wakeup other kqueues/select sets we're inside */
		KNOTE(&kqf->kqf_sel.si_note, 0);
	}
}
/*
 * Called with the kqueue locked
 */
void
kqueue_interrupt(struct kqueue *kq)
{
	assert((kq->kq_state & KQ_WORKQ) == 0);

	/* wakeup sleeping threads */
	if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
		kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
		(void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
				KQ_EVENT,
				THREAD_RESTART,
				WAITQ_ALL_PRIORITIES);
	}

	/* wakeup threads waiting their turn to process */
	if (kq->kq_state & KQ_PROCWAIT) {
		struct kqtailq *suppressq;

		assert(kq->kq_state & KQ_PROCESSING);

		kq->kq_state &= ~KQ_PROCWAIT;
		suppressq = kqueue_get_suppressed_queue(kq, NULL);
		(void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
				CAST_EVENT64_T(suppressq),
				THREAD_RESTART,
				WAITQ_ALL_PRIORITIES);
	}
}
/*
 * Called back from waitq code when no threads waiting and the hook was set.
 *
 * Interrupts are likely disabled and spin locks are held - minimal work
 * can be done in this context!!!
 *
 * JMM - in the future, this will try to determine which knotes match the
 * wait queue wakeup and apply these wakeups against those knotes themselves.
 * For now, all the events dispatched this way are dispatch-manager handled,
 * so hard-code that for now.
 */
void
waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
{
#pragma unused(knote_hook, qos)

	struct kqueue *kq = (struct kqueue *)kq_hook;

	if (kq->kq_state & KQ_WORKQ) {
		struct kqworkq *kqwq = (struct kqworkq *)kq;

		kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
	} else if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE);
	}
}
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}

/*
 * Query/Post each knote in the object's list
 *
 * The object lock protects the list. It is assumed
 * that the filter/event routine for the object can
 * determine that the object is already locked (via
 * the hint) and not deadlock itself.
 *
 * The object lock should also hold off pending
 * detach/drop operations.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_selnext) {
		struct kqueue *kq = knote_get_kq(kn);
		kqlock(kq);
		knote_call_filter_event(kq, kn, hint);
		kqunlock(kq);
	}
}

/*
 * attach a knote to the specified list.  Return true if this is the first entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_attach(struct klist *list, struct knote *kn)
{
	int ret = SLIST_EMPTY(list);
	SLIST_INSERT_HEAD(list, kn, kn_selnext);
	return (ret);
}

/*
 * detach a knote from the specified list.  Return true if that was the last entry.
 * The list is protected by whatever lock the object it is associated with uses.
 */
int
knote_detach(struct klist *list, struct knote *kn)
{
	SLIST_REMOVE(list, kn, knote, kn_selnext);
	return (SLIST_EMPTY(list));
}
/*
 * knote_vanish - Indicate that the source has vanished
 *
 * If the knote has requested EV_VANISHED delivery,
 * arrange for that. Otherwise, deliver a NOTE_REVOKE
 * event for backward compatibility.
 *
 * The knote is marked as having vanished, but is not
 * actually detached from the source in this instance.
 * The actual detach is deferred until the knote drop.
 *
 * Our caller already has the object lock held. Calling
 * the detach routine would try to take that lock
 * recursively - which likely is not supported.
 */
void
knote_vanish(struct klist *list)
{
	struct knote *kn;
	struct knote *kn_next;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
		struct kqueue *kq = knote_get_kq(kn);

		if (kn->kn_status & KN_REQVANISH) {
			/* If EV_VANISH supported - prepare to deliver one */
			kn->kn_status |= KN_VANISHED;
			knote_activate(kn);
		} else {
			knote_call_filter_event(kq, kn, NOTE_REVOKE);
		}
	}
}
/*
 * Force a lazy allocation of the waitqset link
 * of the kq_wqs associated with the kn
 * if it wasn't already allocated.
 *
 * This allows knote_link_waitq to never block
 * if reserved_link is not NULL.
 */
void
knote_link_waitqset_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	waitq_set_lazy_init_link(&kq->kq_wqs);
}

/*
 * Check if a lazy allocation for the waitqset link
 * of the kq_wqs is needed.
 */
boolean_t
knote_link_waitqset_should_lazy_alloc(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	return waitq_set_should_lazy_init_link(&kq->kq_wqs);
}
/*
 * For a given knote, link a provided wait queue directly with the kqueue.
 * Wakeups will happen via recursive wait queue support. But nothing will move
 * the knote to the active list at wakeup (nothing calls knote()). Instead,
 * we permanently enqueue them here.
 *
 * kqueue and knote references are held by caller.
 * waitq locked by caller.
 *
 * caller provides the wait queue link structure and ensures that the kq->kq_wqs
 * is linked by previously calling knote_link_waitqset_lazy_alloc.
 */
int
knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
	if (kr == KERN_SUCCESS) {
		knote_markstayactive(kn);
		return (0);
	} else {
		return (ENOMEM);
	}
}

/*
 * Unlink the provided wait queue from the kqueue associated with a knote.
 * Also remove it from the magic list of directly attached knotes.
 *
 * Note that the unlink may have already happened from the other side, so
 * ignore any failures to unlink and just remove it from the kqueue list.
 *
 * On success, caller is responsible for the link structure
 */
int
knote_unlink_waitq(struct knote *kn, struct waitq *wq)
{
	struct kqueue *kq = knote_get_kq(kn);
	kern_return_t kr;

	kr = waitq_unlink(wq, &kq->kq_wqs);
	knote_clearstayactive(kn);
	return ((kr != KERN_SUCCESS) ? EINVAL : 0);
}
/*
 * remove all knotes referencing a specified fd
 *
 * Entered with the proc_fd lock already held.
 * It returns the same way, but may drop it temporarily.
 */
void
knote_fdclose(struct proc *p, int fd)
{
	struct klist *list;
	struct knote *kn;
	KNOTE_LOCK_CTX(knlc);

	list = &p->p_fd->fd_knlist[fd];
	SLIST_FOREACH(kn, list, kn_link) {
		struct kqueue *kq = knote_get_kq(kn);

		if (kq->kq_p != p) {
			panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
					__func__, kq->kq_p, p);
		}

		/*
		 * If the knote supports EV_VANISHED delivery,
		 * transition it to vanished mode (or skip over
		 * it if already vanished).
		 */
		if (kn->kn_status & KN_VANISHED) {
			continue;
		}

		if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
			/* the knote was dropped by someone, nothing to do */
		} else if (kn->kn_status & KN_REQVANISH) {
			kn->kn_status |= KN_VANISHED;
			kn->kn_status &= ~KN_ATTACHED;

			knote_fops(kn)->f_detach(kn);
			if (knote_fops(kn)->f_isfd)
				fp_drop(p, kn->kn_id, kn->kn_fp, 0);

			knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
		} else {
			knote_drop(kq, kn, &knlc);
		}
	}
}
/*
 * knote_fdfind - lookup a knote in the fd table for process
 *
 * If the filter is file-based, lookup based on fd index.
 * Otherwise use a hash based on the ident.
 *
 * Matching is based on kq, filter, and ident. Optionally,
 * it may also be based on the udata field in the kevent -
 * allowing multiple event registration for the file object
 * per kqueue.
 *
 * fd_knhashlock or fdlock held on entry (and exit)
 */
static struct knote *
knote_fdfind(struct kqueue *kq,
		struct kevent_internal_s *kev,
		bool is_fd,
		struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	struct knote *kn = NULL;

	/*
	 * determine where to look for the knote
	 */
	if (is_fd) {
		/* fd-based knotes are linked off the fd table */
		if (kev->ident < (u_int)fdp->fd_knlistsize) {
			list = &fdp->fd_knlist[kev->ident];
		}
	} else if (fdp->fd_knhashmask != 0) {
		/* hash non-fd knotes here too */
		list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
	}

	/*
	 * scan the selected list looking for a match
	 */
	if (list != NULL) {
		SLIST_FOREACH(kn, list, kn_link) {
			if (kq == knote_get_kq(kn) &&
			    kev->ident == kn->kn_id &&
			    kev->filter == kn->kn_filter) {
				if (kev->flags & EV_UDATA_SPECIFIC) {
					if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
					    kev->udata == kn->kn_udata) {
						break;	/* matching udata-specific knote */
					}
				} else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
					break;	/* matching non-udata-specific knote */
				}
			}
		}
	}
	return kn;
}
/*
 * kq_add_knote - Add knote to the fd table for process
 * while checking for duplicates.
 *
 * All file-based filters associate a list of knotes by file
 * descriptor index. All other filters hash the knote by ident.
 *
 * May have to grow the table of knote lists to cover the
 * file descriptor index presented.
 *
 * fd_knhashlock and fdlock unheld on entry (and exit).
 *
 * Takes a rwlock boost if inserting the knote is successful.
 */
static int
kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
		struct proc *p)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	int ret = 0;
	bool is_fd = knote_fops(kn)->f_isfd;

	if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
		/* found an existing knote: we can't add this one */
		ret = ERESTART;
		goto out_locked;
	}

	/* knote was not found: add it now */
	if (!is_fd) {
		if (fdp->fd_knhashmask == 0) {
			u_long size = 0;

			list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			fdp->fd_knhash = list;
			fdp->fd_knhashmask = size;
		}

		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
		SLIST_INSERT_HEAD(list, kn, kn_link);
	} else {
		/* knote is fd based */

		if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
			u_int size = 0;

			if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
			    || kn->kn_id >= (uint64_t)maxfiles) {
				ret = EINVAL;
				goto out_locked;
			}
			/* have to grow the fd_knlist */
			size = fdp->fd_knlistsize;
			while (size <= kn->kn_id)
				size += KQEXTENT;

			if (size >= (UINT_MAX/sizeof(struct klist *))) {
				ret = EINVAL;
				goto out_locked;
			}

			MALLOC(list, struct klist *,
			       size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
			if (list == NULL) {
				ret = ENOMEM;
				goto out_locked;
			}

			bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
			      fdp->fd_knlistsize * sizeof(struct klist *));
			bzero((caddr_t)list +
			      fdp->fd_knlistsize * sizeof(struct klist *),
			      (size - fdp->fd_knlistsize) * sizeof(struct klist *));
			FREE(fdp->fd_knlist, M_KQUEUE);
			fdp->fd_knlist = list;
			fdp->fd_knlistsize = size;
		}

		list = &fdp->fd_knlist[kn->kn_id];
		SLIST_INSERT_HEAD(list, kn, kn_link);
	}

out_locked:
	if (ret == 0) {
		assert((kn->kn_status & KN_LOCKED) == 0);
		(void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
	}
	return ret;
}
/*
 * kq_remove_knote - remove a knote from the fd table for process
 *
 * If the filter is file-based, remove based on fd index.
 * Otherwise remove from the hash based on the ident.
 *
 * fd_knhashlock and fdlock unheld on entry (and exit).
 */
static void
kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
		struct knote_lock_ctx *knlc)
{
	struct filedesc *fdp = p->p_fd;
	struct klist *list = NULL;
	uint16_t kq_state;
	bool is_fd;

	is_fd = knote_fops(kn)->f_isfd;

	if (is_fd) {
		assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
		list = &fdp->fd_knlist[kn->kn_id];
	} else {
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
	}
	SLIST_REMOVE(list, kn, knote, kn_link);

	kq_state = kq->kq_state;
	knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK);

	if (kq_state & KQ_DYNAMIC)
		kqueue_release_last(p, kq);
}
/*
 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
 *
 * fd_knhashlock or fdlock unheld on entry (and exit)
 */
static struct knote *
kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev,
		bool is_fd, struct proc *p)
{
	struct knote *ret;

	ret = knote_fdfind(kq, kev, is_fd, p);
	if (ret) {
		kqlock(kq);
	}
	return ret;
}
/*
 * knote_drop - disconnect and drop the knote
 *
 * Called with the kqueue locked, returns with the kqueue unlocked.
 *
 * If a knote locking context is passed, it is canceled.
 *
 * The knote may have already been detached from
 * (or not yet attached to) its source object.
 */
static void
knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
{
	struct proc *p = kq->kq_p;

	assert((kn->kn_status & KN_DROPPING) == 0);
	assert((kn->kn_status & KN_LOCKED) == 0);
	kn->kn_status |= KN_DROPPING;

	knote_unsuppress(kn);
	knote_wait_for_filter_events(kq, kn);

	/* If we are attached, disconnect from the source first */
	if (kn->kn_status & KN_ATTACHED) {
		knote_fops(kn)->f_detach(kn);
	}

	/* kq may be freed when kq_remove_knote() returns */
	kq_remove_knote(kq, kn, p, knlc);
	if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
		fp_drop(p, kn->kn_id, kn->kn_fp, 0);

	knote_free(kn);
}
/* called with kqueue lock held */
static void
knote_activate(struct knote *kn)
{
	if (kn->kn_status & KN_ACTIVE)
		return;

	KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
			kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
			kn->kn_filtid);

	kn->kn_status |= KN_ACTIVE;
	if (knote_enqueue(kn))
		knote_wakeup(kn);
}

/* called with kqueue lock held */
static void
knote_deactivate(struct knote *kn)
{
	kn->kn_status &= ~KN_ACTIVE;
	if ((kn->kn_status & KN_STAYACTIVE) == 0)
		knote_dequeue(kn);
}

/* called with kqueue lock held */
static void
knote_enable(struct knote *kn)
{
	if ((kn->kn_status & KN_DISABLED) == 0)
		return;

	kn->kn_status &= ~KN_DISABLED;

	if (kn->kn_status & KN_SUPPRESSED) {
		/*
		 * it is possible for userland to have knotes registered for a given
		 * workloop `wl_orig` but really handled on another workloop `wl_new`.
		 *
		 * In that case, rearming will happen from the servicer thread of
		 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
		 * this knote to stay suppressed forever if we only relied on
		 * kqworkloop_acknowledge_events to be called by `wl_orig`.
		 *
		 * However if we see the KQ_PROCESSING bit on `wl_orig` set, we can't
		 * unsuppress because that would mess with the processing phase of
		 * `wl_orig`, however it also means kqworkloop_acknowledge_events()
		 * will be called.
		 */
		struct kqueue *kq = knote_get_kq(kn);
		if ((kq->kq_state & KQ_PROCESSING) == 0) {
			knote_unsuppress(kn);
		}
	} else if (knote_enqueue(kn)) {
		knote_wakeup(kn);
	}
}

/* called with kqueue lock held */
static void
knote_disable(struct knote *kn)
{
	if (kn->kn_status & KN_DISABLED)
		return;

	kn->kn_status |= KN_DISABLED;
}
/* called with kqueue lock held */
static void
knote_suppress(struct knote *kn)
{
	struct kqtailq *suppressq;
	struct kqueue *kq = knote_get_kq(kn);

	if (kn->kn_status & KN_SUPPRESSED)
		return;

	knote_dequeue(kn);
	kn->kn_status |= KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kq, kn);
	TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
}

/* called with kqueue lock held */
static void
knote_unsuppress(struct knote *kn)
{
	struct kqtailq *suppressq;
	struct kqueue *kq = knote_get_kq(kn);

	if ((kn->kn_status & KN_SUPPRESSED) == 0)
		return;

	kn->kn_status &= ~KN_SUPPRESSED;
	suppressq = kqueue_get_suppressed_queue(kq, kn);
	TAILQ_REMOVE(suppressq, kn, kn_tqe);

	/*
	 * If the knote is no longer active, reset its push,
	 * and resynchronize kn_qos_index with kn_qos_override
	 */
	if ((kn->kn_status & KN_ACTIVE) == 0) {
		kn->kn_qos_override = kn->kn_req_index;
	}
	kn->kn_qos_index = kn->kn_qos_override;

	/* don't wakeup if unsuppressing just a stay-active knote */
	if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
		knote_wakeup(kn);
	}

	if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		if (kqworkloop_is_processing_on_current_thread(kqwl)) {
			/*
			 * kqworkloop_end_processing() or kqworkloop_begin_processing()
			 * will perform the required QoS computations when it unsets the
			 * processing mode.
			 */
		} else {
			kq_req_lock(kqwl);
			kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0);
			kq_req_unlock(kqwl);
		}
	}
}
/* called with kqueue lock held */
static int
knote_enqueue(struct knote *kn)
{
	if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
	    (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)))
		return 0;

	if ((kn->kn_status & KN_QUEUED) == 0) {
		struct kqtailq *queue = knote_get_queue(kn);
		struct kqueue *kq = knote_get_kq(kn);

		TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
		kn->kn_status |= KN_QUEUED;
		kq->kq_count++;
		return 1;
	}
	return ((kn->kn_status & KN_STAYACTIVE) != 0);
}

/* called with kqueue lock held */
static void
knote_dequeue(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	struct kqtailq *queue;

	if ((kn->kn_status & KN_QUEUED) == 0)
		return;

	queue = knote_get_queue(kn);
	TAILQ_REMOVE(queue, kn, kn_tqe);
	kn->kn_status &= ~KN_QUEUED;
	kq->kq_count--;
}
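
/*
 * Added commentary (derived from the routines in this file): summary of the
 * knote queueing state machine implemented above.
 *
 *   knote_activate()   sets KN_ACTIVE and tries knote_enqueue()
 *   knote_enqueue()    only queues knotes that are active (or stay-active)
 *                      and neither disabled, suppressed, nor dropping
 *   knote_suppress()   moves a knote being processed onto the per-kqueue
 *                      suppression queue so it is not redelivered
 *   knote_unsuppress() puts it back, resynchronizing kn_qos_index with the
 *                      override, and re-enqueues/wakes up if still active
 *   knote_dequeue()    removes it from its QoS bucket queue (clears KN_QUEUED)
 */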
static void
knote_init(void)
{
	knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
			8192, "knote zone");

	kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile),
			8192, "kqueue file zone");

	kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq),
			8192, "kqueue workq zone");

	kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192*sizeof(struct kqworkloop),
			8192, "kqueue workloop zone");

	/* allocate kq lock group attribute and group */
	kq_lck_grp_attr = lck_grp_attr_alloc_init();

	kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);

	/* Allocate kq lock attribute */
	kq_lck_attr = lck_attr_alloc_init();

#if CONFIG_MEMORYSTATUS
	/* Initialize the memorystatus list lock */
	memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
const struct filterops *
knote_fops(struct knote *kn)
{
	return sysfilt_ops[kn->kn_filtid];
}

static struct knote *
knote_alloc(void)
{
	struct knote *kn = ((struct knote *)zalloc(knote_zone));
	bzero(kn, sizeof(struct knote));
	return kn;
}

static void
knote_free(struct knote *kn)
{
	assert(kn->kn_inuse == 0);
	assert((kn->kn_status & KN_LOCKED) == 0);
	zfree(knote_zone, kn);
}
#if SOCKETS
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/kern_event.h>
#include <sys/malloc.h>
#include <sys/sys_domain.h>
#include <sys/syslog.h>

#define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))

#define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
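
/*
 * Added note: ROUNDUP64() rounds a byte count up to the next multiple of
 * sizeof(u_int64_t), e.g. ROUNDUP64(12) == 16 and ROUNDUP64(16) == 16, and
 * ADVANCE64(p, n) returns p advanced by that rounded amount.  kevt_pcblist()
 * below relies on this to lay out several 64-bit aligned structures
 * back-to-back in a single temporary buffer.
 */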
static lck_grp_attr_t *kev_lck_grp_attr;
static lck_attr_t *kev_lck_attr;
static lck_grp_t *kev_lck_grp;
static decl_lck_rw_data(,kev_lck_data);
static lck_rw_t *kev_rwlock = &kev_lck_data;

static int kev_attach(struct socket *so, int proto, struct proc *p);
static int kev_detach(struct socket *so);
static int kev_control(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p);
static lck_mtx_t * event_getlock(struct socket *, int);
static int event_lock(struct socket *, int, void *);
static int event_unlock(struct socket *, int, void *);

static int event_sofreelastref(struct socket *);
static void kev_delete(struct kern_event_pcb *);

static struct pr_usrreqs event_usrreqs = {
	.pru_attach =		kev_attach,
	.pru_control =		kev_control,
	.pru_detach =		kev_detach,
	.pru_soreceive =	soreceive,
};

static struct protosw eventsw[] = {
{
	.pr_type =		SOCK_RAW,
	.pr_protocol =		SYSPROTO_EVENT,
	.pr_flags =		PR_ATOMIC,
	.pr_usrreqs =		&event_usrreqs,
	.pr_lock =		event_lock,
	.pr_unlock =		event_unlock,
	.pr_getlock =		event_getlock,
}
};

__private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
__private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;

SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
	CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");

struct kevtstat kevtstat;
SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_getstat, "S,kevtstat", "");

SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
    kevt_pcblist, "S,xkevtpcb", "");
static lck_mtx_t *
event_getlock(struct socket *so, int flags)
{
#pragma unused(flags)
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	if (so->so_pcb != NULL) {
		if (so->so_usecount < 0)
			panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
			    so, so->so_usecount, solockhistory_nr(so));
	} else {
		panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
		    so, solockhistory_nr(so));
	}

	return (&ev_pcb->evp_mtx);
}

static int
event_lock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (so->so_pcb != NULL) {
		lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
	} else {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    so, lr_saved, solockhistory_nr(so));
	}

	if (so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
		    so, so->so_pcb, lr_saved, so->so_usecount,
		    solockhistory_nr(so));
	}

	if (refcount)
		so->so_usecount++;

	so->lock_lr[so->next_lock_lr] = lr_saved;
	so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	return (0);
}
static int
event_unlock(struct socket *so, int refcount, void *lr)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (refcount)
		so->so_usecount--;

	if (so->so_usecount < 0) {
		panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
		    so, so->so_usecount, solockhistory_nr(so));
	}
	if (so->so_pcb == NULL) {
		panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
		    so, so->so_usecount, (void *)lr_saved,
		    solockhistory_nr(so));
	}
	mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->unlock_lr[so->next_unlock_lr] = lr_saved;
	so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

	if (so->so_usecount == 0) {
		VERIFY(so->so_flags & SOF_PCBCLEARING);
		event_sofreelastref(so);
	} else {
		lck_mtx_unlock(mutex_held);
	}

	return (0);
}
static int
event_sofreelastref(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);

	/*
	 * Disable upcall in the event another thread is in kev_post_msg()
	 * appending record to the receive socket buffer, since sbwakeup()
	 * may release the socket lock otherwise.
	 */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = sonullevent;
	lck_mtx_unlock(&(ev_pcb->evp_mtx));

	LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
	lck_rw_lock_exclusive(kev_rwlock);
	LIST_REMOVE(ev_pcb, evp_link);
	kevtstat.kes_pcbcount--;
	kevtstat.kes_gencnt++;
	lck_rw_done(kev_rwlock);
	kev_delete(ev_pcb);

	sofreelastref(so, 1);
	return (0);
}

static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));

struct kern_event_head kern_event_head;

static u_int32_t static_event_id = 0;

#define EVPCB_ZONE_MAX		65536
#define EVPCB_ZONE_NAME		"kerneventpcb"
static struct zone *ev_pcb_zone;
/*
 * Install the protosw's for the NKE manager. Invoked at extension load time
 */
void
kern_event_init(struct domain *dp)
{
	struct protosw *pr;
	int i;

	VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
	VERIFY(dp == systemdomain);

	kev_lck_grp_attr = lck_grp_attr_alloc_init();
	if (kev_lck_grp_attr == NULL) {
		panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
	    kev_lck_grp_attr);
	if (kev_lck_grp == NULL) {
		panic("%s: lck_grp_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	kev_lck_attr = lck_attr_alloc_init();
	if (kev_lck_attr == NULL) {
		panic("%s: lck_attr_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
	if (kev_rwlock == NULL) {
		panic("%s: lck_mtx_alloc_init failed\n", __func__);
		/* NOTREACHED */
	}

	for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
		net_add_proto(pr, dp, 1);

	ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
	    EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
	if (ev_pcb_zone == NULL) {
		panic("%s: failed allocating ev_pcb_zone", __func__);
		/* NOTREACHED */
	}
	zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
	zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
}
static int
kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
{
	int error = 0;
	struct kern_event_pcb *ev_pcb;

	error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
	if (error != 0)
		return (error);

	if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
		return (ENOBUFS);
	}
	bzero(ev_pcb, sizeof(struct kern_event_pcb));
	lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);

	ev_pcb->evp_socket = so;
	ev_pcb->evp_vendor_code_filter = 0xffffffff;

	so->so_pcb = (caddr_t) ev_pcb;
	lck_rw_lock_exclusive(kev_rwlock);
	LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
	kevtstat.kes_pcbcount++;
	kevtstat.kes_gencnt++;
	lck_rw_done(kev_rwlock);

	return (error);
}

static void
kev_delete(struct kern_event_pcb *ev_pcb)
{
	VERIFY(ev_pcb != NULL);
	lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
	zfree(ev_pcb_zone, ev_pcb);
}

static int
kev_detach(struct socket *so)
{
	struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;

	if (ev_pcb != NULL) {
		soisdisconnected(so);
		so->so_flags |= SOF_PCBCLEARING;
	}

	return (0);
}
/*
 * For now, kev_vendor_code and mbuf_tags use the same
 * mechanism.
 */
errno_t kev_vendor_code_find(
	const char	*string,
	u_int32_t	*out_vendor_code)
{
	if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
		return (EINVAL);
	}
	return (net_str_id_find_internal(string, out_vendor_code,
	    NSI_VENDOR_CODE, 1));
}

errno_t
kev_msg_post(struct kev_msg *event_msg)
{
	mbuf_tag_id_t min_vendor, max_vendor;

	net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);

	if (event_msg == NULL)
		return (EINVAL);

	/*
	 * Limit third parties to posting events for registered vendor codes
	 * only
	 */
	if (event_msg->vendor_code < min_vendor ||
	    event_msg->vendor_code > max_vendor) {
		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
		return (EINVAL);
	}
	return (kev_post_msg(event_msg));
}
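
/*
 * Illustrative sketch (not part of the original file): a kernel client posts
 * an event through the checked entry point above by filling out a struct
 * kev_msg.  Unused data vectors are terminated with a zero data_length.
 * The vendor code would normally come from kev_vendor_code_find(); the
 * class/subclass/event values below are placeholders, not real definitions.
 */
#if 0	/* illustrative only, never compiled */
static errno_t
example_post_event(u_int32_t vendor_code, void *payload, u_int32_t payload_len)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code  = vendor_code;	/* registered via kev_vendor_code_find() */
	ev_msg.kev_class    = KEV_ANY_CLASS;	/* placeholder class */
	ev_msg.kev_subclass = KEV_ANY_SUBCLASS;	/* placeholder subclass */
	ev_msg.event_code   = 1;		/* hypothetical event code */
	ev_msg.dv[0].data_ptr    = payload;
	ev_msg.dv[0].data_length = payload_len;
	ev_msg.dv[1].data_length = 0;		/* terminates the vector list */

	return kev_msg_post(&ev_msg);
}
#endif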
int
kev_post_msg(struct kev_msg *event_msg)
{
	struct mbuf *m, *m2;
	struct kern_event_pcb *ev_pcb;
	struct kern_event_msg *ev;
	char *tmp;
	u_int32_t total_size;
	int i;

	/* Verify the message is small enough to fit in one mbuf w/o cluster */
	total_size = KEV_MSG_HEADER_SIZE;

	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0)
			break;
		total_size += event_msg->dv[i].data_length;
	}

	if (total_size > MLEN) {
		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
		return (EMSGSIZE);
	}

	m = m_get(M_WAIT, MT_DATA);
	if (m == 0) {
		OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
		return (ENOMEM);
	}
	ev = mtod(m, struct kern_event_msg *);
	total_size = KEV_MSG_HEADER_SIZE;

	tmp = (char *) &ev->event_data[0];
	for (i = 0; i < 5; i++) {
		if (event_msg->dv[i].data_length == 0)
			break;

		total_size += event_msg->dv[i].data_length;
		bcopy(event_msg->dv[i].data_ptr, tmp,
		    event_msg->dv[i].data_length);
		tmp += event_msg->dv[i].data_length;
	}

	ev->id = ++static_event_id;
	ev->total_size = total_size;
	ev->vendor_code = event_msg->vendor_code;
	ev->kev_class = event_msg->kev_class;
	ev->kev_subclass = event_msg->kev_subclass;
	ev->event_code = event_msg->event_code;

	m->m_len = total_size;
	lck_rw_lock_shared(kev_rwlock);
	for (ev_pcb = LIST_FIRST(&kern_event_head);
	    ev_pcb;
	    ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		lck_mtx_lock(&ev_pcb->evp_mtx);
		if (ev_pcb->evp_socket->so_pcb == NULL) {
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			continue;
		}
		if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
			if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
				lck_mtx_unlock(&ev_pcb->evp_mtx);
				continue;
			}

			if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
				if (ev_pcb->evp_class_filter != ev->kev_class) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}

				if ((ev_pcb->evp_subclass_filter !=
				    KEV_ANY_SUBCLASS) &&
				    (ev_pcb->evp_subclass_filter !=
				    ev->kev_subclass)) {
					lck_mtx_unlock(&ev_pcb->evp_mtx);
					continue;
				}
			}
		}

		m2 = m_copym(m, 0, m->m_len, M_WAIT);
		if (m2 == 0) {
			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
			m_free(m);
			lck_mtx_unlock(&ev_pcb->evp_mtx);
			lck_rw_done(kev_rwlock);
			return (ENOMEM);
		}
		if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
			/*
			 * We use "m" for the socket stats as it would be
			 * unsafe to use "m2"
			 */
			so_inc_recv_data_stat(ev_pcb->evp_socket,
			    1, m->m_len, MBUF_TC_BE);

			sorwakeup(ev_pcb->evp_socket);
			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
		} else {
			OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
		}
		lck_mtx_unlock(&ev_pcb->evp_mtx);
	}
	m_free(m);
	lck_rw_done(kev_rwlock);

	return (0);
}
static int
kev_control(struct socket *so,
    u_long cmd,
    caddr_t data,
    __unused struct ifnet *ifp,
    __unused struct proc *p)
{
	struct kev_request *kev_req = (struct kev_request *) data;
	struct kern_event_pcb *ev_pcb;
	struct kev_vendor_code *kev_vendor;
	u_int32_t *id_value = (u_int32_t *) data;

	switch (cmd) {
	case SIOCGKEVID:
		*id_value = static_event_id;
		break;
	case SIOCSKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
		ev_pcb->evp_class_filter = kev_req->kev_class;
		ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
		break;
	case SIOCGKEVFILT:
		ev_pcb = (struct kern_event_pcb *) so->so_pcb;
		kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
		kev_req->kev_class = ev_pcb->evp_class_filter;
		kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
		break;
	case SIOCGKEVVENDOR:
		kev_vendor = (struct kev_vendor_code *)data;
		/* Make sure string is NULL terminated */
		kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
		return (net_str_id_find_internal(kev_vendor->vendor_string,
		    &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
	default:
		return (ENOTSUP);
	}

	return (0);
}
__private_extern__ int
kevt_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;

	lck_rw_lock_shared(kev_rwlock);

	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}
	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = sizeof(struct kevtstat);
		goto done;
	}

	error = SYSCTL_OUT(req, &kevtstat,
	    MIN(sizeof(struct kevtstat), req->oldlen));
done:
	lck_rw_done(kev_rwlock);

	return (error);
}
__private_extern__ int
kevt_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0;
	int n, i;
	struct xsystmgen xsg;
	void *buf = NULL;
	size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
		ROUNDUP64(sizeof (struct xsocket_n)) +
		2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
		ROUNDUP64(sizeof (struct xsockstat_n));
	struct kern_event_pcb *ev_pcb;

	buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
	if (buf == NULL)
		return (ENOMEM);

	lck_rw_lock_shared(kev_rwlock);

	n = kevtstat.kes_pcbcount;

	if (req->oldptr == USER_ADDR_NULL) {
		req->oldidx = (n + n/8) * item_size;
		goto done;
	}
	if (req->newptr != USER_ADDR_NULL) {
		error = EPERM;
		goto done;
	}

	bzero(&xsg, sizeof (xsg));
	xsg.xg_len = sizeof (xsg);
	xsg.xg_count = n;
	xsg.xg_gen = kevtstat.kes_gencnt;
	xsg.xg_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
	if (error) {
		goto done;
	}

	/*
	 * We are done if there is no pcb
	 */
	if (n == 0) {
		goto done;
	}

	for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
	    i < n && ev_pcb != NULL;
	    i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
		struct xkevtpcb *xk = (struct xkevtpcb *)buf;
		struct xsocket_n *xso = (struct xsocket_n *)
			ADVANCE64(xk, sizeof (*xk));
		struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
			ADVANCE64(xso, sizeof (*xso));
		struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
			ADVANCE64(xsbrcv, sizeof (*xsbrcv));
		struct xsockstat_n *xsostats = (struct xsockstat_n *)
			ADVANCE64(xsbsnd, sizeof (*xsbsnd));

		bzero(buf, item_size);

		lck_mtx_lock(&ev_pcb->evp_mtx);

		xk->kep_len = sizeof(struct xkevtpcb);
		xk->kep_kind = XSO_EVT;
		xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
		xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
		xk->kep_class_filter = ev_pcb->evp_class_filter;
		xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;

		sotoxsocket_n(ev_pcb->evp_socket, xso);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
			&ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
		sbtoxsockbuf_n(ev_pcb->evp_socket ?
			&ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
		sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);

		lck_mtx_unlock(&ev_pcb->evp_mtx);

		error = SYSCTL_OUT(req, buf, item_size);
		if (error)
			break;
	}

	if (error == 0) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		bzero(&xsg, sizeof (xsg));
		xsg.xg_len = sizeof (xsg);
		xsg.xg_count = n;
		xsg.xg_gen = kevtstat.kes_gencnt;
		xsg.xg_sogen = so_gencnt;
		error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
	}

done:
	lck_rw_done(kev_rwlock);

	return (error);
}

#endif /* SOCKETS */
int
fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
{
	struct vinfo_stat * st;

	st = &kinfo->kq_stat;

	st->vst_size = kq->kq_count;
	if (kq->kq_state & KQ_KEV_QOS)
		st->vst_blksize = sizeof(struct kevent_qos_s);
	else if (kq->kq_state & KQ_KEV64)
		st->vst_blksize = sizeof(struct kevent64_s);
	else
		st->vst_blksize = sizeof(struct kevent);
	st->vst_mode = S_IFIFO;
	st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
		((struct kqworkloop *)kq)->kqwl_dynamicid : 0;

	/* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
#define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
	kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;

	return (0);
}
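
/*
 * Illustrative sketch (not part of the original file): userspace can read the
 * state exported by fill_kqueueinfo() above through libproc.  The flavor and
 * structure names below (PROC_PIDFDKQUEUEINFO, struct kqueue_fdinfo) are the
 * ones this author believes are declared in <sys/proc_info.h>/<libproc.h>;
 * treat the snippet as a sketch of the call shape, not authoritative
 * documentation.
 */
#if 0	/* illustrative only, userspace code */
#include <libproc.h>
#include <sys/proc_info.h>
#include <stdio.h>

static void
print_kqueue_state(pid_t pid, int kqfd)
{
	struct kqueue_fdinfo kqi;
	int ret = proc_pidfdinfo(pid, kqfd, PROC_PIDFDKQUEUEINFO,
	    &kqi, sizeof(kqi));
	if (ret == sizeof(kqi)) {
		/* kq_state carries the PROC_KQUEUE_* flags masked above */
		printf("count=%lld state=0x%x\n",
		    (long long)kqi.kqueueinfo.kq_stat.vst_size,
		    kqi.kqueueinfo.kq_state);
	}
}
#endif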
int
fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
{
	struct kqworkloop *kqwl = (struct kqworkloop *)kq;
	struct kqrequest *kqr = &kqwl->kqwl_request;
	workq_threadreq_param_t trp = {};
	int err;

	if ((kq->kq_state & KQ_WORKLOOP) == 0) {
		return EINVAL;
	}

	if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) {
		return err;
	}

	kq_req_lock(kqwl);

	kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
	kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
	kqdi->kqdi_request_state = kqr->kqr_state;
	kqdi->kqdi_async_qos = kqr->kqr_qos_index;
	kqdi->kqdi_events_qos = kqr->kqr_override_index;
	kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
	kqdi->kqdi_sync_waiter_qos = 0;

	trp.trp_value = kqwl->kqwl_params;
	if (trp.trp_flags & TRP_PRIORITY)
		kqdi->kqdi_pri = trp.trp_pri;
	else
		kqdi->kqdi_pri = 0;

	if (trp.trp_flags & TRP_POLICY)
		kqdi->kqdi_pol = trp.trp_pol;
	else
		kqdi->kqdi_pol = 0;

	if (trp.trp_flags & TRP_CPUPERCENT)
		kqdi->kqdi_cpupercent = trp.trp_cpupercent;
	else
		kqdi->kqdi_cpupercent = 0;

	kq_req_unlock(kqwl);

	return 0;
}
void
knote_markstayactive(struct knote *kn)
{
	struct kqueue *kq = knote_get_kq(kn);
	kq_index_t qos;

	kqlock(kq);
	kn->kn_status |= KN_STAYACTIVE;

	/*
	 * Making a knote stay active is a property of the knote that must be
	 * established before it is fully attached.
	 */
	assert(kn->kn_status & KN_ATTACHING);
	assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);

	/* handle all stayactive knotes on the (appropriate) manager */
	if (kq->kq_state & KQ_WORKQ) {
		qos = KQWQ_QOS_MANAGER;
	} else if (kq->kq_state & KQ_WORKLOOP) {
		struct kqworkloop *kqwl = (struct kqworkloop *)kq;

		qos = _pthread_priority_thread_qos(kn->kn_qos);
		assert(qos && qos < THREAD_QOS_LAST);

		kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);

		qos = KQWL_BUCKET_STAYACTIVE;
	} else {
		qos = THREAD_QOS_UNSPECIFIED;
	}

	kn->kn_req_index = qos;
	kn->kn_qos_override = qos;
	kn->kn_qos_index = qos;

	kqunlock(kq);
}

void
knote_clearstayactive(struct knote *kn)
{
	kqlock(knote_get_kq(kn));
	kn->kn_status &= ~KN_STAYACTIVE;
	knote_deactivate(kn);
	kqunlock(knote_get_kq(kn));
}
static unsigned long
kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
		unsigned long buflen, unsigned long nknotes)
{
	for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
		if (kq == knote_get_kq(kn)) {
			if (nknotes < buflen) {
				struct kevent_extinfo *info = &buf[nknotes];
				struct kevent_internal_s *kevp = &kn->kn_kevent;

				info->kqext_kev = (struct kevent_qos_s){
					.ident = kevp->ident,
					.filter = kevp->filter,
					.flags = kevp->flags,
					.fflags = kevp->fflags,
					.data = (int64_t)kevp->data,
					.udata = kevp->udata,
					.ext[0] = kevp->ext[0],
					.ext[1] = kevp->ext[1],
					.ext[2] = kevp->ext[2],
					.ext[3] = kevp->ext[3],
					.qos = kn->kn_req_index,
				};
				info->kqext_sdata = kn->kn_sdata;
				info->kqext_status = kn->kn_status;
				info->kqext_sfflags = kn->kn_sfflags;
			}

			/* we return total number of knotes, which may be more than requested */
			nknotes++;
		}
	}

	return nknotes;
}
int
kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
		int32_t *nkqueues_out)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = p->p_fd;
	unsigned int nkqueues = 0;
	unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
	size_t buflen, bufsize;
	kqueue_id_t *kq_ids = NULL;
	size_t copysize;
	int err = 0;

	if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
		err = EINVAL;
		goto out;
	}

	buflen = min(ubuflen, PROC_PIDDYNKQUEUES_MAX);

	if (buflen) {
		if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
			err = ERANGE;
			goto out;
		}
		kq_ids = kalloc(bufsize);
		if (!kq_ids) {
			err = ENOMEM;
			goto out;
		}
		bzero(kq_ids, bufsize);
	}

	if (fdp->fd_kqhashmask > 0) {
		for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
			struct kqworkloop *kqwl;

			SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
				/* report the number of kqueues, even if they don't all fit */
				if (nkqueues < buflen) {
					kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
				}
				nkqueues++;
			}
		}
	}

	if (kq_ids) {
		if (os_mul_overflow(sizeof(kqueue_id_t), min(buflen, nkqueues), &copysize)) {
			err = ERANGE;
			goto out;
		}

		assert(ubufsize >= copysize);
		err = copyout(kq_ids, ubuf, copysize);
	}

out:
	if (kq_ids) {
		kfree(kq_ids, bufsize);
	}

	if (!err) {
		*nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
	}
	return err;
}
int
kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
		uint32_t ubufsize, int32_t *size_out)
{
	proc_t p = (proc_t)proc;
	struct kqueue *kq;
	int err = 0;
	struct kqueue_dyninfo kqdi = { };

	if (ubufsize < sizeof(struct kqueue_info)) {
		return ENOBUFS;
	}

	kq = kqueue_hash_lookup(p, kq_id);

	/*
	 * backward compatibility: allow the argument to this call to only be
	 * a struct kqueue_info
	 */
	if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
		ubufsize = sizeof(struct kqueue_dyninfo);
		err = fill_kqueue_dyninfo(kq, &kqdi);
	} else {
		ubufsize = sizeof(struct kqueue_info);
		err = fill_kqueueinfo(kq, &kqdi.kqdi_info);
	}
	if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
		*size_out = ubufsize;
	}
	kqueue_release_last(p, kq);
	return err;
}

int
kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
		uint32_t ubufsize, int32_t *nknotes_out)
{
	proc_t p = (proc_t)proc;
	struct kqueue *kq;
	int err;

	kq = kqueue_hash_lookup(p, kq_id);

	err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out);
	kqueue_release_last(p, kq);
	return err;
}
int
pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
		uint32_t bufsize, int32_t *retval)
{
	struct knote *kn;
	int i;
	int err = 0;
	struct filedesc *fdp = p->p_fd;
	unsigned long nknotes = 0;
	unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
	struct kevent_extinfo *kqext = NULL;

	/* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
	buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);

	kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
	if (kqext == NULL) {
		err = ENOMEM;
		goto out;
	}
	bzero(kqext, buflen * sizeof(struct kevent_extinfo));

	for (i = 0; i < fdp->fd_knlistsize; i++) {
		kn = SLIST_FIRST(&fdp->fd_knlist[i]);
		nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
	}

	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			kn = SLIST_FIRST(&fdp->fd_knhash[i]);
			nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
		}
	}

	assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
	err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));

out:
	if (kqext) {
		kfree(kqext, buflen * sizeof(struct kevent_extinfo));
		kqext = NULL;
	}

	if (!err) {
		*retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
	}
	return err;
}
static unsigned int
klist_copy_udata(struct klist *list, uint64_t *buf,
		unsigned int buflen, unsigned int nknotes)
{
	struct kevent_internal_s *kev;
	struct knote *kn;

	SLIST_FOREACH(kn, list, kn_link) {
		if (nknotes < buflen) {
			struct kqueue *kq = knote_get_kq(kn);
			kqlock(kq);
			kev = &(kn->kn_kevent);
			buf[nknotes] = kev->udata;
			kqunlock(kq);
		}
		/* we return total number of knotes, which may be more than requested */
		nknotes++;
	}

	return nknotes;
}

static unsigned int
kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list,
		uint64_t *buf, unsigned int buflen, unsigned int nids)
{
	kqhash_lock_held(p);
	struct kqworkloop *kqwl;

	SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
		if (nids < buflen) {
			buf[nids] = kqwl->kqwl_dynamicid;
		}
		nids++;
	}

	return nids;
}
int
kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
{
	proc_t p = (proc_t)proc;
	struct filedesc *fdp = p->p_fd;
	unsigned int nuptrs = 0;
	unsigned long buflen = bufsize / sizeof(uint64_t);

	if (buflen > 0) {
		assert(buf != NULL);
	}

	for (int i = 0; i < fdp->fd_knlistsize; i++) {
		nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
	}

	if (fdp->fd_knhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
			nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
		}
	}

	if (fdp->fd_kqhashmask != 0) {
		for (int i = 0; i < (int)fdp->fd_kqhashmask + 1; i++) {
			nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen,
					nuptrs);
		}
	}

	return nuptrs;
}
static void
kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
{
	uint64_t ast_addr;
	bool proc_is_64bit = !!(p->p_flag & P_LP64);
	size_t user_addr_size = proc_is_64bit ? 8 : 4;
	uint32_t ast_flags32 = 0;
	uint64_t ast_flags64 = 0;
	struct uthread *ut = get_bsdthread_info(thread);

	if (ut->uu_kqr_bound != NULL) {
		ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
	}

	if (ast_flags64 == 0) {
		return;
	}

	if (!(p->p_flag & P_LP64)) {
		ast_flags32 = (uint32_t)ast_flags64;
		assert(ast_flags64 < 0x100000000ull);
	}

	ast_addr = thread_rettokern_addr(thread);
	if (ast_addr == 0) {
		return;
	}

	if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
	    (user_addr_t)ast_addr,
	    user_addr_size) != 0) {
		printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
		    "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
	}
}

void
kevent_ast(thread_t thread, uint16_t bits)
{
	proc_t p = current_proc();

	if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
		workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
	}
	if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
		kevent_set_return_to_kernel_user_tsd(p, thread);
	}
}
#if DEVELOPMENT || DEBUG

#define KEVENT_SYSCTL_BOUND_ID 1

static int
kevent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	uintptr_t type = (uintptr_t)arg1;
	uint64_t bound_id = 0;

	if (type != KEVENT_SYSCTL_BOUND_ID) {
		return EINVAL;
	}

	if (req->newptr) {
		return EINVAL;
	}

	struct uthread *ut = get_bsdthread_info(current_thread());

	struct kqrequest *kqr = ut->uu_kqr_bound;
	if (kqr) {
		if (kqr->kqr_state & KQR_WORKLOOP) {
			bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
		} else {
			bound_id = -1;
		}
	}

	return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
}

SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
		"kevent information");

SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
		CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
		(void *)KEVENT_SYSCTL_BOUND_ID,
		sizeof(kqueue_id_t), kevent_sysctl, "Q",
		"get the ID of the bound kqueue");

#endif /* DEVELOPMENT || DEBUG */
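
/*
 * Illustrative sketch (not part of the original file): on DEVELOPMENT/DEBUG
 * kernels, the sysctl registered above can be read from userspace to find
 * the dynamic ID of the workloop kqueue the calling thread is bound to.
 */
#if 0	/* illustrative only, userspace code */
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

static void
print_bound_kqueue_id(void)
{
	uint64_t bound_id = 0;
	size_t len = sizeof(bound_id);

	if (sysctlbyname("kern.kevent.bound_id", &bound_id, &len, NULL, 0) == 0) {
		printf("bound kqueue id: 0x%llx\n", (unsigned long long)bound_id);
	}
}
#endif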