1 /*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58 #include <stdatomic.h>
59
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/filedesc.h>
63 #include <sys/kernel.h>
64 #include <sys/proc_internal.h>
65 #include <sys/kauth.h>
66 #include <sys/malloc.h>
67 #include <sys/unistd.h>
68 #include <sys/file_internal.h>
69 #include <sys/fcntl.h>
70 #include <sys/select.h>
71 #include <sys/queue.h>
72 #include <sys/event.h>
73 #include <sys/eventvar.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/socketvar.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/uio.h>
80 #include <sys/sysproto.h>
81 #include <sys/user.h>
82 #include <sys/vnode_internal.h>
83 #include <string.h>
84 #include <sys/proc_info.h>
85 #include <sys/codesign.h>
86 #include <sys/pthread_shims.h>
87 #include <sys/kdebug.h>
88 #include <sys/reason.h>
89 #include <os/reason_private.h>
90
91 #include <kern/locks.h>
92 #include <kern/clock.h>
93 #include <kern/cpu_data.h>
94 #include <kern/policy_internal.h>
95 #include <kern/thread_call.h>
96 #include <kern/sched_prim.h>
97 #include <kern/waitq.h>
98 #include <kern/zalloc.h>
99 #include <kern/kalloc.h>
100 #include <kern/assert.h>
101 #include <kern/ast.h>
102 #include <kern/thread.h>
103 #include <kern/kcdata.h>
104
105 #include <libkern/libkern.h>
106 #include <libkern/OSAtomic.h>
107
108 #include "net/net_str_id.h"
109
110 #include <mach/task.h>
111 #include <libkern/section_keywords.h>
112
113 #if CONFIG_MEMORYSTATUS
114 #include <sys/kern_memorystatus.h>
115 #endif
116
117 extern thread_t port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h */
118 extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc/ipc_entry.h */
119
120 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
121
122 /*
123 * JMM - this typedef needs to be unified with pthread_priority_t
124 * and mach_msg_priority_t. It also needs to be the same type
125 * everywhere.
126 */
127 typedef int32_t qos_t;
128
129 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
130
131 #define KQ_EVENT NO_EVENT64
132
133 #define KNUSE_NONE 0x0
134 #define KNUSE_STEAL_DROP 0x1
135 #define KNUSE_BOOST 0x2
136 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags);
137 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
138 static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags);
139 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags);
140
141 static int kqueue_read(struct fileproc *fp, struct uio *uio,
142 int flags, vfs_context_t ctx);
143 static int kqueue_write(struct fileproc *fp, struct uio *uio,
144 int flags, vfs_context_t ctx);
145 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
146 vfs_context_t ctx);
147 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
148 vfs_context_t ctx);
149 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
150 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
151 struct kevent_internal_s *kev, vfs_context_t ctx);
152 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
153
154 static const struct fileops kqueueops = {
155 .fo_type = DTYPE_KQUEUE,
156 .fo_read = kqueue_read,
157 .fo_write = kqueue_write,
158 .fo_ioctl = kqueue_ioctl,
159 .fo_select = kqueue_select,
160 .fo_close = kqueue_close,
161 .fo_kqfilter = kqueue_kqfilter,
162 .fo_drain = kqueue_drain,
163 };
164
165 static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, struct kqueue *kq);
166 static int kevent_internal(struct proc *p,
167 kqueue_id_t id, kqueue_id_t *id_out,
168 user_addr_t changelist, int nchanges,
169 user_addr_t eventlist, int nevents,
170 user_addr_t data_out, uint64_t data_available,
171 unsigned int flags, user_addr_t utimeout,
172 kqueue_continue_t continuation,
173 int32_t *retval);
174 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
175 struct proc *p, unsigned int flags);
176 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
177 struct proc *p, unsigned int flags);
178 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
179
180 static void kqueue_interrupt(struct kqueue *kq);
181 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
182 void *data);
183 static void kevent_continue(struct kqueue *kq, void *data, int error);
184 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
185 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
186 struct filt_process_s *process_data, int *countp, struct proc *p);
187 static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index);
188 static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index);
189 static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
190
191 static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index);
192
193 static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index);
194 static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index);
195 static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index);
196 static void kqworkq_bind_thread_impl(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
197 static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
198 static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
199
200 enum {
201 KQWL_UO_NONE = 0,
202 KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI = 0x1,
203 KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI = 0x2,
204 KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS = 0x4,
205 KQWL_UO_UPDATE_OVERRIDE_LAZY = 0x8
206 };
207
208 static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t qos_index, kq_index_t override_index, uint32_t flags);
209 static void kqworkloop_bind_thread_impl(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
210 static void kqworkloop_unbind_thread(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
211 static inline kq_index_t kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *);
212 static void kqworkloop_update_suppress_sync_count(struct kqrequest *kqr, uint32_t flags);
213 enum {
214 KQWL_UTQ_NONE,
215 /*
216 * The wakeup qos is the qos of QUEUED knotes.
217 *
218 * This QoS is accounted for with the events override in the
219 * kqr_override_index field. It is raised each time a new knote is queued at
220 * a given QoS. The kqr_wakeup_indexes field is a superset of the non empty
221 * knote buckets and is recomputed after each event delivery.
222 */
223 KQWL_UTQ_UPDATE_WAKEUP_QOS,
224 KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
225 KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
226 /*
227 * The wakeup override is for suppressed knotes that have fired again at
228 * a higher QoS than the one for which they are suppressed already.
229 * This override is cleared when the knote suppressed list becomes empty.
230 */
231 KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
232 KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
233 /*
234 * The async QoS is the maximum QoS of an event enqueued on this workloop in
235 * userland. It is copied from the only EVFILT_WORKLOOP knote with
236 * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
237 * such knote, this QoS is 0.
238 */
239 KQWL_UTQ_SET_ASYNC_QOS,
240 /*
241 * The sync waiters QoS is the maximum QoS of any thread blocked on an
242 * EVFILT_WORKLOOP knote marked with the NOTE_WL_SYNC_WAIT bit.
243 * If there is no such knote, this QoS is 0.
244 */
245 KQWL_UTQ_SET_SYNC_WAITERS_QOS,
246 KQWL_UTQ_REDRIVE_EVENTS,
247 };
248 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
249 static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
250
251 static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
252 struct filt_process_s *process_data, struct proc *p);
253 #if 0
254 static void knote_put(struct knote *kn);
255 #endif
256
257 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
258 struct kevent_internal_s *kev, struct proc *p, int *knoteuse_flags);
259 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);
260 static void kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, kn_status_t *kn_status, uint16_t *kq_state);
261
262 static void knote_drop(struct knote *kn, struct proc *p);
263 static struct knote *knote_alloc(void);
264 static void knote_free(struct knote *kn);
265
266 static void knote_activate(struct knote *kn);
267 static void knote_deactivate(struct knote *kn);
268
269 static void knote_enable(struct knote *kn);
270 static void knote_disable(struct knote *kn);
271
272 static int knote_enqueue(struct knote *kn);
273 static void knote_dequeue(struct knote *kn);
274
275 static void knote_suppress(struct knote *kn);
276 static void knote_unsuppress(struct knote *kn);
277 static void knote_wakeup(struct knote *kn);
278
279 static kq_index_t knote_get_queue_index(struct knote *kn);
280 static struct kqtailq *knote_get_queue(struct knote *kn);
281 static kq_index_t knote_get_req_index(struct knote *kn);
282 static kq_index_t knote_get_qos_index(struct knote *kn);
283 static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index);
284 static kq_index_t knote_get_qos_override_index(struct knote *kn);
285 static kq_index_t knote_get_sync_qos_override_index(struct knote *kn);
286 static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index, boolean_t override_is_sync);
287 static void knote_set_qos_overcommit(struct knote *kn);
288
289 static int filt_fileattach(struct knote *kn, struct kevent_internal_s *kev);
290 SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
291 .f_isfd = 1,
292 .f_attach = filt_fileattach,
293 };
294
295 static void filt_kqdetach(struct knote *kn);
296 static int filt_kqueue(struct knote *kn, long hint);
297 static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev);
298 static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
299 SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
300 .f_isfd = 1,
301 .f_detach = filt_kqdetach,
302 .f_event = filt_kqueue,
303 .f_touch = filt_kqtouch,
304 .f_process = filt_kqprocess,
305 };
306
307 /* placeholder for not-yet-implemented filters */
308 static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
309 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
310 .f_attach = filt_badattach,
311 };
312
313 static int filt_procattach(struct knote *kn, struct kevent_internal_s *kev);
314 static void filt_procdetach(struct knote *kn);
315 static int filt_proc(struct knote *kn, long hint);
316 static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev);
317 static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
318 SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
319 .f_attach = filt_procattach,
320 .f_detach = filt_procdetach,
321 .f_event = filt_proc,
322 .f_touch = filt_proctouch,
323 .f_process = filt_procprocess,
324 };
325
326 #if CONFIG_MEMORYSTATUS
327 extern const struct filterops memorystatus_filtops;
328 #endif /* CONFIG_MEMORYSTATUS */
329
330 extern const struct filterops fs_filtops;
331
332 extern const struct filterops sig_filtops;
333
334 static zone_t knote_zone;
335 static zone_t kqfile_zone;
336 static zone_t kqworkq_zone;
337 static zone_t kqworkloop_zone;
338
339 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
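/*
 * Worked example of the hash above (values chosen arbitrarily):
 *   KN_HASH(0x1234, 0xff) == (0x1234 ^ (0x1234 >> 8)) & 0xff
 *                         == (0x1234 ^ 0x12) & 0xff
 *                         == 0x26
 */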
340
341 /* Mach portset filter */
342 extern const struct filterops machport_filtops;
343
344 /* User filter */
345 static int filt_userattach(struct knote *kn, struct kevent_internal_s *kev);
346 static void filt_userdetach(struct knote *kn);
347 static int filt_user(struct knote *kn, long hint);
348 static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev);
349 static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
350 SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
351 .f_attach = filt_userattach,
352 .f_detach = filt_userdetach,
353 .f_event = filt_user,
354 .f_touch = filt_usertouch,
355 .f_process = filt_userprocess,
356 };
357
358 static lck_spin_t _filt_userlock;
359 static void filt_userlock(void);
360 static void filt_userunlock(void);
361
362 /* Workloop filter */
363 static bool filt_wlneeds_boost(struct kevent_internal_s *kev);
364 static int filt_wlattach(struct knote *kn, struct kevent_internal_s *kev);
365 static int filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev);
366 static void filt_wldetach(struct knote *kn);
367 static int filt_wlevent(struct knote *kn, long hint);
368 static int filt_wltouch(struct knote *kn, struct kevent_internal_s *kev);
369 static int filt_wldrop_and_unlock(struct knote *kn, struct kevent_internal_s *kev);
370 static int filt_wlprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
371 SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
372 .f_needs_boost = filt_wlneeds_boost,
373 .f_attach = filt_wlattach,
374 .f_post_attach = filt_wlpost_attach,
375 .f_detach = filt_wldetach,
376 .f_event = filt_wlevent,
377 .f_touch = filt_wltouch,
378 .f_drop_and_unlock = filt_wldrop_and_unlock,
379 .f_process = filt_wlprocess,
380 };
381
382 extern const struct filterops pipe_rfiltops;
383 extern const struct filterops pipe_wfiltops;
384 extern const struct filterops ptsd_kqops;
385 extern const struct filterops soread_filtops;
386 extern const struct filterops sowrite_filtops;
387 extern const struct filterops sock_filtops;
388 extern const struct filterops soexcept_filtops;
389 extern const struct filterops spec_filtops;
390 extern const struct filterops bpfread_filtops;
391 extern const struct filterops necp_fd_rfiltops;
392 extern const struct filterops fsevent_filtops;
393 extern const struct filterops vnode_filtops;
394 extern const struct filterops tty_filtops;
395
396 const static struct filterops timer_filtops;
397
398 /*
399 *
400 * Rules for adding new filters to the system:
401 * Public filters:
402 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
403 * in the exported section of the header
404 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
405 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
406 * of the Public Filters section in the array.
407 * Private filters:
408 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
409 * in the XNU_KERNEL_PRIVATE section of the header
410 * - Update the EVFILTID_MAX value to reflect the new addition
411 * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
412 * the Private filters section of the array.
413 */
414 SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
415 /* Public Filters */
416 [~EVFILT_READ] = &file_filtops,
417 [~EVFILT_WRITE] = &file_filtops,
418 [~EVFILT_AIO] = &bad_filtops,
419 [~EVFILT_VNODE] = &file_filtops,
420 [~EVFILT_PROC] = &proc_filtops,
421 [~EVFILT_SIGNAL] = &sig_filtops,
422 [~EVFILT_TIMER] = &timer_filtops,
423 [~EVFILT_MACHPORT] = &machport_filtops,
424 [~EVFILT_FS] = &fs_filtops,
425 [~EVFILT_USER] = &user_filtops,
426 &bad_filtops,
427 &bad_filtops,
428 [~EVFILT_SOCK] = &file_filtops,
429 #if CONFIG_MEMORYSTATUS
430 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
431 #else
432 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
433 #endif
434 [~EVFILT_EXCEPT] = &file_filtops,
435
436 [~EVFILT_WORKLOOP] = &workloop_filtops,
437
438 /* Private filters */
439 [EVFILTID_KQREAD] = &kqread_filtops,
440 [EVFILTID_PIPE_R] = &pipe_rfiltops,
441 [EVFILTID_PIPE_W] = &pipe_wfiltops,
442 [EVFILTID_PTSD] = &ptsd_kqops,
443 [EVFILTID_SOREAD] = &soread_filtops,
444 [EVFILTID_SOWRITE] = &sowrite_filtops,
445 [EVFILTID_SCK] = &sock_filtops,
446 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
447 [EVFILTID_SPEC] = &spec_filtops,
448 [EVFILTID_BPFREAD] = &bpfread_filtops,
449 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
450 [EVFILTID_FSEVENT] = &fsevent_filtops,
451 [EVFILTID_VN] = &vnode_filtops,
452 [EVFILTID_TTY] = &tty_filtops
453 };
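/*
 * Illustrative sketch of the "adding a new public filter" rules above.
 * EVFILT_EXAMPLE and example_filtops are hypothetical names invented for
 * this sketch; they do not exist in the tree:
 *
 *	// bsd/sys/event.h, exported section: take the next unused negative value
 *	//   #define EVFILT_EXAMPLE   (-<next>)
 *	//   ...and bump EVFILT_SYSCOUNT to match.
 *
 *	// this file, end of the Public Filters section of sysfilt_ops:
 *	//   [~EVFILT_EXAMPLE] = &example_filtops,
 */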
454
455 /* waitq prepost callback */
456 void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
457
458 #ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
459 #define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
460 #endif
461 #ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
462 #define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */
463 #endif
464 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
465 #define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */
466 #endif
467 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32
468 #define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8
469 #endif
470
471 static inline __kdebug_only
472 uintptr_t
473 kqr_thread_id(struct kqrequest *kqr)
474 {
475 return (uintptr_t)thread_tid(kqr->kqr_thread);
476 }
477
478 static inline
479 boolean_t is_workqueue_thread(thread_t thread)
480 {
481 return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
482 }
483
484 static inline
485 void knote_canonicalize_kevent_qos(struct knote *kn)
486 {
487 struct kqueue *kq = knote_get_kq(kn);
488 unsigned long canonical;
489
490 if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
491 return;
492
493 /* preserve manager and overcommit flags in this case */
494 canonical = pthread_priority_canonicalize(kn->kn_qos, FALSE);
495 kn->kn_qos = (qos_t)canonical;
496 }
497
498 static inline
499 kq_index_t qos_index_from_qos(struct knote *kn, qos_t qos, boolean_t propagation)
500 {
501 struct kqueue *kq = knote_get_kq(kn);
502 kq_index_t qos_index;
503 unsigned long flags = 0;
504
505 if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
506 return QOS_INDEX_KQFILE;
507
508 qos_index = (kq_index_t)thread_qos_from_pthread_priority(
509 (unsigned long)qos, &flags);
510
511 if (kq->kq_state & KQ_WORKQ) {
512 /* workq kqueues support requesting a manager thread (non-propagation) */
513 if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG))
514 return KQWQ_QOS_MANAGER;
515 }
516
517 return qos_index;
518 }
519
520 static inline
521 qos_t qos_from_qos_index(kq_index_t qos_index)
522 {
523 /* should only happen for KQ_WORKQ */
524 if (qos_index == KQWQ_QOS_MANAGER)
525 return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
526
527 if (qos_index == 0)
528 return THREAD_QOS_UNSPECIFIED;
529
530 	/* Should be supported by the pthread kext */
531 return (1 << (qos_index - 1 +
532 _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32));
533 }
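
/*
 * Worked example of the encoding above: qos_index 3 maps to
 * (1 << (3 - 1 + _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32)) == (1 << 10) == 0x400,
 * a bit inside _PTHREAD_PRIORITY_QOS_CLASS_MASK (0x003fff00).  The manager
 * index is special-cased to the event manager flag instead.
 */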
534
535 /* kqr lock must be held */
536 static inline
537 unsigned long pthread_priority_for_kqrequest(
538 struct kqrequest *kqr,
539 kq_index_t qos_index)
540 {
541 unsigned long priority = qos_from_qos_index(qos_index);
542 if (kqr->kqr_state & KQR_THOVERCOMMIT) {
543 priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
544 }
545 return priority;
546 }
547
548 static inline
549 kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
550 {
551 #pragma unused(thread)
552 kq_index_t qos_index;
553
554 if (flags & KEVENT_FLAG_WORKQ_MANAGER)
555 return KQWQ_QOS_MANAGER;
556
557 qos_index = (kq_index_t)qos_class;
558 assert(qos_index > 0 && qos_index < KQWQ_QOS_MANAGER);
559
560 return qos_index;
561 }
562
563 /*
564 * kqueue/note lock implementations
565 *
566 * The kqueue lock guards the kq state, the state of its queues,
567 * and the kqueue-aware status and use counts of individual knotes.
568 *
569 * The kqueue workq lock is used to protect state guarding the
570 * interaction of the kqueue with the workq. This state cannot
571 * be guarded by the kq lock - as it needs to be taken when we
572 * already have the waitq set lock held (during the waitq hook
573 * callback). It might be better to use the waitq lock itself
574  * for this, but the IRQ requirements make that difficult.
575 *
576 * Knote flags, filter flags, and associated data are protected
577 * by the underlying object lock - and are only ever looked at
578 * by calling the filter to get a [consistent] snapshot of that
579 * data.
580 */
581 lck_grp_attr_t * kq_lck_grp_attr;
582 lck_grp_t * kq_lck_grp;
583 lck_attr_t * kq_lck_attr;
584
585 static inline void
586 kqlock(struct kqueue *kq)
587 {
588 lck_spin_lock(&kq->kq_lock);
589 }
590
591 static inline void
592 kqlock_held(__assert_only struct kqueue *kq)
593 {
594 LCK_SPIN_ASSERT(&kq->kq_lock, LCK_ASSERT_OWNED);
595 }
596
597 static inline void
598 kqunlock(struct kqueue *kq)
599 {
600 lck_spin_unlock(&kq->kq_lock);
601 }
602
603 static inline void
604 knhash_lock(proc_t p)
605 {
606 lck_mtx_lock(&p->p_fd->fd_knhashlock);
607 }
608
609 static inline void
610 knhash_unlock(proc_t p)
611 {
612 lck_mtx_unlock(&p->p_fd->fd_knhashlock);
613 }
614
615
616 /*
617  * Convert a kq lock to a knote use reference.
618 *
619 * If the knote is being dropped, or has
620 * vanished, we can't get a use reference.
621 * Just return with it still locked.
622 *
623 * - kq locked at entry
624 * - unlock on exit if we get the use reference
625 */
626 static int
627 kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags)
628 {
629 if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
630 return (0);
631
632 assert(kn->kn_status & KN_ATTACHED);
633 kn->kn_inuse++;
634 if (flags & KNUSE_BOOST) {
635 set_thread_rwlock_boost();
636 }
637 kqunlock(kq);
638 return (1);
639 }
640
641 /*
642 * - kq locked at entry
643 * - kq unlocked at exit
644 */
645 __disable_tail_calls
646 static wait_result_t
647 knoteusewait(struct kqueue *kq, struct knote *kn)
648 {
649 kn->kn_status |= KN_USEWAIT;
650 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
651 CAST_EVENT64_T(&kn->kn_status),
652 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
653 kqunlock(kq);
654 return thread_block(THREAD_CONTINUE_NULL);
655 }
656
657 static bool
658 knoteuse_needs_boost(struct knote *kn, struct kevent_internal_s *kev)
659 {
660 if (knote_fops(kn)->f_needs_boost) {
661 return knote_fops(kn)->f_needs_boost(kev);
662 }
663 return false;
664 }
665
666 /*
667 * Convert from a knote use reference back to kq lock.
668 *
669 * Drop a use reference and wake any waiters if
670 * this is the last one.
671 *
672 * If someone is trying to drop the knote, but the
673 * caller has events they must deliver, take
674 * responsibility for the drop later - and wake the
675 * other attempted dropper in a manner that informs
676 * him of the transfer of responsibility.
677 *
678 * The exit return indicates if the knote is still alive
679 * (or if not, the other dropper has been given the green
680 * light to drop it).
681 *
682 * The kqueue lock is re-taken unconditionally.
683 */
684 static int
685 knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags)
686 {
687 int dropped = 0;
688 int steal_drop = (flags & KNUSE_STEAL_DROP);
689
690 kqlock(kq);
691 if (flags & KNUSE_BOOST) {
692 clear_thread_rwlock_boost();
693 }
694
695 if (--kn->kn_inuse == 0) {
696
697 if ((kn->kn_status & KN_ATTACHING) != 0) {
698 kn->kn_status &= ~KN_ATTACHING;
699 }
700
701 if ((kn->kn_status & KN_USEWAIT) != 0) {
702 wait_result_t result;
703
704 /* If we need to, try and steal the drop */
705 if (kn->kn_status & KN_DROPPING) {
706 if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) {
707 kn->kn_status |= KN_STOLENDROP;
708 } else {
709 dropped = 1;
710 }
711 }
712
713 /* wakeup indicating if ANY USE stole the drop */
714 result = (kn->kn_status & KN_STOLENDROP) ?
715 THREAD_RESTART : THREAD_AWAKENED;
716
717 kn->kn_status &= ~KN_USEWAIT;
718 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
719 CAST_EVENT64_T(&kn->kn_status),
720 result,
721 WAITQ_ALL_PRIORITIES);
722 } else {
723 /* should have seen use-wait if dropping with use refs */
724 assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0);
725 }
726
727 } else if (kn->kn_status & KN_DROPPING) {
728 /* not the last ref but want to steal a drop if present */
729 if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) {
730 kn->kn_status |= KN_STOLENDROP;
731
732 /* but we now have to wait to be the last ref */
733 knoteusewait(kq, kn);
734 kqlock(kq);
735 } else {
736 dropped = 1;
737 }
738 }
739
740 return (!dropped);
741 }
742
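/*
 * Typical shape of a caller of the two conversions above (a sketch, not a
 * verbatim call site from this file): trade the kq spinlock for a use
 * reference so the filter can be called without holding the lock, then
 * trade back and re-check that the knote survived.
 *
 *	kqlock(kq);
 *	if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
 *		// kq is now unlocked; safe to call into the filter
 *		int result = knote_fops(kn)->f_event(kn, hint);
 *		// re-takes the kq lock; returns 0 if the knote was dropped
 *		if (!knoteuse2kqlock(kq, kn, KNUSE_NONE)) {
 *			kqunlock(kq);
 *			return;            // knote is gone, don't touch it
 *		}
 *		if (result)
 *			knote_activate(kn);
 *	}
 *	kqunlock(kq);
 */
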
743 /*
744 * Convert a kq lock to a knote use reference
745 * (for the purpose of detaching AND vanishing it).
746 *
747 * If the knote is being dropped, we can't get
748 * a detach reference, so wait for the knote to
749 * finish dropping before returning.
750 *
751 * If the knote is being used for other purposes,
752 * we cannot detach it until those uses are done
753 * as well. Again, just wait for them to finish
754 * (caller will start over at lookup).
755 *
756 * - kq locked at entry
757 * - unlocked on exit
758 */
759 static int
760 kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags)
761 {
762 if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) {
763 /* have to wait for dropper or current uses to go away */
764 knoteusewait(kq, kn);
765 return (0);
766 }
767 assert((kn->kn_status & KN_VANISHED) == 0);
768 assert(kn->kn_status & KN_ATTACHED);
769 kn->kn_status &= ~KN_ATTACHED;
770 kn->kn_status |= KN_VANISHED;
771 if (flags & KNUSE_BOOST) {
772 clear_thread_rwlock_boost();
773 }
774 kn->kn_inuse++;
775 kqunlock(kq);
776 return (1);
777 }
778
779 /*
780 * Convert a kq lock to a knote drop reference.
781 *
782 * If the knote is in use, wait for the use count
783 * to subside. We first mark our intention to drop
784 * it - keeping other users from "piling on."
785 * If we are too late, we have to wait for the
786 * other drop to complete.
787 *
788 * - kq locked at entry
789 * - always unlocked on exit.
790 * - caller can't hold any locks that would prevent
791 * the other dropper from completing.
792 */
793 static int
794 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
795 {
796 int oktodrop;
797 wait_result_t result;
798
799 oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
800 /* if another thread is attaching, they will become the dropping thread */
801 kn->kn_status |= KN_DROPPING;
802 knote_unsuppress(kn);
803 knote_dequeue(kn);
804 if (oktodrop) {
805 if (kn->kn_inuse == 0) {
806 kqunlock(kq);
807 return (oktodrop);
808 }
809 }
810 result = knoteusewait(kq, kn);
811 /* THREAD_RESTART == another thread stole the knote drop */
812 return (result == THREAD_AWAKENED);
813 }
814
815 #if 0
816 /*
817 * Release a knote use count reference.
818 */
819 static void
820 knote_put(struct knote *kn)
821 {
822 struct kqueue *kq = knote_get_kq(kn);
823
824 kqlock(kq);
825 if (--kn->kn_inuse == 0) {
826 if ((kn->kn_status & KN_USEWAIT) != 0) {
827 kn->kn_status &= ~KN_USEWAIT;
828 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
829 CAST_EVENT64_T(&kn->kn_status),
830 THREAD_AWAKENED,
831 WAITQ_ALL_PRIORITIES);
832 }
833 }
834 kqunlock(kq);
835 }
836 #endif
837
838 static int
839 filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
840 {
841 return (fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current()));
842 }
843
844 #define f_flag f_fglob->fg_flag
845 #define f_msgcount f_fglob->fg_msgcount
846 #define f_cred f_fglob->fg_cred
847 #define f_ops f_fglob->fg_ops
848 #define f_offset f_fglob->fg_offset
849 #define f_data f_fglob->fg_data
850
851 static void
852 filt_kqdetach(struct knote *kn)
853 {
854 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
855 struct kqueue *kq = &kqf->kqf_kqueue;
856
857 kqlock(kq);
858 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
859 kqunlock(kq);
860 }
861
862 /*ARGSUSED*/
863 static int
864 filt_kqueue(struct knote *kn, __unused long hint)
865 {
866 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
867 int count;
868
869 count = kq->kq_count;
870 return (count > 0);
871 }
872
873 static int
874 filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
875 {
876 #pragma unused(kev)
877 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
878 int res;
879
880 kqlock(kq);
881 kn->kn_data = kq->kq_count;
882 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
883 kn->kn_udata = kev->udata;
884 res = (kn->kn_data > 0);
885
886 kqunlock(kq);
887
888 return res;
889 }
890
891 static int
892 filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
893 {
894 #pragma unused(data)
895 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
896 int res;
897
898 kqlock(kq);
899 kn->kn_data = kq->kq_count;
900 res = (kn->kn_data > 0);
901 if (res) {
902 *kev = kn->kn_kevent;
903 if (kn->kn_flags & EV_CLEAR)
904 kn->kn_data = 0;
905 }
906 kqunlock(kq);
907
908 return res;
909 }
910
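/*
 * Illustrative user-space angle on the three routines above (a sketch;
 * error handling omitted): a kqueue file descriptor can itself be watched
 * with EVFILT_READ, and the data field reported back is kq_count, the
 * number of events pending on the inner queue.
 *
 *	#include <sys/event.h>
 *
 *	int inner = kqueue(), outer = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(outer, &kev, 1, NULL, 0, NULL);
 *	// once events are pending on 'inner', 'outer' becomes readable and
 *	// out.data tells how many 'inner' has ready
 *	kevent(outer, NULL, 0, &out, 1, NULL);
 */
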
911 #pragma mark EVFILT_PROC
912
913 static int
914 filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
915 {
916 struct proc *p;
917
918 assert(PID_MAX < NOTE_PDATAMASK);
919
920 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
921 kn->kn_flags = EV_ERROR;
922 kn->kn_data = ENOTSUP;
923 return 0;
924 }
925
926 p = proc_find(kn->kn_id);
927 if (p == NULL) {
928 kn->kn_flags = EV_ERROR;
929 kn->kn_data = ESRCH;
930 return 0;
931 }
932
933 const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
934
935 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
936 do {
937 pid_t selfpid = proc_selfpid();
938
939 if (p->p_ppid == selfpid)
940 break; /* parent => ok */
941
942 if ((p->p_lflag & P_LTRACED) != 0 &&
943 (p->p_oppid == selfpid))
944 break; /* parent-in-waiting => ok */
945
946 proc_rele(p);
947 kn->kn_flags = EV_ERROR;
948 kn->kn_data = EACCES;
949 return 0;
950 } while (0);
951
952 proc_klist_lock();
953
954 kn->kn_ptr.p_proc = p; /* store the proc handle */
955
956 KNOTE_ATTACH(&p->p_klist, kn);
957
958 proc_klist_unlock();
959
960 proc_rele(p);
961
962 /*
963 * only captures edge-triggered events after this point
964 * so it can't already be fired.
965 */
966 return (0);
967 }
968
969
970 /*
971 * The knote may be attached to a different process, which may exit,
972 * leaving nothing for the knote to be attached to. In that case,
973 * the pointer to the process will have already been nulled out.
974 */
975 static void
976 filt_procdetach(struct knote *kn)
977 {
978 struct proc *p;
979
980 proc_klist_lock();
981
982 p = kn->kn_ptr.p_proc;
983 if (p != PROC_NULL) {
984 kn->kn_ptr.p_proc = PROC_NULL;
985 KNOTE_DETACH(&p->p_klist, kn);
986 }
987
988 proc_klist_unlock();
989 }
990
991 static int
992 filt_proc(struct knote *kn, long hint)
993 {
994 u_int event;
995
996 /* ALWAYS CALLED WITH proc_klist_lock */
997
998 /*
999 * Note: a lot of bits in hint may be obtained from the knote
1000 * To free some of those bits, see <rdar://problem/12592988> Freeing up
1001 * bits in hint for filt_proc
1002 *
1003 * mask off extra data
1004 */
1005 event = (u_int)hint & NOTE_PCTRLMASK;
1006
1007 /*
1008 * termination lifecycle events can happen while a debugger
1009 * has reparented a process, in which case notifications
1010 * should be quashed except to the tracing parent. When
1011 * the debugger reaps the child (either via wait4(2) or
1012 * process exit), the child will be reparented to the original
1013 * parent and these knotes re-fired.
1014 */
1015 if (event & NOTE_EXIT) {
1016 if ((kn->kn_ptr.p_proc->p_oppid != 0)
1017 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
1018 /*
1019 * This knote is not for the current ptrace(2) parent, ignore.
1020 */
1021 return 0;
1022 }
1023 }
1024
1025 /*
1026 * if the user is interested in this event, record it.
1027 */
1028 if (kn->kn_sfflags & event)
1029 kn->kn_fflags |= event;
1030
1031 #pragma clang diagnostic push
1032 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
1033 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
1034 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1035 }
1036 #pragma clang diagnostic pop
1037
1038
1039 /*
1040 * The kernel has a wrapper in place that returns the same data
1041 * as is collected here, in kn_data. Any changes to how
1042 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
1043 * should also be reflected in the proc_pidnoteexit() wrapper.
1044 */
1045 if (event == NOTE_EXIT) {
1046 kn->kn_data = 0;
1047 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
1048 kn->kn_fflags |= NOTE_EXITSTATUS;
1049 kn->kn_data |= (hint & NOTE_PDATAMASK);
1050 }
1051 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
1052 kn->kn_fflags |= NOTE_EXIT_DETAIL;
1053 if ((kn->kn_ptr.p_proc->p_lflag &
1054 P_LTERM_DECRYPTFAIL) != 0) {
1055 kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
1056 }
1057 if ((kn->kn_ptr.p_proc->p_lflag &
1058 P_LTERM_JETSAM) != 0) {
1059 kn->kn_data |= NOTE_EXIT_MEMORY;
1060 switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
1061 case P_JETSAM_VMPAGESHORTAGE:
1062 kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
1063 break;
1064 case P_JETSAM_VMTHRASHING:
1065 kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
1066 break;
1067 case P_JETSAM_FCTHRASHING:
1068 kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
1069 break;
1070 case P_JETSAM_VNODE:
1071 kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
1072 break;
1073 case P_JETSAM_HIWAT:
1074 kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
1075 break;
1076 case P_JETSAM_PID:
1077 kn->kn_data |= NOTE_EXIT_MEMORY_PID;
1078 break;
1079 case P_JETSAM_IDLEEXIT:
1080 kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
1081 break;
1082 }
1083 }
1084 if ((kn->kn_ptr.p_proc->p_csflags &
1085 CS_KILLED) != 0) {
1086 kn->kn_data |= NOTE_EXIT_CSERROR;
1087 }
1088 }
1089 }
1090
1091 /* if we have any matching state, activate the knote */
1092 return (kn->kn_fflags != 0);
1093 }
1094
1095 static int
1096 filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
1097 {
1098 int res;
1099
1100 proc_klist_lock();
1101
1102 	/* accept new filter flags and mask off output events no longer interesting */
1103 kn->kn_sfflags = kev->fflags;
1104 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1105 kn->kn_udata = kev->udata;
1106
1107 /* restrict the current results to the (smaller?) set of new interest */
1108 /*
1109 * For compatibility with previous implementations, we leave kn_fflags
1110 * as they were before.
1111 */
1112 //kn->kn_fflags &= kn->kn_sfflags;
1113
1114 res = (kn->kn_fflags != 0);
1115
1116 proc_klist_unlock();
1117
1118 return res;
1119 }
1120
1121 static int
1122 filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
1123 {
1124 #pragma unused(data)
1125 int res;
1126
1127 proc_klist_lock();
1128 res = (kn->kn_fflags != 0);
1129 if (res) {
1130 *kev = kn->kn_kevent;
1131 kn->kn_flags |= EV_CLEAR; /* automatically set */
1132 kn->kn_fflags = 0;
1133 kn->kn_data = 0;
1134 }
1135 proc_klist_unlock();
1136 return res;
1137 }
1138
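/*
 * Illustrative user-space registration for this filter (a sketch; error
 * handling omitted, with pid standing for the child's process id).  Watches
 * a child process for exit and retrieves its exit status; per
 * filt_procattach above, NOTE_EXITSTATUS is only allowed on the caller's
 * own (or ptrace-reparented) children.
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, (uintptr_t)pid, EVFILT_PROC, EV_ADD | EV_ONESHOT,
 *	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);    // register
 *	kevent(kq, NULL, 0, &out, 1, NULL);    // blocks until the process exits
 *	// out.fflags contains NOTE_EXIT|NOTE_EXITSTATUS and out.data holds
 *	// the wait(2)-style status, as assembled in filt_proc above
 */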
1139
1140 #pragma mark EVFILT_TIMER
1141
1142
1143 /*
1144 * Values stored in the knote at rest (using Mach absolute time units)
1145 *
1146 * kn->kn_hook where the thread_call object is stored
1147 * kn->kn_ext[0] next deadline or 0 if immediate expiration
1148 * kn->kn_ext[1] leeway value
1149 * kn->kn_sdata interval timer: the interval
1150 * absolute/deadline timer: 0
1151 * kn->kn_data fire count
1152 */
1153
1154 static lck_mtx_t _filt_timerlock;
1155
1156 static void filt_timerlock(void) { lck_mtx_lock(&_filt_timerlock); }
1157 static void filt_timerunlock(void) { lck_mtx_unlock(&_filt_timerlock); }
1158
1159 static inline void filt_timer_assert_locked(void)
1160 {
1161 LCK_MTX_ASSERT(&_filt_timerlock, LCK_MTX_ASSERT_OWNED);
1162 }
1163
1164 /* state flags stored in kn_hookid */
1165 #define TIMER_RUNNING 0x1
1166 #define TIMER_CANCELWAIT 0x2
1167
1168 /*
1169 * filt_timervalidate - process data from user
1170 *
1171 * Sets up the deadline, interval, and leeway from the provided user data
1172 *
1173 * Input:
1174 * kn_sdata timer deadline or interval time
1175 * kn_sfflags style of timer, unit of measurement
1176 *
1177 * Output:
1178 * kn_sdata either interval in abstime or 0 if non-repeating timer
1179 * ext[0] fire deadline in abs/cont time
1180 * (or 0 if NOTE_ABSOLUTE and deadline is in past)
1181 *
1182 * Returns:
1183 * EINVAL Invalid user data parameters
1184 *
1185 * Called with timer filter lock held.
1186 */
1187 static int
1188 filt_timervalidate(struct knote *kn)
1189 {
1190 /*
1191 	 * There are 5 knobs that need to be chosen for a timer registration:
1192 *
1193 * A) Units of time (what is the time duration of the specified number)
1194 * Absolute and interval take:
1195 * NOTE_SECONDS, NOTE_USECONDS, NOTE_NSECONDS, NOTE_MACHTIME
1196 * Defaults to milliseconds if not specified
1197 *
1198 * B) Clock epoch (what is the zero point of the specified number)
1199 * For interval, there is none
1200 * For absolute, defaults to the gettimeofday/calendar epoch
1201 * With NOTE_MACHTIME, uses mach_absolute_time()
1202 * With NOTE_MACHTIME and NOTE_MACH_CONTINUOUS_TIME, uses mach_continuous_time()
1203 *
1204 * C) The knote's behavior on delivery
1205 * Interval timer causes the knote to arm for the next interval unless one-shot is set
1206 * Absolute is a forced one-shot timer which deletes on delivery
1207 * TODO: Add a way for absolute to be not forced one-shot
1208 *
1209 * D) Whether the time duration is relative to now or absolute
1210 * Interval fires at now + duration when it is set up
1211 	 * Absolute fires at now + (passed-in wall-clock time - current wall-clock time)
1212 * With NOTE_MACHTIME it fires at an absolute MAT or MCT.
1213 *
1214 * E) Whether the timer continues to tick across sleep
1215 * By default all three do not.
1216 * For interval and absolute, NOTE_MACH_CONTINUOUS_TIME causes them to tick across sleep
1217 * With NOTE_ABSOLUTE | NOTE_MACHTIME | NOTE_MACH_CONTINUOUS_TIME:
1218 * expires when mach_continuous_time() is > the passed in value.
1219 */
1220
1221 filt_timer_assert_locked();
1222
1223 uint64_t multiplier;
1224
1225 boolean_t use_abstime = FALSE;
1226
1227 switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) {
1228 case NOTE_SECONDS:
1229 multiplier = NSEC_PER_SEC;
1230 break;
1231 case NOTE_USECONDS:
1232 multiplier = NSEC_PER_USEC;
1233 break;
1234 case NOTE_NSECONDS:
1235 multiplier = 1;
1236 break;
1237 case NOTE_MACHTIME:
1238 multiplier = 0;
1239 use_abstime = TRUE;
1240 break;
1241 case 0: /* milliseconds (default) */
1242 multiplier = NSEC_PER_SEC / 1000;
1243 break;
1244 default:
1245 return (EINVAL);
1246 }
1247
1248 /* transform the leeway in kn_ext[1] to same time scale */
1249 if (kn->kn_sfflags & NOTE_LEEWAY) {
1250 uint64_t leeway_abs;
1251
1252 if (use_abstime) {
1253 leeway_abs = (uint64_t)kn->kn_ext[1];
1254 } else {
1255 uint64_t leeway_ns;
1256 if (os_mul_overflow((uint64_t)kn->kn_ext[1], multiplier, &leeway_ns))
1257 return (ERANGE);
1258
1259 nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
1260 }
1261
1262 kn->kn_ext[1] = leeway_abs;
1263 }
1264
1265 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1266 uint64_t deadline_abs;
1267
1268 if (use_abstime) {
1269 deadline_abs = (uint64_t)kn->kn_sdata;
1270 } else {
1271 uint64_t calendar_deadline_ns;
1272
1273 if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &calendar_deadline_ns))
1274 return (ERANGE);
1275
1276 /* calendar_deadline_ns is in nanoseconds since the epoch */
1277
1278 clock_sec_t seconds;
1279 clock_nsec_t nanoseconds;
1280
1281 /*
1282 * Note that the conversion through wall-time is only done once.
1283 *
1284 * If the relationship between MAT and gettimeofday changes,
1285 * the underlying timer does not update.
1286 *
1287 * TODO: build a wall-time denominated timer_call queue
1288 * and a flag to request DTRTing with wall-time timers
1289 */
1290 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1291
1292 uint64_t calendar_now_ns = (uint64_t)seconds * NSEC_PER_SEC + nanoseconds;
1293
1294 /* if deadline is in the future */
1295 if (calendar_now_ns < calendar_deadline_ns) {
1296 uint64_t interval_ns = calendar_deadline_ns - calendar_now_ns;
1297 uint64_t interval_abs;
1298
1299 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1300
1301 /*
1302 * Note that the NOTE_MACH_CONTINUOUS_TIME flag here only
1303 * causes the timer to keep ticking across sleep, but
1304 * it does not change the calendar timebase.
1305 */
1306
1307 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1308 clock_continuoustime_interval_to_deadline(interval_abs,
1309 &deadline_abs);
1310 else
1311 clock_absolutetime_interval_to_deadline(interval_abs,
1312 &deadline_abs);
1313 } else {
1314 deadline_abs = 0; /* cause immediate expiration */
1315 }
1316 }
1317
1318 kn->kn_ext[0] = deadline_abs;
1319 kn->kn_sdata = 0; /* NOTE_ABSOLUTE is non-repeating */
1320 } else if (kn->kn_sdata < 0) {
1321 /*
1322 * Negative interval timers fire immediately, once.
1323 *
1324 * Ideally a negative interval would be an error, but certain clients
1325 		 * pass negative values by accident, and expect an event back.
1326 *
1327 * In the old implementation the timer would repeat with no delay
1328 * N times until mach_absolute_time() + (N * interval) underflowed,
1329 * then it would wait ~forever by accidentally arming a timer for the far future.
1330 *
1331 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
1332 */
1333
1334 kn->kn_sdata = 0; /* non-repeating */
1335 kn->kn_ext[0] = 0; /* expire immediately */
1336 } else {
1337 uint64_t interval_abs = 0;
1338
1339 if (use_abstime) {
1340 interval_abs = (uint64_t)kn->kn_sdata;
1341 } else {
1342 uint64_t interval_ns;
1343 if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &interval_ns))
1344 return (ERANGE);
1345
1346 nanoseconds_to_absolutetime(interval_ns, &interval_abs);
1347 }
1348
1349 uint64_t deadline = 0;
1350
1351 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1352 clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
1353 else
1354 clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
1355
1356 kn->kn_sdata = interval_abs; /* default to a repeating timer */
1357 kn->kn_ext[0] = deadline;
1358 }
1359
1360 return (0);
1361 }
1362
1363
1364
1365
1366 /*
1367 * filt_timerexpire - the timer callout routine
1368 *
1369 * Just propagate the timer event into the knote
1370 * filter routine (by going through the knote
1371 * synchronization point). Pass a hint to
1372 * indicate this is a real event, not just a
1373 * query from above.
1374 */
1375 static void
1376 filt_timerexpire(void *knx, __unused void *spare)
1377 {
1378 struct klist timer_list;
1379 struct knote *kn = knx;
1380
1381 filt_timerlock();
1382
1383 kn->kn_hookid &= ~TIMER_RUNNING;
1384
1385 /* no "object" for timers, so fake a list */
1386 SLIST_INIT(&timer_list);
1387 SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
1388
1389 KNOTE(&timer_list, 1);
1390
1391 /* if someone is waiting for timer to pop */
1392 if (kn->kn_hookid & TIMER_CANCELWAIT) {
1393 struct kqueue *kq = knote_get_kq(kn);
1394 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
1395 CAST_EVENT64_T(&kn->kn_hook),
1396 THREAD_AWAKENED,
1397 WAITQ_ALL_PRIORITIES);
1398
1399 kn->kn_hookid &= ~TIMER_CANCELWAIT;
1400 }
1401
1402 filt_timerunlock();
1403 }
1404
1405 /*
1406 * Cancel a running timer (or wait for the pop).
1407 * Timer filter lock is held.
1408 * May drop and retake the timer filter lock.
1409 */
1410 static void
1411 filt_timercancel(struct knote *kn)
1412 {
1413 filt_timer_assert_locked();
1414
1415 assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
1416
1417 /* if no timer, then we're good */
1418 if ((kn->kn_hookid & TIMER_RUNNING) == 0)
1419 return;
1420
1421 thread_call_t callout = (thread_call_t)kn->kn_hook;
1422
1423 /* cancel the callout if we can */
1424 if (thread_call_cancel(callout)) {
1425 kn->kn_hookid &= ~TIMER_RUNNING;
1426 return;
1427 }
1428
1429 /* cancel failed, we have to wait for the in-flight expire routine */
1430
1431 kn->kn_hookid |= TIMER_CANCELWAIT;
1432
1433 struct kqueue *kq = knote_get_kq(kn);
1434
1435 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
1436 CAST_EVENT64_T(&kn->kn_hook),
1437 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1438
1439 filt_timerunlock();
1440 thread_block(THREAD_CONTINUE_NULL);
1441 filt_timerlock();
1442
1443 assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
1444 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1445 }
1446
1447 static void
1448 filt_timerarm(struct knote *kn)
1449 {
1450 filt_timer_assert_locked();
1451
1452 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1453
1454 thread_call_t callout = (thread_call_t)kn->kn_hook;
1455
1456 uint64_t deadline = kn->kn_ext[0];
1457 uint64_t leeway = kn->kn_ext[1];
1458
1459 int filter_flags = kn->kn_sfflags;
1460 unsigned int timer_flags = 0;
1461
1462 if (filter_flags & NOTE_CRITICAL)
1463 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1464 else if (filter_flags & NOTE_BACKGROUND)
1465 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1466 else
1467 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1468
1469 if (filter_flags & NOTE_LEEWAY)
1470 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1471
1472 if (filter_flags & NOTE_MACH_CONTINUOUS_TIME)
1473 timer_flags |= THREAD_CALL_CONTINUOUS;
1474
1475 thread_call_enter_delayed_with_leeway(callout, NULL,
1476 deadline, leeway,
1477 timer_flags);
1478
1479 kn->kn_hookid |= TIMER_RUNNING;
1480 }
1481
1482 /*
1483 * Does this knote need a timer armed for it, or should it be ready immediately?
1484 */
1485 static boolean_t
1486 filt_timer_is_ready(struct knote *kn)
1487 {
1488 uint64_t now;
1489
1490 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1491 now = mach_continuous_time();
1492 else
1493 now = mach_absolute_time();
1494
1495 uint64_t deadline = kn->kn_ext[0];
1496
1497 if (deadline < now)
1498 return TRUE;
1499 else
1500 return FALSE;
1501 }
1502
1503 /*
1504 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1505 */
1506 static int
1507 filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev)
1508 {
1509 thread_call_t callout;
1510 int error;
1511
1512 callout = thread_call_allocate_with_options(filt_timerexpire,
1513 (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
1514 THREAD_CALL_OPTIONS_ONCE);
1515
1516 if (NULL == callout) {
1517 kn->kn_flags = EV_ERROR;
1518 kn->kn_data = ENOMEM;
1519 return 0;
1520 }
1521
1522 filt_timerlock();
1523
1524 if ((error = filt_timervalidate(kn)) != 0) {
1525 kn->kn_flags = EV_ERROR;
1526 kn->kn_data = error;
1527 filt_timerunlock();
1528
1529 __assert_only boolean_t freed = thread_call_free(callout);
1530 assert(freed);
1531 return 0;
1532 }
1533
1534 kn->kn_hook = (void*)callout;
1535 kn->kn_hookid = 0;
1536 kn->kn_flags |= EV_CLEAR;
1537
1538 /* NOTE_ABSOLUTE implies EV_ONESHOT */
1539 if (kn->kn_sfflags & NOTE_ABSOLUTE)
1540 kn->kn_flags |= EV_ONESHOT;
1541
1542 boolean_t timer_ready = FALSE;
1543
1544 if ((timer_ready = filt_timer_is_ready(kn))) {
1545 /* cause immediate expiration */
1546 kn->kn_data = 1;
1547 } else {
1548 filt_timerarm(kn);
1549 }
1550
1551 filt_timerunlock();
1552
1553 return timer_ready;
1554 }
1555
1556 /*
1557 * Shut down the timer if it's running, and free the callout.
1558 */
1559 static void
1560 filt_timerdetach(struct knote *kn)
1561 {
1562 thread_call_t callout;
1563
1564 filt_timerlock();
1565
1566 callout = (thread_call_t)kn->kn_hook;
1567 filt_timercancel(kn);
1568
1569 filt_timerunlock();
1570
1571 __assert_only boolean_t freed = thread_call_free(callout);
1572 assert(freed);
1573 }
1574
1575 /*
1576 * filt_timerevent - post events to a timer knote
1577 *
1578 * Called in the context of filt_timerexpire with
1579 * the filt_timerlock held
1580 */
1581 static int
1582 filt_timerevent(struct knote *kn, __unused long hint)
1583 {
1584 filt_timer_assert_locked();
1585
1586 kn->kn_data = 1;
1587 return (1);
1588 }
1589
1590 /*
1591 * filt_timertouch - update timer knote with new user input
1592 *
1593 * Cancel and restart the timer based on new user data. When
1594 * the user picks up a knote, clear the count of how many timer
1595 * pops have gone off (in kn_data).
1596 */
1597 static int
1598 filt_timertouch(
1599 struct knote *kn,
1600 struct kevent_internal_s *kev)
1601 {
1602 int error;
1603
1604 filt_timerlock();
1605
1606 /*
1607 * cancel current call - drops and retakes lock
1608 * TODO: not safe against concurrent touches?
1609 */
1610 filt_timercancel(kn);
1611
1612 /* clear if the timer had previously fired, the user no longer wants to see it */
1613 kn->kn_data = 0;
1614
1615 /* capture the new values used to compute deadline */
1616 kn->kn_sdata = kev->data;
1617 kn->kn_sfflags = kev->fflags;
1618 kn->kn_ext[0] = kev->ext[0];
1619 kn->kn_ext[1] = kev->ext[1];
1620
1621 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1622 kn->kn_udata = kev->udata;
1623
1624 /* recalculate deadline */
1625 error = filt_timervalidate(kn);
1626 if (error) {
1627 /* no way to report error, so mark it in the knote */
1628 kn->kn_flags |= EV_ERROR;
1629 kn->kn_data = error;
1630 filt_timerunlock();
1631 return 1;
1632 }
1633
1634 boolean_t timer_ready = FALSE;
1635
1636 if ((timer_ready = filt_timer_is_ready(kn))) {
1637 /* cause immediate expiration */
1638 kn->kn_data = 1;
1639 } else {
1640 filt_timerarm(kn);
1641 }
1642
1643 filt_timerunlock();
1644
1645 return timer_ready;
1646 }
1647
1648 /*
1649 * filt_timerprocess - query state of knote and snapshot event data
1650 *
1651 * Determine if the timer has fired in the past, snapshot the state
1652 * of the kevent for returning to user-space, and clear pending event
1653 * counters for the next time.
1654 */
1655 static int
1656 filt_timerprocess(
1657 struct knote *kn,
1658 __unused struct filt_process_s *data,
1659 struct kevent_internal_s *kev)
1660 {
1661 filt_timerlock();
1662
1663 if (kn->kn_data == 0 || (kn->kn_hookid & TIMER_CANCELWAIT)) {
1664 /*
1665 * kn_data = 0:
1666 * The timer hasn't yet fired, so there's nothing to deliver
1667 * TIMER_CANCELWAIT:
1668 * touch is in the middle of canceling the timer,
1669 * so don't deliver or re-arm anything
1670 *
1671 * This can happen if a touch resets a timer that had fired
1672 * without being processed
1673 */
1674 filt_timerunlock();
1675 return 0;
1676 }
1677
1678 if (kn->kn_sdata != 0 && ((kn->kn_flags & EV_ERROR) == 0)) {
1679 /*
1680 * This is a 'repeating' timer, so we have to emit
1681 * how many intervals expired between the arm
1682 * and the process.
1683 *
1684 * A very strange style of interface, because
1685 * this could easily be done in the client...
1686 */
1687
1688 		/* The timer had better have expired... */
1689 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1690
1691 uint64_t now;
1692
1693 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1694 now = mach_continuous_time();
1695 else
1696 now = mach_absolute_time();
1697
1698 uint64_t first_deadline = kn->kn_ext[0];
1699 uint64_t interval_abs = kn->kn_sdata;
1700 uint64_t orig_arm_time = first_deadline - interval_abs;
1701
1702 assert(now > orig_arm_time);
1703 assert(now > first_deadline);
1704
1705 uint64_t elapsed = now - orig_arm_time;
1706
1707 uint64_t num_fired = elapsed / interval_abs;
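		/*
		 * Worked example with illustrative numbers: a 100ms repeating
		 * timer armed at time T has first_deadline = T + 100ms.  If it
		 * is processed at now = T + 350ms, then elapsed = 350ms,
		 * num_fired = 3, and the re-arm below computes
		 * new_deadline = T + 400ms.
		 */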
1708
1709 /*
1710 * To reach this code, we must have seen the timer pop
1711 * and be in repeating mode, so therefore it must have been
1712 * more than 'interval' time since the attach or last
1713 * successful touch.
1714 *
1715 * An unsuccessful touch would:
1716 * disarm the timer
1717 * clear kn_data
1718 * clear kn_sdata
1719 * set EV_ERROR
1720 * all of which will prevent this code from running.
1721 */
1722 assert(num_fired > 0);
1723
1724 /* report how many intervals have elapsed to the user */
1725 kn->kn_data = (int64_t) num_fired;
1726
1727 /* We only need to re-arm the timer if it's not about to be destroyed */
1728 if ((kn->kn_flags & EV_ONESHOT) == 0) {
1729 /* fire at the end of the next interval */
1730 uint64_t new_deadline = first_deadline + num_fired * interval_abs;
1731
1732 assert(new_deadline > now);
1733
1734 kn->kn_ext[0] = new_deadline;
1735
1736 filt_timerarm(kn);
1737 }
1738 }
1739
1740 /*
1741 * Copy out the interesting kevent state,
1742 * but don't leak out the raw time calculations.
1743 *
1744 * TODO: potential enhancements - tell the user about:
1745 * - deadline to which this timer thought it was expiring
1746 * - return kn_sfflags in the fflags field so the client can know
1747 * under what flags the timer fired
1748 */
1749 *kev = kn->kn_kevent;
1750 kev->ext[0] = 0;
1751 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1752
1753 /* we have delivered the event, reset the timer pop count */
1754 kn->kn_data = 0;
1755
1756 filt_timerunlock();
1757 return 1;
1758 }
1759
1760 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
1761 .f_attach = filt_timerattach,
1762 .f_detach = filt_timerdetach,
1763 .f_event = filt_timerevent,
1764 .f_touch = filt_timertouch,
1765 .f_process = filt_timerprocess,
1766 };
1767
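/*
 * Illustrative user-space use of this filter (a sketch; error handling
 * omitted).  Arms a repeating 5-second timer and waits for it to fire:
 *
 *	#include <sys/event.h>
 *
 *	int kq = kqueue();
 *	struct kevent kev, out;
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);    // register; filt_timerattach runs
 *	kevent(kq, NULL, 0, &out, 1, NULL);    // returns after ~5s; out.data is
 *	                                       // the pop count since last read
 *
 * With NOTE_ABSOLUTE the data field is a deadline rather than an interval
 * and the event is forced EV_ONESHOT, as set up in filt_timerattach above.
 */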
1768
1769 #pragma mark EVFILT_USER
1770
1771
1772 static void
1773 filt_userlock(void)
1774 {
1775 lck_spin_lock(&_filt_userlock);
1776 }
1777
1778 static void
1779 filt_userunlock(void)
1780 {
1781 lck_spin_unlock(&_filt_userlock);
1782 }
1783
1784 static int
1785 filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
1786 {
1787 /* EVFILT_USER knotes are not attached to anything in the kernel */
1788 	/* Can't discover this knote until after attach - so no lock needed */
1789 kn->kn_hook = NULL;
1790 if (kn->kn_sfflags & NOTE_TRIGGER) {
1791 kn->kn_hookid = 1;
1792 } else {
1793 kn->kn_hookid = 0;
1794 }
1795 return (kn->kn_hookid);
1796 }
1797
1798 static void
1799 filt_userdetach(__unused struct knote *kn)
1800 {
1801 /* EVFILT_USER knotes are not attached to anything in the kernel */
1802 }
1803
1804 static int
1805 filt_user(
1806 __unused struct knote *kn,
1807 __unused long hint)
1808 {
1809 panic("filt_user");
1810 return 0;
1811 }
1812
1813 static int
1814 filt_usertouch(
1815 struct knote *kn,
1816 struct kevent_internal_s *kev)
1817 {
1818 uint32_t ffctrl;
1819 int fflags;
1820 int active;
1821
1822 filt_userlock();
1823
1824 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1825 fflags = kev->fflags & NOTE_FFLAGSMASK;
1826 switch (ffctrl) {
1827 case NOTE_FFNOP:
1828 break;
1829 case NOTE_FFAND:
1830 kn->kn_sfflags &= fflags;
1831 break;
1832 case NOTE_FFOR:
1833 kn->kn_sfflags |= fflags;
1834 break;
1835 case NOTE_FFCOPY:
1836 kn->kn_sfflags = fflags;
1837 break;
1838 }
1839 kn->kn_sdata = kev->data;
1840
1841 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1842 kn->kn_udata = kev->udata;
1843
1844 if (kev->fflags & NOTE_TRIGGER) {
1845 kn->kn_hookid = 1;
1846 }
1847 active = kn->kn_hookid;
1848
1849 filt_userunlock();
1850
1851 return (active);
1852 }
1853
1854 static int
1855 filt_userprocess(
1856 struct knote *kn,
1857 __unused struct filt_process_s *data,
1858 struct kevent_internal_s *kev)
1859 {
1860 filt_userlock();
1861
1862 if (kn->kn_hookid == 0) {
1863 filt_userunlock();
1864 return 0;
1865 }
1866
1867 *kev = kn->kn_kevent;
1868 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1869 kev->data = kn->kn_sdata;
1870 if (kn->kn_flags & EV_CLEAR) {
1871 kn->kn_hookid = 0;
1872 kn->kn_data = 0;
1873 kn->kn_fflags = 0;
1874 }
1875 filt_userunlock();
1876
1877 return 1;
1878 }
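/*
 * Illustrative userspace sketch (not part of the kernel build): an
 * EVFILT_USER knote is registered once and later fired with
 * NOTE_TRIGGER; the fflags arithmetic follows the NOTE_FFCTRLMASK
 * handling in filt_usertouch() above.  Values are examples only.
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *	EV_SET(&kev, 42, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register, not yet active
 *	EV_SET(&kev, 42, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// OR in 0x1 and activate
 */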
1879
1880 #pragma mark EVFILT_WORKLOOP
1881
1882 #if DEBUG || DEVELOPMENT
1883 /*
1884 * see src/queue_internal.h in libdispatch
1885 */
1886 #define DISPATCH_QUEUE_ENQUEUED 0x1ull
1887 #endif
1888
1889 static inline void
1890 filt_wllock(struct kqworkloop *kqwl)
1891 {
1892 lck_mtx_lock(&kqwl->kqwl_statelock);
1893 }
1894
1895 static inline void
1896 filt_wlunlock(struct kqworkloop *kqwl)
1897 {
1898 lck_mtx_unlock(&kqwl->kqwl_statelock);
1899 }
1900
1901 static inline void
1902 filt_wlheld(__assert_only struct kqworkloop *kqwl)
1903 {
1904 LCK_MTX_ASSERT(&kqwl->kqwl_statelock, LCK_MTX_ASSERT_OWNED);
1905 }
1906
1907 #define WL_OWNER_SUSPENDED ((thread_t)(~0ull)) /* special owner when suspended */
1908
1909 static inline bool
1910 filt_wlowner_is_valid(thread_t owner)
1911 {
1912 return owner != THREAD_NULL && owner != WL_OWNER_SUSPENDED;
1913 }
1914
1915 static inline bool
1916 filt_wlshould_end_ownership(struct kqworkloop *kqwl,
1917 struct kevent_internal_s *kev, int error)
1918 {
1919 thread_t owner = kqwl->kqwl_owner;
1920 return (error == 0 || error == ESTALE) &&
1921 (kev->fflags & NOTE_WL_END_OWNERSHIP) &&
1922 (owner == current_thread() || owner == WL_OWNER_SUSPENDED);
1923 }
1924
1925 static inline bool
1926 filt_wlshould_update_ownership(struct kevent_internal_s *kev, int error)
1927 {
1928 return error == 0 && (kev->fflags & NOTE_WL_DISCOVER_OWNER) &&
1929 kev->ext[EV_EXTIDX_WL_ADDR];
1930 }
1931
1932 static inline bool
1933 filt_wlshould_set_async_qos(struct kevent_internal_s *kev, int error,
1934 kq_index_t async_qos)
1935 {
1936 if (error != 0) {
1937 return false;
1938 }
1939 if (async_qos != THREAD_QOS_UNSPECIFIED) {
1940 return true;
1941 }
1942 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
1943 /* see filt_wlprocess() */
1944 return true;
1945 }
1946 return false;
1947 }
1948
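/*
 * filt_wlupdateowner - apply the ownership side effects of a workloop kevent
 *
 * Ends or discovers ownership as requested by the kevent fflags, moves
 * any IPC overrides from the previous owner to the new one, and redrives
 * the thread request when needed.  Returns the error it was passed, or
 * EOWNERDEAD if the discovered owner port no longer names a thread.
 * Called with the filt_wllock held.
 */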
1949 __result_use_check
1950 static int
1951 filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev,
1952 int error, kq_index_t async_qos)
1953 {
1954 struct kqrequest *kqr = &kqwl->kqwl_request;
1955 thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
1956 kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
1957 kq_index_t old_owner_override = THREAD_QOS_UNSPECIFIED;
1958 boolean_t ipc_override_is_sync = false;
1959 boolean_t old_owner_override_is_sync = false;
1960 int action = KQWL_UTQ_NONE;
1961
1962 filt_wlheld(kqwl);
1963
1964 /*
1965 * The owner is only changed under both the filt_wllock and the
1966 * kqwl_req_lock. Looking at it with either one held is fine.
1967 */
1968 cur_owner = kqwl->kqwl_owner;
1969 if (filt_wlshould_end_ownership(kqwl, kev, error)) {
1970 new_owner = THREAD_NULL;
1971 } else if (filt_wlshould_update_ownership(kev, error)) {
1972 /*
1973 * Decipher the owner port name, and translate accordingly.
1974 * The low 2 bits were borrowed for other flags, so mask them off.
1975 */
1976 uint64_t udata = kev->ext[EV_EXTIDX_WL_VALUE];
1977 mach_port_name_t new_owner_name = (mach_port_name_t)udata & ~0x3;
1978 if (new_owner_name != MACH_PORT_NULL) {
1979 new_owner_name = ipc_entry_name_mask(new_owner_name);
1980 }
1981
1982 if (MACH_PORT_VALID(new_owner_name)) {
1983 new_owner = port_name_to_thread(new_owner_name);
1984 if (new_owner == THREAD_NULL)
1985 return EOWNERDEAD;
1986 extra_thread_ref = new_owner;
1987 } else if (new_owner_name == MACH_PORT_DEAD) {
1988 new_owner = WL_OWNER_SUSPENDED;
1989 } else {
1990 /*
1991 * We never want to learn a new owner that is NULL.
1992 * Ownership should be ended with END_OWNERSHIP.
1993 */
1994 new_owner = cur_owner;
1995 }
1996 } else {
1997 new_owner = cur_owner;
1998 }
1999
2000 if (filt_wlshould_set_async_qos(kev, error, async_qos)) {
2001 action = KQWL_UTQ_SET_ASYNC_QOS;
2002 }
2003 if (cur_owner == new_owner && action == KQWL_UTQ_NONE) {
2004 goto out;
2005 }
2006
2007 kqwl_req_lock(kqwl);
2008
2009 /* If already tracked as servicer, don't track as owner */
2010 if ((kqr->kqr_state & KQR_BOUND) && new_owner == kqr->kqr_thread) {
2011 kqwl->kqwl_owner = new_owner = THREAD_NULL;
2012 }
2013
2014 if (cur_owner != new_owner) {
2015 kqwl->kqwl_owner = new_owner;
2016 if (new_owner == extra_thread_ref) {
2017 /* we just transferred this ref to kqwl_owner */
2018 extra_thread_ref = THREAD_NULL;
2019 }
2020 cur_override = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
2021 old_owner_override = kqr->kqr_dsync_owner_qos;
2022 old_owner_override_is_sync = kqr->kqr_owner_override_is_sync;
2023
2024 if (filt_wlowner_is_valid(new_owner)) {
2025 /* override it before we drop the old */
2026 if (cur_override != THREAD_QOS_UNSPECIFIED) {
2027 thread_add_ipc_override(new_owner, cur_override);
2028 }
2029 if (ipc_override_is_sync) {
2030 thread_add_sync_ipc_override(new_owner);
2031 }
2032 /* Update the kqr to indicate that owner has sync ipc override */
2033 kqr->kqr_dsync_owner_qos = cur_override;
2034 kqr->kqr_owner_override_is_sync = ipc_override_is_sync;
2035 thread_starts_owning_workloop(new_owner);
2036 if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED) {
2037 if (action == KQWL_UTQ_NONE) {
2038 action = KQWL_UTQ_REDRIVE_EVENTS;
2039 }
2040 }
2041 } else if (new_owner == THREAD_NULL) {
2042 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
2043 kqr->kqr_owner_override_is_sync = false;
2044 if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
2045 if (action == KQWL_UTQ_NONE) {
2046 action = KQWL_UTQ_REDRIVE_EVENTS;
2047 }
2048 }
2049 }
2050 }
2051
2052 if (action != KQWL_UTQ_NONE) {
2053 kqworkloop_update_threads_qos(kqwl, action, async_qos);
2054 }
2055
2056 kqwl_req_unlock(kqwl);
2057
2058 /* Now that we are unlocked, drop the override and ref on old owner */
2059 if (new_owner != cur_owner && filt_wlowner_is_valid(cur_owner)) {
2060 if (old_owner_override != THREAD_QOS_UNSPECIFIED) {
2061 thread_drop_ipc_override(cur_owner);
2062 }
2063 if (old_owner_override_is_sync) {
2064 thread_drop_sync_ipc_override(cur_owner);
2065 }
2066 thread_ends_owning_workloop(cur_owner);
2067 thread_deallocate(cur_owner);
2068 }
2069
2070 out:
2071 if (extra_thread_ref) {
2072 thread_deallocate(extra_thread_ref);
2073 }
2074 return error;
2075 }
2076
2077 static int
2078 filt_wldebounce(
2079 struct kqworkloop *kqwl,
2080 struct kevent_internal_s *kev,
2081 int default_result)
2082 {
2083 user_addr_t addr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
2084 uint64_t udata;
2085 int error;
2086
2087 /* we must have the workloop state mutex held */
2088 filt_wlheld(kqwl);
2089
2090 /* Do we have a debounce address to work with? */
2091 if (addr) {
2092 uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
2093 uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
2094
2095 error = copyin_word(addr, &udata, sizeof(udata));
2096 if (error) {
2097 return error;
2098 }
2099
2100 /* update state as copied in */
2101 kev->ext[EV_EXTIDX_WL_VALUE] = udata;
2102
2103 /* If the masked bits don't match, reject it as stale */
2104 if ((udata & mask) != (kdata & mask)) {
2105 return ESTALE;
2106 }
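/*
 * Example with hypothetical values: if userspace believed the word at
 * 'addr' was kdata == 0x5 under mask == 0xf, but the load above
 * returned udata == 0x8, the masked values differ and ESTALE is
 * returned; ext[EV_EXTIDX_WL_VALUE] already holds the freshly loaded
 * 0x8 so userspace can inspect what the kernel saw.
 */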
2107
2108 #if DEBUG || DEVELOPMENT
2109 if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && !(kev->flags & EV_DELETE)) {
2110 if ((udata & DISPATCH_QUEUE_ENQUEUED) == 0) {
2111 panic("kevent: workloop %#016llx is not enqueued "
2112 "(kev:%p dq_state:%#016llx)", kev->udata, kev, udata);
2113 }
2114 }
2115 #endif
2116 }
2117
2118 return default_result;
2119 }
2120
2121 /*
2122 * Remembers the last update that came in from userspace for debugging reasons.
2123 * - fflags is mirrored from the userspace kevent
2124 * - ext[i, i != VALUE] is mirrored from the userspace kevent
2125 * - ext[VALUE] is set to what the kernel loaded atomically
2126 * - data is set to the error if any
2127 */
2128 static inline void
2129 filt_wlremember_last_update(
2130 __assert_only struct kqworkloop *kqwl,
2131 struct knote *kn,
2132 struct kevent_internal_s *kev,
2133 int error)
2134 {
2135 filt_wlheld(kqwl);
2136 kn->kn_fflags = kev->fflags;
2137 kn->kn_data = error;
2138 memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
2139 }
2140
2141 /*
2142 * Return which operations on EVFILT_WORKLOOP need to be protected against
2143 * knoteusewait() causing priority inversions.
2144 */
2145 static bool
2146 filt_wlneeds_boost(struct kevent_internal_s *kev)
2147 {
2148 if (kev == NULL) {
2149 /*
2150 * this is an f_process() usecount, and it can cause a drop to wait
2151 */
2152 return true;
2153 }
2154 if (kev->fflags & NOTE_WL_THREAD_REQUEST) {
2155 /*
2156 * All operations on thread requests may starve drops or re-attach of
2157 * the same knote, so all of them need boosts. None of what we do while
2158 * holding the thread-request usecount blocks anyway.
2159 */
2160 return true;
2161 }
2162 if (kev->fflags & NOTE_WL_SYNC_WAIT) {
2163 /*
2164 * this may call filt_wlwait() and we don't want to hold any boost when
2165 * woken up, this would cause background threads contending on
2166 * dispatch_sync() to wake up at 64 and be preempted immediately when
2167 * this drops.
2168 */
2169 return false;
2170 }
2171
2172 /*
2173 * SYNC_WAIT knotes don't need to be rushed when deleted: there's never a
2174 * detach/reattach race with these. In addition, when the SYNC_WAIT knote
2175 * is dropped, the caller is no longer receiving the workloop overrides,
2176 * if any, and we'd rather schedule other threads than that one, as it
2177 * cannot possibly be stalling anything anymore.
2178 */
2179 return (kev->flags & EV_DELETE) == 0;
2180 }
2181
2182 static int
2183 filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
2184 {
2185 struct kqueue *kq = knote_get_kq(kn);
2186 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2187 int error = 0;
2188 kq_index_t qos_index = 0;
2189
2190 if ((kq->kq_state & KQ_WORKLOOP) == 0) {
2191 error = ENOTSUP;
2192 goto out;
2193 }
2194
2195 #if DEVELOPMENT || DEBUG
2196 if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) {
2197 struct kqrequest *kqr = &kqwl->kqwl_request;
2198
2199 kqwl_req_lock(kqwl);
2200 kev->fflags = 0;
2201 if (kqr->kqr_dsync_waiters) {
2202 kev->fflags |= NOTE_WL_SYNC_WAIT;
2203 }
2204 if (kqr->kqr_qos_index) {
2205 kev->fflags |= NOTE_WL_THREAD_REQUEST;
2206 }
2207 if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
2208 kev->ext[0] = ~0ull;
2209 } else {
2210 kev->ext[0] = thread_tid(kqwl->kqwl_owner);
2211 }
2212 kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread);
2213 kev->ext[2] = thread_owned_workloops_count(current_thread());
2214 kev->ext[3] = kn->kn_kevent.ext[3];
2215 kqwl_req_unlock(kqwl);
2216 error = EBUSY;
2217 goto out;
2218 }
2219 #endif
2220
2221 /* Some simple validation */
2222 int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
2223 switch (command) {
2224 case NOTE_WL_THREAD_REQUEST:
2225 if (kn->kn_id != kqwl->kqwl_dynamicid) {
2226 error = EINVAL;
2227 goto out;
2228 }
2229 qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
2230 if (qos_index < THREAD_QOS_MAINTENANCE ||
2231 qos_index > THREAD_QOS_USER_INTERACTIVE) {
2232 error = ERANGE;
2233 goto out;
2234 }
2235 break;
2236 case NOTE_WL_SYNC_WAIT:
2237 case NOTE_WL_SYNC_WAKE:
2238 if (kq->kq_state & KQ_NO_WQ_THREAD) {
2239 error = ENOTSUP;
2240 goto out;
2241 }
2242 if (kn->kn_id == kqwl->kqwl_dynamicid) {
2243 error = EINVAL;
2244 goto out;
2245 }
2246 if ((kn->kn_flags & EV_DISABLE) == 0) {
2247 error = EINVAL;
2248 goto out;
2249 }
2250 if (kn->kn_sfflags & NOTE_WL_END_OWNERSHIP) {
2251 error = EINVAL;
2252 goto out;
2253 }
2254 break;
2255 default:
2256 error = EINVAL;
2257 goto out;
2258 }
2259
2260 filt_wllock(kqwl);
2261 kn->kn_hook = NULL;
2262
2263 if (command == NOTE_WL_THREAD_REQUEST && kqwl->kqwl_request.kqr_qos_index) {
2264 /*
2265 * There already is a thread request, and well, you're only allowed
2266 * one per workloop, so fail the attach.
2267 *
2268 * Note: kqr_qos_index is always set with the wllock held, so we
2269 * don't need to take the kqr lock.
2270 */
2271 error = EALREADY;
2272 } else {
2273 /* Make sure user and kernel are in agreement on important state */
2274 error = filt_wldebounce(kqwl, kev, 0);
2275 }
2276
2277 error = filt_wlupdateowner(kqwl, kev, error, qos_index);
2278 filt_wlunlock(kqwl);
2279 out:
2280 if (error) {
2281 kn->kn_flags |= EV_ERROR;
2282 /* If userland wants ESTALE to be hidden, fail the attach anyway */
2283 if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
2284 error = 0;
2285 }
2286 kn->kn_data = error;
2287 return 0;
2288 }
2289
2290 /* Just attaching the thread request successfully will fire it */
2291 return command == NOTE_WL_THREAD_REQUEST;
2292 }
2293
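/*
 * filt_wlwait - block on a NOTE_WL_SYNC_WAIT knote until it is woken
 *
 * Prefers a direct handoff to the workloop owner or, failing that, the
 * bound servicer thread, and keeps the sync-waiter QoS counts up to
 * date around the block.  Returns 0 on wakeup, EINTR or ECANCELED if
 * the wait was aborted.
 */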
2294 __attribute__((noinline,not_tail_called))
2295 static int
2296 filt_wlwait(struct kqworkloop *kqwl,
2297 struct knote *kn,
2298 struct kevent_internal_s *kev)
2299 {
2300 filt_wlheld(kqwl);
2301 assert((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0);
2302
2303 /*
2304 * Hint to the wakeup side that this thread is waiting. Also used by
2305 * stackshot for waitinfo.
2306 */
2307 kn->kn_hook = current_thread();
2308
2309 thread_set_pending_block_hint(current_thread(), kThreadWaitWorkloopSyncWait);
2310
2311 wait_result_t wr = assert_wait(kn, THREAD_ABORTSAFE);
2312
2313 if (wr == THREAD_WAITING) {
2314 kq_index_t qos_index = qos_index_from_qos(kn, kev->qos, TRUE);
2315 struct kqrequest *kqr = &kqwl->kqwl_request;
2316
2317 thread_t thread_to_handoff = THREAD_NULL; /* holds +1 thread ref */
2318
2319 thread_t kqwl_owner = kqwl->kqwl_owner;
2320 if (filt_wlowner_is_valid(kqwl_owner)) {
2321 thread_reference(kqwl_owner);
2322 thread_to_handoff = kqwl_owner;
2323 }
2324
2325 kqwl_req_lock(kqwl);
2326
2327 if (qos_index) {
2328 assert(kqr->kqr_dsync_waiters < UINT16_MAX);
2329 kqr->kqr_dsync_waiters++;
2330 if (qos_index > kqr->kqr_dsync_waiters_qos) {
2331 kqworkloop_update_threads_qos(kqwl,
2332 KQWL_UTQ_SET_SYNC_WAITERS_QOS, qos_index);
2333 }
2334 }
2335
2336 if ((kqr->kqr_state & KQR_BOUND) && thread_to_handoff == THREAD_NULL) {
2337 assert(kqr->kqr_thread != THREAD_NULL);
2338 thread_t servicer = kqr->kqr_thread;
2339
2340 thread_reference(servicer);
2341 thread_to_handoff = servicer;
2342 }
2343
2344 kqwl_req_unlock(kqwl);
2345
2346 filt_wlunlock(kqwl);
2347
2348 /* TODO: use continuation based blocking <rdar://problem/31299584> */
2349
2350 /* consume a refcount on thread_to_handoff, then thread_block() */
2351 wr = thread_handoff(thread_to_handoff);
2352 thread_to_handoff = THREAD_NULL;
2353
2354 filt_wllock(kqwl);
2355
2356 /* clear waiting state (only one waiting thread - so no race) */
2357 assert(kn->kn_hook == current_thread());
2358
2359 if (qos_index) {
2360 kqwl_req_lock(kqwl);
2361 assert(kqr->kqr_dsync_waiters > 0);
2362 if (--kqr->kqr_dsync_waiters == 0) {
2363 assert(kqr->kqr_dsync_waiters_qos);
2364 kqworkloop_update_threads_qos(kqwl,
2365 KQWL_UTQ_SET_SYNC_WAITERS_QOS, 0);
2366 }
2367 kqwl_req_unlock(kqwl);
2368 }
2369 }
2370
2371 kn->kn_hook = NULL;
2372
2373 switch (wr) {
2374 case THREAD_AWAKENED:
2375 return 0;
2376 case THREAD_INTERRUPTED:
2377 return EINTR;
2378 case THREAD_RESTART:
2379 return ECANCELED;
2380 default:
2381 panic("filt_wlattach: unexpected wait result %d", wr);
2382 return EINVAL;
2383 }
2384 }
2385
2386 /* called in stackshot context to report the thread responsible for blocking this thread */
2387 void
2388 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
2389 event64_t event,
2390 thread_waitinfo_t *waitinfo)
2391 {
2392 struct knote *kn = (struct knote*) event;
2393 assert(kdp_is_in_zone(kn, "knote zone"));
2394
2395 assert(kn->kn_hook == thread);
2396
2397 struct kqueue *kq = knote_get_kq(kn);
2398 assert(kdp_is_in_zone(kq, "kqueue workloop zone"));
2399 assert(kq->kq_state & KQ_WORKLOOP);
2400
2401 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2402 struct kqrequest *kqr = &kqwl->kqwl_request;
2403
2404 thread_t kqwl_owner = kqwl->kqwl_owner;
2405 thread_t servicer = kqr->kqr_thread;
2406
2407 if (kqwl_owner == WL_OWNER_SUSPENDED) {
2408 waitinfo->owner = STACKSHOT_WAITOWNER_SUSPENDED;
2409 } else if (kqwl_owner != THREAD_NULL) {
2410 assert(kdp_is_in_zone(kqwl_owner, "threads"));
2411
2412 waitinfo->owner = thread_tid(kqwl->kqwl_owner);
2413 } else if (servicer != THREAD_NULL) {
2414 assert(kdp_is_in_zone(servicer, "threads"));
2415
2416 waitinfo->owner = thread_tid(servicer);
2417 } else if (kqr->kqr_state & KQR_THREQUESTED) {
2418 waitinfo->owner = STACKSHOT_WAITOWNER_THREQUESTED;
2419 } else {
2420 waitinfo->owner = 0;
2421 }
2422
2423 waitinfo->context = kqwl->kqwl_dynamicid;
2424
2425 return;
2426 }
2427
2428 /*
2429 * Takes kqueue locked, returns locked, may drop in the middle and/or block for a while
2430 */
2431 static int
2432 filt_wlpost_attach(struct knote *kn, struct kevent_internal_s *kev)
2433 {
2434 struct kqueue *kq = knote_get_kq(kn);
2435 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2436 int error = 0;
2437
2438 if (kev->fflags & NOTE_WL_SYNC_WAIT) {
2439 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
2440 filt_wllock(kqwl);
2441 /* if the wake has already preposted, don't wait */
2442 if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
2443 error = filt_wlwait(kqwl, kn, kev);
2444 filt_wlunlock(kqwl);
2445 knoteuse2kqlock(kq, kn, KNUSE_NONE);
2446 }
2447 }
2448 return error;
2449 }
2450
2451 static void
2452 filt_wldetach(__assert_only struct knote *kn)
2453 {
2454 assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
2455
2456 /*
2457 * Thread requests have nothing to detach.
2458 * Sync waiters should have been aborted out
2459 * and dropped their refs before we could drop/
2460 * detach their knotes.
2461 */
2462 assert(kn->kn_hook == NULL);
2463 }
2464
2465 static int
2466 filt_wlevent(
2467 __unused struct knote *kn,
2468 __unused long hint)
2469 {
2470 panic("filt_wlevent");
2471 return 0;
2472 }
2473
2474 static int
2475 filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev)
2476 {
2477 int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
2478 int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
2479 int error = 0;
2480
2481 switch (new_commands) {
2482 case NOTE_WL_THREAD_REQUEST:
2483 /* thread requests can only update themselves */
2484 if (sav_commands != new_commands)
2485 error = EINVAL;
2486 break;
2487
2488 case NOTE_WL_SYNC_WAIT:
2489 if (kev->fflags & NOTE_WL_END_OWNERSHIP)
2490 error = EINVAL;
2491 /* FALLTHROUGH */
2492 case NOTE_WL_SYNC_WAKE:
2493 /* waits and wakes can update themselves or their counterparts */
2494 if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)))
2495 error = EINVAL;
2496 if (kev->fflags & NOTE_WL_UPDATE_QOS)
2497 error = EINVAL;
2498 if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE)
2499 error = EINVAL;
2500 if (kev->flags & EV_DELETE) {
2501 /*
2502 * Really this is not supported: there is absolutely no reason
2503 * whatsoever to want to fail the drop of a NOTE_WL_SYNC_WAIT knote.
2504 */
2505 if (kev->ext[EV_EXTIDX_WL_ADDR] && kev->ext[EV_EXTIDX_WL_MASK]) {
2506 error = EINVAL;
2507 }
2508 }
2509 break;
2510
2511 default:
2512 error = EINVAL;
2513 }
2514 if ((kev->flags & EV_DELETE) && (kev->fflags & NOTE_WL_DISCOVER_OWNER)) {
2515 error = EINVAL;
2516 }
2517 return error;
2518 }
2519
2520 static int
2521 filt_wltouch(
2522 struct knote *kn,
2523 struct kevent_internal_s *kev)
2524 {
2525 struct kqueue *kq = knote_get_kq(kn);
2526 int error = 0;
2527 struct kqworkloop *kqwl;
2528
2529 assert(kq->kq_state & KQ_WORKLOOP);
2530 kqwl = (struct kqworkloop *)kq;
2531
2532 error = filt_wlvalidate_kev_flags(kn, kev);
2533 if (error) {
2534 goto out;
2535 }
2536
2537 filt_wllock(kqwl);
2538
2539 /* Make sure user and kernel are in agreement on important state */
2540 error = filt_wldebounce(kqwl, kev, 0);
2541 if (error) {
2542 error = filt_wlupdateowner(kqwl, kev, error, 0);
2543 goto out_unlock;
2544 }
2545
2546 int new_command = kev->fflags & NOTE_WL_COMMANDS_MASK;
2547 switch (new_command) {
2548 case NOTE_WL_THREAD_REQUEST:
2549 assert(kqwl->kqwl_request.kqr_qos_index != THREAD_QOS_UNSPECIFIED);
2550 break;
2551
2552 case NOTE_WL_SYNC_WAIT:
2553 /*
2554 * we need to allow waiting several times on the same knote because
2555 * of EINTR. If it's already woken though, it won't block.
2556 */
2557 break;
2558
2559 case NOTE_WL_SYNC_WAKE:
2560 if (kn->kn_sfflags & NOTE_WL_SYNC_WAKE) {
2561 /* disallow waking the same knote twice */
2562 error = EALREADY;
2563 goto out_unlock;
2564 }
2565 if (kn->kn_hook) {
2566 thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
2567 }
2568 break;
2569
2570 default:
2571 error = EINVAL;
2572 goto out_unlock;
2573 }
2574
2575 /*
2576 * Save off any additional fflags/data we just accepted,
2577 * but only keep the last round of "update" bits we acted on, which helps
2578 * debugging a lot.
2579 */
2580 kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
2581 kn->kn_sfflags |= kev->fflags;
2582 kn->kn_sdata = kev->data;
2583
2584 kq_index_t qos_index = THREAD_QOS_UNSPECIFIED;
2585
2586 if (kev->fflags & NOTE_WL_UPDATE_QOS) {
2587 qos_t qos = pthread_priority_canonicalize(kev->qos, FALSE);
2588
2589 if (kn->kn_qos != qos) {
2590 qos_index = qos_index_from_qos(kn, qos, FALSE);
2591 if (qos_index == THREAD_QOS_UNSPECIFIED) {
2592 error = ERANGE;
2593 goto out_unlock;
2594 }
2595 kqlock(kq);
2596 if (kn->kn_status & KN_QUEUED) {
2597 knote_dequeue(kn);
2598 knote_set_qos_index(kn, qos_index);
2599 knote_enqueue(kn);
2600 knote_wakeup(kn);
2601 } else {
2602 knote_set_qos_index(kn, qos_index);
2603 }
2604 kn->kn_qos = qos;
2605 kqunlock(kq);
2606 }
2607 }
2608
2609 error = filt_wlupdateowner(kqwl, kev, 0, qos_index);
2610 if (error) {
2611 goto out_unlock;
2612 }
2613
2614 if (new_command == NOTE_WL_SYNC_WAIT) {
2615 /* if the wake has already preposted, don't wait */
2616 if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
2617 error = filt_wlwait(kqwl, kn, kev);
2618 }
2619
2620 out_unlock:
2621 filt_wlremember_last_update(kqwl, kn, kev, error);
2622 filt_wlunlock(kqwl);
2623 out:
2624 if (error) {
2625 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2626 /* If userland wants ESTALE to be hidden, do not activate */
2627 return 0;
2628 }
2629 kev->flags |= EV_ERROR;
2630 kev->data = error;
2631 return 0;
2632 }
2633 /* Just touching the thread request successfully will fire it */
2634 return new_command == NOTE_WL_THREAD_REQUEST;
2635 }
2636
2637 static int
2638 filt_wldrop_and_unlock(
2639 struct knote *kn,
2640 struct kevent_internal_s *kev)
2641 {
2642 struct kqueue *kq = knote_get_kq(kn);
2643 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2644 int error = 0, knoteuse_flags = KNUSE_NONE;
2645
2646 kqlock_held(kq);
2647
2648 assert(kev->flags & EV_DELETE);
2649 assert(kq->kq_state & KQ_WORKLOOP);
2650
2651 error = filt_wlvalidate_kev_flags(kn, kev);
2652 if (error) {
2653 goto out;
2654 }
2655
2656 if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
2657 knoteuse_flags |= KNUSE_BOOST;
2658 }
2659
2660 /* take a usecount to allow taking the filt_wllock */
2661 if (!kqlock2knoteuse(kq, kn, knoteuse_flags)) {
2662 /* knote is being dropped already */
2663 error = EINPROGRESS;
2664 goto out;
2665 }
2666
2667 filt_wllock(kqwl);
2668
2669 /*
2670 * Make sure user and kernel are in agreement on important state
2671 *
2672 * Userland will modify bits to cause this to fail for the touch / drop
2673 * race case (when a drop for a quiescing thread request comes in late after
2674 * the workloop has been woken up again).
2675 */
2676 error = filt_wldebounce(kqwl, kev, 0);
2677
2678 if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
2679 /* knote is no longer alive */
2680 error = EINPROGRESS;
2681 goto out_unlock;
2682 }
2683
2684 if (!error && (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) && kn->kn_inuse) {
2685 /*
2686 * There is a concurrent drop or touch happening, we can't resolve this,
2687 * userland has to redrive.
2688 *
2689 * The race we're worried about here is the following:
2690 *
2691 * f_touch | f_drop_and_unlock
2692 * ------------------------+--------------------------------------------
2693 * | kqlock()
2694 * | kqlock2knoteuse()
2695 * | filt_wllock()
2696 * | debounces successfully
2697 * kqlock() |
2698 * kqlock2knoteuse |
2699 * filt_wllock() <BLOCKS> |
2700 * | knoteuse2kqlock()
2701 * | filt_wlunlock()
2702 * | kqlock2knotedrop() <BLOCKS, WAKES f_touch>
2703 * debounces successfully |
2704 * filt_wlunlock() |
2705 * caller WAKES f_drop |
2706 * | performs drop, but f_touch should have won
2707 *
2708 * So if the usecount is not 0 here, we need to wait for it to drop and
2709 * redrive the whole logic (including looking up the knote again).
2710 */
2711 filt_wlunlock(kqwl);
2712 knoteusewait(kq, kn);
2713 return ERESTART;
2714 }
2715
2716 /*
2717 * If error is 0 this will set kqr_qos_index to THREAD_QOS_UNSPECIFIED
2718 *
2719 * If error is 0 or ESTALE this may drop ownership and cause a thread
2720 * request redrive; however, the kqlock is held, which prevents f_process()
2721 * from running until we have done the drop for real.
2722 */
2723 error = filt_wlupdateowner(kqwl, kev, error, 0);
2724 if (error) {
2725 goto out_unlock;
2726 }
2727
2728 if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
2729 NOTE_WL_SYNC_WAIT) {
2730 /*
2731 * When deleting a SYNC_WAIT knote that hasn't been woken up
2732 * explicitly, issue a wake up.
2733 */
2734 kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
2735 if (kn->kn_hook) {
2736 thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
2737 }
2738 }
2739
2740 out_unlock:
2741 filt_wlremember_last_update(kqwl, kn, kev, error);
2742 filt_wlunlock(kqwl);
2743
2744 out:
2745 if (error == 0) {
2746 /* If nothing failed, do the regular knote drop. */
2747 if (kqlock2knotedrop(kq, kn)) {
2748 knote_drop(kn, current_proc());
2749 } else {
2750 error = EINPROGRESS;
2751 }
2752 } else {
2753 kqunlock(kq);
2754 }
2755 if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
2756 error = 0;
2757 }
2758 if (error == EINPROGRESS) {
2759 /*
2760 * filt_wlprocess() makes sure that no event can be delivered for
2761 * NOTE_WL_THREAD_REQUEST knotes once a drop is happening, and
2762 * NOTE_WL_SYNC_* knotes are never fired.
2763 *
2764 * It means that EINPROGRESS is about a state that userland cannot
2765 * observe for this filter (an event being delivered concurrently from
2766 * a drop), so silence the error.
2767 */
2768 error = 0;
2769 }
2770 return error;
2771 }
2772
2773 static int
2774 filt_wlprocess(
2775 struct knote *kn,
2776 __unused struct filt_process_s *data,
2777 struct kevent_internal_s *kev)
2778 {
2779 struct kqueue *kq = knote_get_kq(kn);
2780 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
2781 struct kqrequest *kqr = &kqwl->kqwl_request;
2782 int rc = 0;
2783
2784 assert(kq->kq_state & KQ_WORKLOOP);
2785
2786 /* only thread requests should get here */
2787 assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
2788 if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
2789 filt_wllock(kqwl);
2790 assert(kqr->kqr_qos_index != THREAD_QOS_UNSPECIFIED);
2791 if (kqwl->kqwl_owner) {
2792 /*
2793 * <rdar://problem/33584321> userspace can sometimes, when events are
2794 * delivered without triggering a drain session, cause the thread
2795 * request knote to be processed.
2796 *
2797 * When that happens, the automatic deactivation due to process
2798 * would swallow the event, so we have to activate the knote again.
2799 */
2800 kqlock(kq);
2801 knote_activate(kn);
2802 kqunlock(kq);
2803 } else if (kqr->kqr_qos_index) {
2804 #if DEBUG || DEVELOPMENT
2805 user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
2806 task_t t = current_task();
2807 uint64_t val;
2808 if (addr && task_is_active(t) && !task_is_halting(t) &&
2809 copyin_word(addr, &val, sizeof(val)) == 0 &&
2810 val && (val & DISPATCH_QUEUE_ENQUEUED) == 0) {
2811 panic("kevent: workloop %#016llx is not enqueued "
2812 "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
2813 kn->kn_udata, kn, val,
2814 kn->kn_ext[EV_EXTIDX_WL_VALUE]);
2815 }
2816 #endif
2817 *kev = kn->kn_kevent;
2818 kev->fflags = kn->kn_sfflags;
2819 kev->data = kn->kn_sdata;
2820 kev->qos = kn->kn_qos;
2821 rc = 1;
2822 }
2823 filt_wlunlock(kqwl);
2824 }
2825 return rc;
2826 }
2827
2828 #pragma mark kevent / knotes
2829
2830 /*
2831 * JMM - placeholder for not-yet-implemented filters
2832 */
2833 static int
2834 filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
2835 {
2836 kn->kn_flags |= EV_ERROR;
2837 kn->kn_data = ENOTSUP;
2838 return 0;
2839 }
2840
2841 struct kqueue *
2842 kqueue_alloc(struct proc *p, unsigned int flags)
2843 {
2844 struct filedesc *fdp = p->p_fd;
2845 struct kqueue *kq = NULL;
2846 int policy;
2847 void *hook = NULL;
2848 uint64_t kq_addr_offset;
2849
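/*
 * Three flavors of kqueue are allocated here: a kqworkq for the workq
 * kqueue (KEVENT_FLAG_WORKQ), a dynamically retained kqworkloop
 * (KEVENT_FLAG_WORKLOOP), or a plain file-backed kqfile.  Each gets its
 * own zone allocation and lock setup below, followed by the shared
 * waitq-set and kq_lock initialization.
 */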
2850 if (flags & KEVENT_FLAG_WORKQ) {
2851 struct kqworkq *kqwq;
2852 int i;
2853
2854 kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
2855 if (kqwq == NULL)
2856 return NULL;
2857
2858 kq = &kqwq->kqwq_kqueue;
2859 bzero(kqwq, sizeof (struct kqworkq));
2860
2861 kqwq->kqwq_state = KQ_WORKQ;
2862
2863 for (i = 0; i < KQWQ_NBUCKETS; i++) {
2864 TAILQ_INIT(&kq->kq_queue[i]);
2865 }
2866 for (i = 0; i < KQWQ_NQOS; i++) {
2867 kqwq->kqwq_request[i].kqr_qos_index = i;
2868 }
2869
2870 lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr);
2871 policy = SYNC_POLICY_FIFO;
2872 hook = (void *)kqwq;
2873
2874 } else if (flags & KEVENT_FLAG_WORKLOOP) {
2875 struct kqworkloop *kqwl;
2876 int i;
2877
2878 kqwl = (struct kqworkloop *)zalloc(kqworkloop_zone);
2879 if (kqwl == NULL)
2880 return NULL;
2881
2882 bzero(kqwl, sizeof (struct kqworkloop));
2883
2884 kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC;
2885 kqwl->kqwl_retains = 1; /* donate a retain to creator */
2886
2887 kq = &kqwl->kqwl_kqueue;
2888 for (i = 0; i < KQWL_NBUCKETS; i++) {
2889 TAILQ_INIT(&kq->kq_queue[i]);
2890 }
2891 TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed);
2892
2893 lck_spin_init(&kqwl->kqwl_reqlock, kq_lck_grp, kq_lck_attr);
2894 lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
2895
2896 policy = SYNC_POLICY_FIFO;
2897 if (flags & KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD) {
2898 policy |= SYNC_POLICY_PREPOST;
2899 kq->kq_state |= KQ_NO_WQ_THREAD;
2900 } else {
2901 hook = (void *)kqwl;
2902 }
2903
2904 } else {
2905 struct kqfile *kqf;
2906
2907 kqf = (struct kqfile *)zalloc(kqfile_zone);
2908 if (kqf == NULL)
2909 return NULL;
2910
2911 kq = &kqf->kqf_kqueue;
2912 bzero(kqf, sizeof (struct kqfile));
2913 TAILQ_INIT(&kq->kq_queue[0]);
2914 TAILQ_INIT(&kqf->kqf_suppressed);
2915
2916 policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
2917 }
2918
2919 waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
2920 lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
2921 kq->kq_p = p;
2922
2923 if (fdp->fd_knlistsize < 0) {
2924 proc_fdlock(p);
2925 if (fdp->fd_knlistsize < 0)
2926 fdp->fd_knlistsize = 0; /* this process has had a kq */
2927 proc_fdunlock(p);
2928 }
2929
2930 kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
2931 /* Assert that the address can be pointer compacted for use with knote */
2932 assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE));
2933 return (kq);
2934 }
2935
2936 /*
2937 * knotes_dealloc - detach all knotes for the process and drop them
2938 *
2939 * Called with proc_fdlock held.
2940 * Returns with it locked.
2941 * May drop it temporarily.
2942 * Process is in such a state that it will not try to allocate
2943 * any more knotes during this teardown (stopped for exit or exec).
2944 */
2945 void
2946 knotes_dealloc(proc_t p)
2947 {
2948 struct filedesc *fdp = p->p_fd;
2949 struct kqueue *kq;
2950 struct knote *kn;
2951 struct klist *kn_hash = NULL;
2952 int i;
2953
2954 /* Close all the fd-indexed knotes up front */
2955 if (fdp->fd_knlistsize > 0) {
2956 for (i = 0; i < fdp->fd_knlistsize; i++) {
2957 while ((kn = SLIST_FIRST(&fdp->fd_knlist[i])) != NULL) {
2958 kq = knote_get_kq(kn);
2959 kqlock(kq);
2960 proc_fdunlock(p);
2961 /* drop it ourselves or wait */
2962 if (kqlock2knotedrop(kq, kn)) {
2963 knote_drop(kn, p);
2964 }
2965 proc_fdlock(p);
2966 }
2967 }
2968 /* free the table */
2969 FREE(fdp->fd_knlist, M_KQUEUE);
2970 fdp->fd_knlist = NULL;
2971 }
2972 fdp->fd_knlistsize = -1;
2973
2974 knhash_lock(p);
2975 proc_fdunlock(p);
2976
2977 /* Clean out all the hashed knotes as well */
2978 if (fdp->fd_knhashmask != 0) {
2979 for (i = 0; i <= (int)fdp->fd_knhashmask; i++) {
2980 while ((kn = SLIST_FIRST(&fdp->fd_knhash[i])) != NULL) {
2981 kq = knote_get_kq(kn);
2982 kqlock(kq);
2983 knhash_unlock(p);
2984 /* drop it ourselves or wait */
2985 if (kqlock2knotedrop(kq, kn)) {
2986 knote_drop(kn, p);
2987 }
2988 knhash_lock(p);
2989 }
2990 }
2991 kn_hash = fdp->fd_knhash;
2992 fdp->fd_knhashmask = 0;
2993 fdp->fd_knhash = NULL;
2994 }
2995
2996 knhash_unlock(p);
2997
2998 /* free the kn_hash table */
2999 if (kn_hash)
3000 FREE(kn_hash, M_KQUEUE);
3001
3002 proc_fdlock(p);
3003 }
3004
3005
3006 /*
3007 * kqueue_dealloc - detach all knotes from a kqueue and free it
3008 *
3009 * We walk each list looking for knotes referencing
3010 * this kqueue. If we find one, we try to drop it. But
3011 * if we fail to get a drop reference, that will wait
3012 * until it is dropped. So, we can just restart again
3013 * safe in the assumption that the list will eventually
3014 * not contain any more references to this kqueue (either
3015 * we dropped them all, or someone else did).
3016 *
3017 * Assumes no new events are being added to the kqueue.
3018 * Nothing locked on entry or exit.
3019 *
3020 * Workloop kqueues can't get here unless all the knotes
3021 * are already gone and all requested threads have come
3022 * and gone (cancelled or arrived).
3023 */
3024 void
3025 kqueue_dealloc(struct kqueue *kq)
3026 {
3027 struct proc *p;
3028 struct filedesc *fdp;
3029 struct knote *kn;
3030 int i;
3031
3032 if (kq == NULL)
3033 return;
3034
3035 p = kq->kq_p;
3036 fdp = p->p_fd;
3037
3038 proc_fdlock(p);
3039 for (i = 0; i < fdp->fd_knlistsize; i++) {
3040 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3041 while (kn != NULL) {
3042 if (kq == knote_get_kq(kn)) {
3043 assert((kq->kq_state & KQ_WORKLOOP) == 0);
3044 kqlock(kq);
3045 proc_fdunlock(p);
3046 /* drop it ourselves or wait */
3047 if (kqlock2knotedrop(kq, kn)) {
3048 knote_drop(kn, p);
3049 }
3050 proc_fdlock(p);
3051 /* start over at beginning of list */
3052 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3053 continue;
3054 }
3055 kn = SLIST_NEXT(kn, kn_link);
3056 }
3057 }
3058 knhash_lock(p);
3059 proc_fdunlock(p);
3060
3061 if (fdp->fd_knhashmask != 0) {
3062 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
3063 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3064 while (kn != NULL) {
3065 if (kq == knote_get_kq(kn)) {
3066 assert((kq->kq_state & KQ_WORKLOOP) == 0);
3067 kqlock(kq);
3068 knhash_unlock(p);
3069 /* drop it ourselves or wait */
3070 if (kqlock2knotedrop(kq, kn)) {
3071 knote_drop(kn, p);
3072 }
3073 knhash_lock(p);
3074 /* start over at beginning of list */
3075 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3076 continue;
3077 }
3078 kn = SLIST_NEXT(kn, kn_link);
3079 }
3080 }
3081 }
3082 knhash_unlock(p);
3083
3084 if (kq->kq_state & KQ_WORKLOOP) {
3085 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3086 struct kqrequest *kqr = &kqwl->kqwl_request;
3087 thread_t cur_owner = kqwl->kqwl_owner;
3088
3089 assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
3090 if (filt_wlowner_is_valid(cur_owner)) {
3091 /*
3092 * If the kqueue had an owner that prevented the thread request to
3093 * go through, then no unbind happened, and we may have lingering
3094 * overrides to drop.
3095 */
3096 if (kqr->kqr_dsync_owner_qos != THREAD_QOS_UNSPECIFIED) {
3097 thread_drop_ipc_override(cur_owner);
3098 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
3099 }
3100
3101 if (kqr->kqr_owner_override_is_sync) {
3102 thread_drop_sync_ipc_override(cur_owner);
3103 kqr->kqr_owner_override_is_sync = 0;
3104 }
3105 thread_ends_owning_workloop(cur_owner);
3106 thread_deallocate(cur_owner);
3107 kqwl->kqwl_owner = THREAD_NULL;
3108 }
3109 }
3110
3111 /*
3112 * waitq_set_deinit() removes the KQ's waitq set from
3113 * any select sets to which it may belong.
3114 */
3115 waitq_set_deinit(&kq->kq_wqs);
3116 lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
3117
3118 if (kq->kq_state & KQ_WORKQ) {
3119 struct kqworkq *kqwq = (struct kqworkq *)kq;
3120
3121 lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp);
3122 zfree(kqworkq_zone, kqwq);
3123 } else if (kq->kq_state & KQ_WORKLOOP) {
3124 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3125
3126 assert(kqwl->kqwl_retains == 0);
3127 lck_spin_destroy(&kqwl->kqwl_reqlock, kq_lck_grp);
3128 lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
3129 zfree(kqworkloop_zone, kqwl);
3130 } else {
3131 struct kqfile *kqf = (struct kqfile *)kq;
3132
3133 zfree(kqfile_zone, kqf);
3134 }
3135 }
3136
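/*
 * Only dynamically allocated (workloop) kqueues are reference counted;
 * kqueue_retain() and kqueue_release() are no-ops for file-backed and
 * workq kqueues, which do not carry KQ_DYNAMIC.
 */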
3137 static inline void
3138 kqueue_retain(struct kqueue *kq)
3139 {
3140 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3141 uint32_t previous;
3142
3143 if ((kq->kq_state & KQ_DYNAMIC) == 0)
3144 return;
3145
3146 previous = OSIncrementAtomic(&kqwl->kqwl_retains);
3147 if (previous == KQ_WORKLOOP_RETAINS_MAX)
3148 panic("kq(%p) retain overflow", kq);
3149
3150 if (previous == 0)
3151 panic("kq(%p) resurrection", kq);
3152 }
3153
3154 #define KQUEUE_CANT_BE_LAST_REF 0
3155 #define KQUEUE_MIGHT_BE_LAST_REF 1
3156
3157 static inline int
3158 kqueue_release(struct kqueue *kq, __assert_only int possibly_last)
3159 {
3160 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3161
3162 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3163 return 0;
3164 }
3165
3166 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3167 uint32_t refs = OSDecrementAtomic(&kqwl->kqwl_retains);
3168 if (__improbable(refs == 0)) {
3169 panic("kq(%p) over-release", kq);
3170 }
3171 if (refs == 1) {
3172 assert(possibly_last);
3173 }
3174 return refs == 1;
3175 }
3176
3177 int
3178 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
3179 {
3180 struct kqueue *kq;
3181 struct fileproc *fp;
3182 int fd, error;
3183
3184 error = falloc_withalloc(p,
3185 &fp, &fd, vfs_context_current(), fp_zalloc, cra);
3186 if (error) {
3187 return (error);
3188 }
3189
3190 kq = kqueue_alloc(p, 0);
3191 if (kq == NULL) {
3192 fp_free(p, fd, fp);
3193 return (ENOMEM);
3194 }
3195
3196 fp->f_flag = FREAD | FWRITE;
3197 fp->f_ops = &kqueueops;
3198 fp->f_data = kq;
3199
3200 proc_fdlock(p);
3201 *fdflags(p, fd) |= UF_EXCLOSE;
3202 procfdtbl_releasefd(p, fd, NULL);
3203 fp_drop(p, fd, fp, 1);
3204 proc_fdunlock(p);
3205
3206 *retval = fd;
3207 return (error);
3208 }
3209
3210 int
3211 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
3212 {
3213 return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
3214 }
3215
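/*
 * kevent_copyin - copy in and decode one changelist entry
 *
 * Converts whichever user-visible layout was requested (legacy 32-bit,
 * legacy 64-bit, or kevent_qos) into the internal representation and
 * advances *addrp past the entry on success.
 */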
3216 static int
3217 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
3218 unsigned int flags)
3219 {
3220 int advance;
3221 int error;
3222
3223 if (flags & KEVENT_FLAG_LEGACY32) {
3224 bzero(kevp, sizeof (*kevp));
3225
3226 if (IS_64BIT_PROCESS(p)) {
3227 struct user64_kevent kev64;
3228
3229 advance = sizeof (kev64);
3230 error = copyin(*addrp, (caddr_t)&kev64, advance);
3231 if (error)
3232 return (error);
3233 kevp->ident = kev64.ident;
3234 kevp->filter = kev64.filter;
3235 kevp->flags = kev64.flags;
3236 kevp->udata = kev64.udata;
3237 kevp->fflags = kev64.fflags;
3238 kevp->data = kev64.data;
3239 } else {
3240 struct user32_kevent kev32;
3241
3242 advance = sizeof (kev32);
3243 error = copyin(*addrp, (caddr_t)&kev32, advance);
3244 if (error)
3245 return (error);
3246 kevp->ident = (uintptr_t)kev32.ident;
3247 kevp->filter = kev32.filter;
3248 kevp->flags = kev32.flags;
3249 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
3250 kevp->fflags = kev32.fflags;
3251 kevp->data = (intptr_t)kev32.data;
3252 }
3253 } else if (flags & KEVENT_FLAG_LEGACY64) {
3254 struct kevent64_s kev64;
3255
3256 bzero(kevp, sizeof (*kevp));
3257
3258 advance = sizeof (struct kevent64_s);
3259 error = copyin(*addrp, (caddr_t)&kev64, advance);
3260 if (error)
3261 return(error);
3262 kevp->ident = kev64.ident;
3263 kevp->filter = kev64.filter;
3264 kevp->flags = kev64.flags;
3265 kevp->udata = kev64.udata;
3266 kevp->fflags = kev64.fflags;
3267 kevp->data = kev64.data;
3268 kevp->ext[0] = kev64.ext[0];
3269 kevp->ext[1] = kev64.ext[1];
3270
3271 } else {
3272 struct kevent_qos_s kevqos;
3273
3274 bzero(kevp, sizeof (*kevp));
3275
3276 advance = sizeof (struct kevent_qos_s);
3277 error = copyin(*addrp, (caddr_t)&kevqos, advance);
3278 if (error)
3279 return error;
3280 kevp->ident = kevqos.ident;
3281 kevp->filter = kevqos.filter;
3282 kevp->flags = kevqos.flags;
3283 kevp->qos = kevqos.qos;
3284 // kevp->xflags = kevqos.xflags;
3285 kevp->udata = kevqos.udata;
3286 kevp->fflags = kevqos.fflags;
3287 kevp->data = kevqos.data;
3288 kevp->ext[0] = kevqos.ext[0];
3289 kevp->ext[1] = kevqos.ext[1];
3290 kevp->ext[2] = kevqos.ext[2];
3291 kevp->ext[3] = kevqos.ext[3];
3292 }
3293 if (!error)
3294 *addrp += advance;
3295 return (error);
3296 }
3297
3298 static int
3299 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
3300 unsigned int flags)
3301 {
3302 user_addr_t addr = *addrp;
3303 int advance;
3304 int error;
3305
3306 /*
3307 * fully initialize the different output event structure
3308 * types from the internal kevent (and some universal
3309 * defaults for fields not represented in the internal
3310 * form).
3311 */
3312 if (flags & KEVENT_FLAG_LEGACY32) {
3313 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
3314
3315 if (IS_64BIT_PROCESS(p)) {
3316 struct user64_kevent kev64;
3317
3318 advance = sizeof (kev64);
3319 bzero(&kev64, advance);
3320
3321 /*
3322 * deal with the special case of a user-supplied
3323 * value of (uintptr_t)-1.
3324 */
3325 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
3326 (uint64_t)-1LL : (uint64_t)kevp->ident;
3327
3328 kev64.filter = kevp->filter;
3329 kev64.flags = kevp->flags;
3330 kev64.fflags = kevp->fflags;
3331 kev64.data = (int64_t) kevp->data;
3332 kev64.udata = kevp->udata;
3333 error = copyout((caddr_t)&kev64, addr, advance);
3334 } else {
3335 struct user32_kevent kev32;
3336
3337 advance = sizeof (kev32);
3338 bzero(&kev32, advance);
3339 kev32.ident = (uint32_t)kevp->ident;
3340 kev32.filter = kevp->filter;
3341 kev32.flags = kevp->flags;
3342 kev32.fflags = kevp->fflags;
3343 kev32.data = (int32_t)kevp->data;
3344 kev32.udata = kevp->udata;
3345 error = copyout((caddr_t)&kev32, addr, advance);
3346 }
3347 } else if (flags & KEVENT_FLAG_LEGACY64) {
3348 struct kevent64_s kev64;
3349
3350 advance = sizeof (struct kevent64_s);
3351 if (flags & KEVENT_FLAG_STACK_EVENTS) {
3352 addr -= advance;
3353 }
3354 bzero(&kev64, advance);
3355 kev64.ident = kevp->ident;
3356 kev64.filter = kevp->filter;
3357 kev64.flags = kevp->flags;
3358 kev64.fflags = kevp->fflags;
3359 kev64.data = (int64_t) kevp->data;
3360 kev64.udata = kevp->udata;
3361 kev64.ext[0] = kevp->ext[0];
3362 kev64.ext[1] = kevp->ext[1];
3363 error = copyout((caddr_t)&kev64, addr, advance);
3364 } else {
3365 struct kevent_qos_s kevqos;
3366
3367 advance = sizeof (struct kevent_qos_s);
3368 if (flags & KEVENT_FLAG_STACK_EVENTS) {
3369 addr -= advance;
3370 }
3371 bzero(&kevqos, advance);
3372 kevqos.ident = kevp->ident;
3373 kevqos.filter = kevp->filter;
3374 kevqos.flags = kevp->flags;
3375 kevqos.qos = kevp->qos;
3376 kevqos.udata = kevp->udata;
3377 kevqos.fflags = kevp->fflags;
3378 kevqos.xflags = 0;
3379 kevqos.data = (int64_t) kevp->data;
3380 kevqos.ext[0] = kevp->ext[0];
3381 kevqos.ext[1] = kevp->ext[1];
3382 kevqos.ext[2] = kevp->ext[2];
3383 kevqos.ext[3] = kevp->ext[3];
3384 error = copyout((caddr_t)&kevqos, addr, advance);
3385 }
3386 if (!error) {
3387 if (flags & KEVENT_FLAG_STACK_EVENTS)
3388 *addrp = addr;
3389 else
3390 *addrp = addr + advance;
3391 }
3392 return (error);
3393 }
3394
3395 static int
3396 kevent_get_data_size(struct proc *p,
3397 uint64_t data_available,
3398 unsigned int flags,
3399 user_size_t *residp)
3400 {
3401 user_size_t resid;
3402 int error = 0;
3403
3404 if (data_available != USER_ADDR_NULL) {
3405 if (flags & KEVENT_FLAG_KERNEL) {
3406 resid = *(user_size_t *)(uintptr_t)data_available;
3407 } else if (IS_64BIT_PROCESS(p)) {
3408 user64_size_t usize;
3409 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3410 resid = (user_size_t)usize;
3411 } else {
3412 user32_size_t usize;
3413 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
3414 resid = (user_size_t)usize;
3415 }
3416 if (error)
3417 return(error);
3418 } else {
3419 resid = 0;
3420 }
3421 *residp = resid;
3422 return 0;
3423 }
3424
3425 static int
3426 kevent_put_data_size(struct proc *p,
3427 uint64_t data_available,
3428 unsigned int flags,
3429 user_size_t resid)
3430 {
3431 int error = 0;
3432
3433 if (data_available) {
3434 if (flags & KEVENT_FLAG_KERNEL) {
3435 *(user_size_t *)(uintptr_t)data_available = resid;
3436 } else if (IS_64BIT_PROCESS(p)) {
3437 user64_size_t usize = (user64_size_t)resid;
3438 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3439 } else {
3440 user32_size_t usize = (user32_size_t)resid;
3441 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
3442 }
3443 }
3444 return error;
3445 }
3446
3447 /*
3448 * kevent_continue - continue a kevent syscall after blocking
3449 *
3450 * assume we inherit a use count on the kq fileglob.
3451 */
3452
3453 __attribute__((noreturn))
3454 static void
3455 kevent_continue(__unused struct kqueue *kq, void *data, int error)
3456 {
3457 struct _kevent *cont_args;
3458 struct fileproc *fp;
3459 uint64_t data_available;
3460 user_size_t data_size;
3461 user_size_t data_resid;
3462 unsigned int flags;
3463 int32_t *retval;
3464 int noutputs;
3465 int fd;
3466 struct proc *p = current_proc();
3467
3468 cont_args = (struct _kevent *)data;
3469 data_available = cont_args->data_available;
3470 flags = cont_args->process_data.fp_flags;
3471 data_size = cont_args->process_data.fp_data_size;
3472 data_resid = cont_args->process_data.fp_data_resid;
3473 noutputs = cont_args->eventout;
3474 retval = cont_args->retval;
3475 fd = cont_args->fd;
3476 fp = cont_args->fp;
3477
3478 kevent_put_kq(p, fd, fp, kq);
3479
3480 /* don't abandon other output just because of residual copyout failures */
3481 if (error == 0 && data_available && data_resid != data_size) {
3482 (void)kevent_put_data_size(p, data_available, flags, data_resid);
3483 }
3484
3485 /* don't restart after signals... */
3486 if (error == ERESTART)
3487 error = EINTR;
3488 else if (error == EWOULDBLOCK)
3489 error = 0;
3490 if (error == 0)
3491 *retval = noutputs;
3492 unix_syscall_return(error);
3493 }
3494
3495 /*
3496 * kevent - [syscall] register and wait for kernel events
3497 *
3498 */
3499 int
3500 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
3501 {
3502 unsigned int flags = KEVENT_FLAG_LEGACY32;
3503
3504 return kevent_internal(p,
3505 (kqueue_id_t)uap->fd, NULL,
3506 uap->changelist, uap->nchanges,
3507 uap->eventlist, uap->nevents,
3508 0ULL, 0ULL,
3509 flags,
3510 uap->timeout,
3511 kevent_continue,
3512 retval);
3513 }
3514
3515 int
3516 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
3517 {
3518 unsigned int flags;
3519
3520 /* restrict to user flags and set legacy64 */
3521 flags = uap->flags & KEVENT_FLAG_USER;
3522 flags |= KEVENT_FLAG_LEGACY64;
3523
3524 return kevent_internal(p,
3525 (kqueue_id_t)uap->fd, NULL,
3526 uap->changelist, uap->nchanges,
3527 uap->eventlist, uap->nevents,
3528 0ULL, 0ULL,
3529 flags,
3530 uap->timeout,
3531 kevent_continue,
3532 retval);
3533 }
3534
3535 int
3536 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
3537 {
3538 /* restrict to user flags */
3539 uap->flags &= KEVENT_FLAG_USER;
3540
3541 return kevent_internal(p,
3542 (kqueue_id_t)uap->fd, NULL,
3543 uap->changelist, uap->nchanges,
3544 uap->eventlist, uap->nevents,
3545 uap->data_out, (uint64_t)uap->data_available,
3546 uap->flags,
3547 0ULL,
3548 kevent_continue,
3549 retval);
3550 }
3551
3552 int
3553 kevent_qos_internal(struct proc *p, int fd,
3554 user_addr_t changelist, int nchanges,
3555 user_addr_t eventlist, int nevents,
3556 user_addr_t data_out, user_size_t *data_available,
3557 unsigned int flags,
3558 int32_t *retval)
3559 {
3560 return kevent_internal(p,
3561 (kqueue_id_t)fd, NULL,
3562 changelist, nchanges,
3563 eventlist, nevents,
3564 data_out, (uint64_t)data_available,
3565 (flags | KEVENT_FLAG_KERNEL),
3566 0ULL,
3567 NULL,
3568 retval);
3569 }
3570
3571 int
3572 kevent_id(struct proc *p, struct kevent_id_args *uap, int32_t *retval)
3573 {
3574 /* restrict to user flags */
3575 uap->flags &= KEVENT_FLAG_USER;
3576
3577 return kevent_internal(p,
3578 (kqueue_id_t)uap->id, NULL,
3579 uap->changelist, uap->nchanges,
3580 uap->eventlist, uap->nevents,
3581 uap->data_out, (uint64_t)uap->data_available,
3582 (uap->flags | KEVENT_FLAG_DYNAMIC_KQUEUE),
3583 0ULL,
3584 kevent_continue,
3585 retval);
3586 }
3587
3588 int
3589 kevent_id_internal(struct proc *p, kqueue_id_t *id,
3590 user_addr_t changelist, int nchanges,
3591 user_addr_t eventlist, int nevents,
3592 user_addr_t data_out, user_size_t *data_available,
3593 unsigned int flags,
3594 int32_t *retval)
3595 {
3596 return kevent_internal(p,
3597 *id, id,
3598 changelist, nchanges,
3599 eventlist, nevents,
3600 data_out, (uint64_t)data_available,
3601 (flags | KEVENT_FLAG_KERNEL | KEVENT_FLAG_DYNAMIC_KQUEUE),
3602 0ULL,
3603 NULL,
3604 retval);
3605 }
3606
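/*
 * kevent_get_timeout - convert the user timeout into an absolute deadline
 *
 * KEVENT_FLAG_IMMEDIATE yields a deadline that has already passed, a
 * supplied timespec is added to the current uptime, and a zero timeval
 * in *atvp means "wait forever".
 */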
3607 static int
3608 kevent_get_timeout(struct proc *p,
3609 user_addr_t utimeout,
3610 unsigned int flags,
3611 struct timeval *atvp)
3612 {
3613 struct timeval atv;
3614 int error = 0;
3615
3616 if (flags & KEVENT_FLAG_IMMEDIATE) {
3617 getmicrouptime(&atv);
3618 } else if (utimeout != USER_ADDR_NULL) {
3619 struct timeval rtv;
3620 if (flags & KEVENT_FLAG_KERNEL) {
3621 struct timespec *tsp = (struct timespec *)utimeout;
3622 TIMESPEC_TO_TIMEVAL(&rtv, tsp);
3623 } else if (IS_64BIT_PROCESS(p)) {
3624 struct user64_timespec ts;
3625 error = copyin(utimeout, &ts, sizeof(ts));
3626 if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
3627 error = EINVAL;
3628 else
3629 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3630 } else {
3631 struct user32_timespec ts;
3632 error = copyin(utimeout, &ts, sizeof(ts));
3633 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
3634 }
3635 if (error)
3636 return (error);
3637 if (itimerfix(&rtv))
3638 return (EINVAL);
3639 getmicrouptime(&atv);
3640 timevaladd(&atv, &rtv);
3641 } else {
3642 /* wait forever value */
3643 atv.tv_sec = 0;
3644 atv.tv_usec = 0;
3645 }
3646 *atvp = atv;
3647 return 0;
3648 }
3649
3650 static int
3651 kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
3652 {
3653 /* each kq should only be used for events of one type */
3654 kqlock(kq);
3655 if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
3656 if (flags & KEVENT_FLAG_LEGACY32) {
3657 if ((kq->kq_state & KQ_KEV32) == 0) {
3658 kqunlock(kq);
3659 return EINVAL;
3660 }
3661 } else if (kq->kq_state & KQ_KEV32) {
3662 kqunlock(kq);
3663 return EINVAL;
3664 }
3665 } else if (flags & KEVENT_FLAG_LEGACY32) {
3666 kq->kq_state |= KQ_KEV32;
3667 } else if (flags & KEVENT_FLAG_LEGACY64) {
3668 kq->kq_state |= KQ_KEV64;
3669 } else {
3670 kq->kq_state |= KQ_KEV_QOS;
3671 }
3672 kqunlock(kq);
3673 return 0;
3674 }
3675
3676 #define KQ_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
3677 #define CONFIG_KQ_HASHSIZE CONFIG_KN_HASHSIZE
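/*
 * KQ_HASH folds the second byte of the id into the low bits before
 * masking, e.g. with a hypothetical mask of 0xff:
 *	KQ_HASH(0x1234, 0xff) == ((0x1234 ^ 0x12) & 0xff) == 0x26
 */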
3678
3679 static inline void
3680 kqhash_lock(proc_t p)
3681 {
3682 lck_mtx_lock_spin_always(&p->p_fd->fd_kqhashlock);
3683 }
3684
3685 static inline void
3686 kqhash_lock_held(__assert_only proc_t p)
3687 {
3688 LCK_MTX_ASSERT(&p->p_fd->fd_kqhashlock, LCK_MTX_ASSERT_OWNED);
3689 }
3690
3691 static inline void
3692 kqhash_unlock(proc_t p)
3693 {
3694 lck_mtx_unlock(&p->p_fd->fd_kqhashlock);
3695 }
3696
3697 static void
3698 kqueue_hash_init_if_needed(proc_t p)
3699 {
3700 struct filedesc *fdp = p->p_fd;
3701
3702 kqhash_lock_held(p);
3703
3704 if (__improbable(fdp->fd_kqhash == NULL)) {
3705 struct kqlist *alloc_hash;
3706 u_long alloc_mask;
3707
3708 kqhash_unlock(p);
3709 alloc_hash = hashinit(CONFIG_KQ_HASHSIZE, M_KQUEUE, &alloc_mask);
3710 kqhash_lock(p);
3711
3712 /* See if we won the race */
3713 if (fdp->fd_kqhashmask == 0) {
3714 fdp->fd_kqhash = alloc_hash;
3715 fdp->fd_kqhashmask = alloc_mask;
3716 } else {
3717 kqhash_unlock(p);
3718 FREE(alloc_hash, M_KQUEUE);
3719 kqhash_lock(p);
3720 }
3721 }
3722 }
3723
3724 /*
3725 * Called with the kqhash_lock() held
3726 */
3727 static void
3728 kqueue_hash_insert(
3729 struct proc *p,
3730 kqueue_id_t id,
3731 struct kqueue *kq)
3732 {
3733 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3734 struct filedesc *fdp = p->p_fd;
3735 struct kqlist *list;
3736
3737 /* should hold the kq hash lock */
3738 kqhash_lock_held(p);
3739
3740 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3741 assert(kq->kq_state & KQ_DYNAMIC);
3742 return;
3743 }
3744
3745 /* only dynamically allocate workloop kqs for now */
3746 assert(kq->kq_state & KQ_WORKLOOP);
3747 assert(fdp->fd_kqhash);
3748
3749 kqwl->kqwl_dynamicid = id;
3750
3751 list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3752 SLIST_INSERT_HEAD(list, kqwl, kqwl_hashlink);
3753 }
3754
3755 /* Called with kqhash_lock held */
3756 static void
3757 kqueue_hash_remove(
3758 struct proc *p,
3759 struct kqueue *kq)
3760 {
3761 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
3762 struct filedesc *fdp = p->p_fd;
3763 struct kqlist *list;
3764
3765 /* should hold the kq hash lock */
3766 kqhash_lock_held(p);
3767
3768 if ((kq->kq_state & KQ_DYNAMIC) == 0) {
3769 assert(kq->kq_state & KQ_DYNAMIC);
3770 return;
3771 }
3772 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3773 list = &fdp->fd_kqhash[KQ_HASH(kqwl->kqwl_dynamicid, fdp->fd_kqhashmask)];
3774 SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
3775 }
3776
3777 /* Called with kqhash_lock held */
3778 static struct kqueue *
3779 kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
3780 {
3781 struct filedesc *fdp = p->p_fd;
3782 struct kqlist *list;
3783 struct kqworkloop *kqwl;
3784
3785 /* should hold the kq hash lock */
3786 kqhash_lock_held(p);
3787
3788 if (fdp->fd_kqhashmask == 0) return NULL;
3789
3790 list = &fdp->fd_kqhash[KQ_HASH(id, fdp->fd_kqhashmask)];
3791 SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
3792 if (kqwl->kqwl_dynamicid == id) {
3793 struct kqueue *kq = (struct kqueue *)kqwl;
3794
3795 assert(kq->kq_state & KQ_DYNAMIC);
3796 assert(kq->kq_state & KQ_WORKLOOP); /* for now */
3797 return kq;
3798 }
3799 }
3800 return NULL;
3801 }
3802
3803 static inline void
3804 kqueue_release_last(struct proc *p, struct kqueue *kq)
3805 {
3806 if (kq->kq_state & KQ_DYNAMIC) {
3807 kqhash_lock(p);
3808 if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
3809 kqueue_hash_remove(p, kq);
3810 kqhash_unlock(p);
3811 kqueue_dealloc(kq);
3812 } else {
3813 kqhash_unlock(p);
3814 }
3815 }
3816 }
3817
3818 static struct kqueue *
3819 kevent_get_bound_kq(__assert_only struct proc *p, thread_t thread,
3820 unsigned int kev_flags, unsigned int kq_flags)
3821 {
3822 struct kqueue *kq;
3823 struct uthread *ut = get_bsdthread_info(thread);
3824
3825 assert(p == get_bsdthreadtask_info(thread));
3826
3827 if (!(ut->uu_kqueue_flags & kev_flags))
3828 return NULL;
3829
3830 kq = ut->uu_kqueue_bound;
3831 if (!kq)
3832 return NULL;
3833
3834 if (!(kq->kq_state & kq_flags))
3835 return NULL;
3836
3837 return kq;
3838 }
3839
3840 static int
3841 kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct fileproc **fpp, int *fdp, struct kqueue **kqp)
3842 {
3843 struct filedesc *descp = p->p_fd;
3844 struct fileproc *fp = NULL;
3845 struct kqueue *kq;
3846 int fd = 0;
3847 int error = 0;
3848
3849 /* Was the dynamic kqueue flag passed? Then this is definitely a workloop */
3850 if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) {
3851 assert(flags & KEVENT_FLAG_WORKLOOP);
3852 if (id == (kqueue_id_t)-1 &&
3853 (flags & KEVENT_FLAG_KERNEL) &&
3854 (flags & KEVENT_FLAG_WORKLOOP)) {
3855
3856 assert(is_workqueue_thread(current_thread()));
3857
3858 /*
3859 * When kevent_id_internal is called from within the
3860 * kernel and the passed 'id' value is '-1', we look
3861 * for the currently bound workloop kq.
3862 *
3863 * Until the pthread kext stops calling in to kevent_id_internal
3864 * for threads whose thread-request fulfillment was canceled,
3865 * calling in unbound can't be treated as fatal.
3866 */
3867 kq = kevent_get_bound_kq(p, current_thread(),
3868 KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
3869 if (kq) {
3870 kqueue_retain(kq);
3871 } else {
3872 struct uthread *ut = get_bsdthread_info(current_thread());
3873
3874 /* If thread is unbound due to cancel, just return an error */
3875 if (ut->uu_kqueue_flags == KEVENT_FLAG_WORKLOOP_CANCELED) {
3876 ut->uu_kqueue_flags = 0;
3877 error = ECANCELED;
3878 } else {
3879 panic("Unbound thread called kevent_internal with id=-1"
3880 " uu_kqueue_flags:0x%x, uu_kqueue_bound:%p",
3881 ut->uu_kqueue_flags, ut->uu_kqueue_bound);
3882 }
3883 }
3884
3885 *fpp = NULL;
3886 *fdp = 0;
3887 *kqp = kq;
3888 return error;
3889 }
3890
3891 /* try shortcut on kq lookup for bound threads */
3892 kq = kevent_get_bound_kq(p, current_thread(), KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
3893 if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) {
3894
3895 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3896 error = EEXIST;
3897 kq = NULL;
3898 goto out;
3899 }
3900
3901 /* retain a reference while working with this kq. */
3902 assert(kq->kq_state & KQ_DYNAMIC);
3903 kqueue_retain(kq);
3904 error = 0;
3905 goto out;
3906 }
3907
3908 /* look for the kq in the hash table */
3909 kqhash_lock(p);
3910 kq = kqueue_hash_lookup(p, id);
3911 if (kq == NULL) {
3912 kqhash_unlock(p);
3913
3914 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) {
3915 error = ENOENT;
3916 goto out;
3917 }
3918
3919 struct kqueue *alloc_kq;
3920 alloc_kq = kqueue_alloc(p, flags);
3921 if (alloc_kq) {
3922 kqhash_lock(p);
3923 kqueue_hash_init_if_needed(p);
3924 kq = kqueue_hash_lookup(p, id);
3925 if (kq == NULL) {
3926 /* insert our new one */
3927 kq = alloc_kq;
3928 kqueue_hash_insert(p, id, kq);
3929 kqhash_unlock(p);
3930 } else {
3931 /* lost race, retain existing workloop */
3932 kqueue_retain(kq);
3933 kqhash_unlock(p);
3934 kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
3935 kqueue_dealloc(alloc_kq);
3936 }
3937 } else {
3938 error = ENOMEM;
3939 goto out;
3940 }
3941 } else {
3942
3943 if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
3944 kqhash_unlock(p);
3945 kq = NULL;
3946 error = EEXIST;
3947 goto out;
3948 }
3949
3950 /* retain a reference while working with this kq. */
3951 assert(kq->kq_state & KQ_DYNAMIC);
3952 kqueue_retain(kq);
3953 kqhash_unlock(p);
3954 }
3955
3956 } else if (flags & KEVENT_FLAG_WORKQ) {
3957 /* must already exist for bound threads. */
3958 if (flags & KEVENT_FLAG_KERNEL) {
3959 assert(descp->fd_wqkqueue != NULL);
3960 }
3961
3962 /*
3963 * use the private kq associated with the proc workq.
3964 * Just being a thread within the process (and not
3965 * being the exit/exec thread) is enough to hold a
3966 * reference on this special kq.
3967 */
3968 kq = descp->fd_wqkqueue;
3969 if (kq == NULL) {
3970 struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
3971 if (alloc_kq == NULL)
3972 return ENOMEM;
3973
3974 knhash_lock(p);
3975 if (descp->fd_wqkqueue == NULL) {
3976 kq = descp->fd_wqkqueue = alloc_kq;
3977 knhash_unlock(p);
3978 } else {
3979 knhash_unlock(p);
3980 kq = descp->fd_wqkqueue;
3981 kqueue_dealloc(alloc_kq);
3982 }
3983 }
3984 } else {
3985 /* get a usecount for the kq itself */
3986 fd = (int)id;
3987 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
3988 return (error);
3989 }
3990 if ((error = kevent_set_kq_mode(kq, flags)) != 0) {
3991 /* drop the usecount */
3992 if (fp != NULL)
3993 fp_drop(p, fd, fp, 0);
3994 return error;
3995 }
3996
3997 out:
3998 *fpp = fp;
3999 *fdp = fd;
4000 *kqp = kq;
4001
4002 return error;
4003 }
4004
4005 static void
4006 kevent_put_kq(
4007 struct proc *p,
4008 kqueue_id_t id,
4009 struct fileproc *fp,
4010 struct kqueue *kq)
4011 {
4012 kqueue_release_last(p, kq);
4013 if (fp != NULL) {
4014 assert((kq->kq_state & KQ_WORKQ) == 0);
4015 fp_drop(p, (int)id, fp, 0);
4016 }
4017 }
4018
4019 static uint64_t
4020 kevent_workloop_serial_no_copyin(proc_t p, uint64_t workloop_id)
4021 {
4022 uint64_t serial_no = 0;
4023 user_addr_t addr;
4024 int rc;
4025
4026 if (workloop_id == 0 || p->p_dispatchqueue_serialno_offset == 0) {
4027 return 0;
4028 }
4029 addr = (user_addr_t)(workloop_id + p->p_dispatchqueue_serialno_offset);
4030
4031 if (proc_is64bit(p)) {
4032 rc = copyin(addr, (caddr_t)&serial_no, sizeof(serial_no));
4033 } else {
4034 uint32_t serial_no32 = 0;
4035 rc = copyin(addr, (caddr_t)&serial_no32, sizeof(serial_no32));
4036 serial_no = serial_no32;
4037 }
4038 return rc == 0 ? serial_no : 0;
4039 }
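
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * kevent_workloop_serial_no_copyin() above reads a serial number from
 * user memory at (workloop_id + p_dispatchqueue_serialno_offset): the
 * workloop id doubles as the address of a user-space object, and the
 * process has registered the offset of a serial-number field inside it.
 * The shape of that arrangement, seen from user space, is sketched
 * below; struct user_workloop and register_serialno_offset() are
 * hypothetical names, not libdispatch's real layout or interface.
 */
#if 0	/* example only, never compiled */
#include <stddef.h>
#include <stdint.h>

/* hypothetical user-space object whose address is used as the workloop id */
struct user_workloop {
	uint64_t wl_flags;
	uint64_t wl_serialno;	/* the field the kernel copies in */
};

static size_t
register_serialno_offset(void)
{
	/*
	 * The process would tell the kernel, once, where the serial number
	 * lives relative to the workloop id; the kernel stores that as
	 * p_dispatchqueue_serialno_offset and later performs
	 * copyin(workloop_id + offset, &serial_no, sizeof(serial_no)).
	 */
	return offsetof(struct user_workloop, wl_serialno);
}
#endif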
4040
4041 int
4042 kevent_exit_on_workloop_ownership_leak(thread_t thread)
4043 {
4044 proc_t p = current_proc();
4045 struct filedesc *fdp = p->p_fd;
4046 kqueue_id_t workloop_id = 0;
4047 os_reason_t reason;
4048 mach_vm_address_t addr;
4049 uint32_t reason_size;
4050
4051 kqhash_lock(p);
4052 if (fdp->fd_kqhashmask > 0) {
4053 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
4054 struct kqworkloop *kqwl;
4055
4056 SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
4057 struct kqueue *kq = &kqwl->kqwl_kqueue;
4058 if ((kq->kq_state & KQ_DYNAMIC) && kqwl->kqwl_owner == thread) {
4059 workloop_id = kqwl->kqwl_dynamicid;
4060 break;
4061 }
4062 }
4063 }
4064 }
4065 kqhash_unlock(p);
4066 assert(workloop_id);
4067
4068 reason = os_reason_create(OS_REASON_LIBSYSTEM,
4069 OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK);
4070 if (reason == OS_REASON_NULL) {
4071 goto out;
4072 }
4073
4074 reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
4075 reason_size = 2 * sizeof(uint64_t);
4076 reason_size = kcdata_estimate_required_buffer_size(2, reason_size);
4077 if (os_reason_alloc_buffer(reason, reason_size) != 0) {
4078 goto out;
4079 }
4080
4081 struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
4082
4083 if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
4084 sizeof(workloop_id), &addr) == KERN_SUCCESS) {
4085 kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
4086 }
4087
4088 uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
4089 if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
4090 sizeof(serial_no), &addr) == KERN_SUCCESS) {
4091 kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
4092 }
4093
4094 out:
4095 #if DEVELOPMENT || DEBUG
4096 psignal_try_thread_with_reason(p, thread, SIGABRT, reason);
4097 return 0;
4098 #else
4099 return exit_with_reason(p, W_EXITCODE(0, SIGKILL), (int *)NULL,
4100 FALSE, FALSE, 0, reason);
4101 #endif
4102 }
4103
4104
4105 static int
4106 kevent_servicer_detach_preflight(thread_t thread, unsigned int flags, struct kqueue *kq)
4107 {
4108 int error = 0;
4109 struct kqworkloop *kqwl;
4110 struct uthread *ut;
4111 struct kqrequest *kqr;
4112
4113 if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
4114 return EINVAL;
4115
4116 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4117 if (!(kq->kq_state & KQ_NO_WQ_THREAD))
4118 return EINVAL;
4119
4120 /* allow detach only on non-workqueue threads */
4121 if (is_workqueue_thread(thread))
4122 return EINVAL;
4123
4124 /* check that the current thread is bound to the requested kq */
4125 ut = get_bsdthread_info(thread);
4126 if (ut->uu_kqueue_bound != kq)
4127 return EINVAL;
4128
4129 kqwl = (struct kqworkloop *)kq;
4130 kqwl_req_lock(kqwl);
4131 kqr = &kqwl->kqwl_request;
4132
4133 /* check that the kq is bound to the thread */
4134 if ((kqr->kqr_state & KQR_BOUND) == 0 || (kqr->kqr_thread != thread))
4135 error = EINVAL;
4136
4137 kqwl_req_unlock(kqwl);
4138
4139 return error;
4140 }
4141
4142 static void
4143 kevent_servicer_detach_thread(struct proc *p, kqueue_id_t id, thread_t thread,
4144 unsigned int flags, struct kqueue *kq)
4145 {
4146 struct kqworkloop *kqwl;
4147 struct uthread *ut;
4148
4149 assert((flags & KEVENT_FLAG_WORKLOOP) && (kq->kq_state & KQ_WORKLOOP));
4150
4151 /* allow detach only on non-workqueue threads */
4152 assert(!is_workqueue_thread(thread));
4153
4154 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4155 assert(kq->kq_state & KQ_NO_WQ_THREAD);
4156
4157 /* check that the current thread is bound to the requested kq */
4158 ut = get_bsdthread_info(thread);
4159 assert(ut->uu_kqueue_bound == kq);
4160
4161 kqwl = (struct kqworkloop *)kq;
4162
4163 kqlock(kq);
4164
4165 /* unbind the thread.
4166 * The unbind itself checks whether the thread is still processing and ends it.
4167 */
4168 kqworkloop_unbind_thread(kqwl, thread, flags);
4169
4170 kqunlock(kq);
4171
4172 kevent_put_kq(p, id, NULL, kq);
4173
4174 return;
4175 }
4176
4177 static int
4178 kevent_servicer_attach_thread(thread_t thread, unsigned int flags, struct kqueue *kq)
4179 {
4180 int error = 0;
4181 struct kqworkloop *kqwl;
4182 struct uthread *ut;
4183 struct kqrequest *kqr;
4184
4185 if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
4186 return EINVAL;
4187
4188 /* only a kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
4189 if (!(kq->kq_state & KQ_NO_WQ_THREAD))
4190 return EINVAL;
4191
4192 /* allow attach only on non-workqueue threads */
4193 if (is_workqueue_thread(thread))
4194 return EINVAL;
4195
4196 /* check that the thread is not already bound */
4197 ut = get_bsdthread_info(thread);
4198 if (ut->uu_kqueue_bound != NULL)
4199 return EINVAL;
4200
4201 assert(ut->uu_kqueue_flags == 0);
4202
4203 kqlock(kq);
4204 kqwl = (struct kqworkloop *)kq;
4205 kqwl_req_lock(kqwl);
4206 kqr = &kqwl->kqwl_request;
4207
4208 /* check that the kqueue is not already bound */
4209 if (kqr->kqr_state & (KQR_BOUND | KQR_THREQUESTED | KQR_DRAIN)) {
4210 error = EINVAL;
4211 goto out;
4212 }
4213
4214 assert(kqr->kqr_thread == NULL);
4215 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
4216
4217 kqr->kqr_state |= KQR_THREQUESTED;
4218 kqr->kqr_qos_index = THREAD_QOS_UNSPECIFIED;
4219 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
4220 kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
4221 kqr->kqr_owner_override_is_sync = 0;
4222
4223 kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
4224
4225 /* get a ref on the wlkq on behalf of the attached thread */
4226 kqueue_retain(kq);
4227
4228 out:
4229 kqwl_req_unlock(kqwl);
4230 kqunlock(kq);
4231
4232 return error;
4233 }
4234
4235 static inline
4236 boolean_t kevent_args_requesting_events(unsigned int flags, int nevents)
4237 {
4238 return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0);
4239 }
4240
4241 static int
4242 kevent_internal(struct proc *p,
4243 kqueue_id_t id, kqueue_id_t *id_out,
4244 user_addr_t changelist, int nchanges,
4245 user_addr_t ueventlist, int nevents,
4246 user_addr_t data_out, uint64_t data_available,
4247 unsigned int flags,
4248 user_addr_t utimeout,
4249 kqueue_continue_t continuation,
4250 int32_t *retval)
4251 {
4252 struct _kevent *cont_args;
4253 uthread_t ut;
4254 struct kqueue *kq;
4255 struct fileproc *fp = NULL;
4256 int fd = 0;
4257 struct kevent_internal_s kev;
4258 int error, noutputs;
4259 struct timeval atv;
4260 user_size_t data_size;
4261 user_size_t data_resid;
4262 thread_t thread = current_thread();
4263
4264 /* Don't allow user-space threads to process output events from the workq kqs */
4265 if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) &&
4266 kevent_args_requesting_events(flags, nevents))
4267 return EINVAL;
4268
4269 /* restrict dynamic kqueue allocation to workloops (for now) */
4270 if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE)
4271 return EINVAL;
4272
4273 if (flags & (KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH |
4274 KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD)) {
4275
4276 /* allowed only on workloops when calling kevent_id from user-space */
4277 if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE))
4278 return EINVAL;
4279
4280 /* cannot attach and detach simultaneously */
4281 if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) && (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH))
4282 return EINVAL;
4283
4284 /* cannot ask for events and detach */
4285 if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) && kevent_args_requesting_events(flags, nevents))
4286 return EINVAL;
4287
4288 }
4289
4290 /* prepare to deal with stack-wise allocation of out events */
4291 if (flags & KEVENT_FLAG_STACK_EVENTS) {
4292 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
4293 (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
4294 sizeof(struct user32_kevent)) :
4295 ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
4296 sizeof(struct kevent_qos_s)));
4297 ueventlist += nevents * scale;
4298 }
4299
4300 /* convert timeout to absolute - if we have one (and not immediate) */
4301 error = kevent_get_timeout(p, utimeout, flags, &atv);
4302 if (error)
4303 return error;
4304
4305 /* copyin initial value of data residual from data_available */
4306 error = kevent_get_data_size(p, data_available, flags, &data_size);
4307 if (error)
4308 return error;
4309
4310 /* get the kq we are going to be working on */
4311 error = kevent_get_kq(p, id, flags, &fp, &fd, &kq);
4312 if (error)
4313 return error;
4314
4315 /* only bound threads can receive events on workloops */
4316 if ((flags & KEVENT_FLAG_WORKLOOP) && kevent_args_requesting_events(flags, nevents)) {
4317 ut = (uthread_t)get_bsdthread_info(thread);
4318 if (ut->uu_kqueue_bound != kq) {
4319 error = EXDEV;
4320 goto out;
4321 }
4322
4323 }
4324
4325 /* attach the current thread if necessary */
4326 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) {
4327 error = kevent_servicer_attach_thread(thread, flags, kq);
4328 if (error)
4329 goto out;
4330 }
4331 else {
4332 /* before processing events and committing to the system call, return an error if the thread cannot be detached when requested */
4333 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
4334 error = kevent_servicer_detach_preflight(thread, flags, kq);
4335 if (error)
4336 goto out;
4337 }
4338 }
4339
4340 if (id_out && kq && (flags & KEVENT_FLAG_WORKLOOP)) {
4341 assert(kq->kq_state & KQ_WORKLOOP);
4342 struct kqworkloop *kqwl;
4343 kqwl = (struct kqworkloop *)kq;
4344 *id_out = kqwl->kqwl_dynamicid;
4345 }
4346
4347 /* register all the change requests the user provided... */
4348 noutputs = 0;
4349 while (nchanges > 0 && error == 0) {
4350 error = kevent_copyin(&changelist, &kev, p, flags);
4351 if (error)
4352 break;
4353
4354 /* Make sure user doesn't pass in any system flags */
4355 kev.flags &= ~EV_SYSFLAGS;
4356
4357 kevent_register(kq, &kev, p);
4358
4359 if (nevents > 0 &&
4360 ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) {
4361 if (kev.flags & EV_RECEIPT) {
4362 kev.flags |= EV_ERROR;
4363 kev.data = 0;
4364 }
4365 error = kevent_copyout(&kev, &ueventlist, p, flags);
4366 if (error == 0) {
4367 nevents--;
4368 noutputs++;
4369 }
4370 } else if (kev.flags & EV_ERROR) {
4371 error = kev.data;
4372 }
4373 nchanges--;
4374 }
4375
4376 /* short-circuit the scan if we only want error events */
4377 if (flags & KEVENT_FLAG_ERROR_EVENTS)
4378 nevents = 0;
4379
4380 /* process pending events */
4381 if (nevents > 0 && noutputs == 0 && error == 0) {
4382 /* store the continuation/completion data in the uthread */
4383 ut = (uthread_t)get_bsdthread_info(thread);
4384 cont_args = &ut->uu_kevent.ss_kevent;
4385 cont_args->fp = fp;
4386 cont_args->fd = fd;
4387 cont_args->retval = retval;
4388 cont_args->eventlist = ueventlist;
4389 cont_args->eventcount = nevents;
4390 cont_args->eventout = noutputs;
4391 cont_args->data_available = data_available;
4392 cont_args->process_data.fp_fd = (int)id;
4393 cont_args->process_data.fp_flags = flags;
4394 cont_args->process_data.fp_data_out = data_out;
4395 cont_args->process_data.fp_data_size = data_size;
4396 cont_args->process_data.fp_data_resid = data_size;
4397
4398 error = kqueue_scan(kq, kevent_callback,
4399 continuation, cont_args,
4400 &cont_args->process_data,
4401 &atv, p);
4402
4403 /* process remaining outputs */
4404 noutputs = cont_args->eventout;
4405 data_resid = cont_args->process_data.fp_data_resid;
4406
4407 /* copyout residual data size value (if it needs to be copied out) */
4408 /* don't abandon other output just because of residual copyout failures */
4409 if (error == 0 && data_available && data_resid != data_size) {
4410 (void)kevent_put_data_size(p, data_available, flags, data_resid);
4411 }
4412 }
4413
4414 /* detach the current thread if necessary */
4415 if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
4416 assert(fp == NULL);
4417 kevent_servicer_detach_thread(p, id, thread, flags, kq);
4418 }
4419
4420 out:
4421 kevent_put_kq(p, id, fp, kq);
4422
4423 /* don't restart after signals... */
4424 if (error == ERESTART)
4425 error = EINTR;
4426 else if (error == EWOULDBLOCK)
4427 error = 0;
4428 if (error == 0)
4429 *retval = noutputs;
4430 return (error);
4431 }
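
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * The registration loop in kevent_internal() above reports per-change
 * failures back through the event list (EV_ERROR, with the errno in
 * 'data') rather than failing the whole call, and EV_RECEIPT forces a
 * result entry even on success. A minimal user-space caller observing
 * that protocol through the public kqueue()/kevent() interface:
 */
#if 0	/* example only, never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

static void
register_with_receipt(int fd)
{
	int kq = kqueue();
	struct kevent change, result;

	/* EV_RECEIPT: always return the outcome of this change as an EV_ERROR entry */
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);

	int n = kevent(kq, &change, 1, &result, 1, NULL);
	if (n == 1 && (result.flags & EV_ERROR)) {
		if (result.data == 0)
			printf("fd %d registered\n", fd);
		else
			printf("fd %d failed: errno %ld\n", fd, (long)result.data);
	}
	close(kq);
}
#endif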
4432
4433
4434 /*
4435 * kevent_callback - callback for each individual event
4436 *
4437 * called with nothing locked
4438 * caller holds a reference on the kqueue
4439 */
4440 static int
4441 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
4442 void *data)
4443 {
4444 struct _kevent *cont_args;
4445 int error;
4446
4447 cont_args = (struct _kevent *)data;
4448 assert(cont_args->eventout < cont_args->eventcount);
4449
4450 /*
4451 * Copy out the appropriate amount of event data for this user.
4452 */
4453 error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
4454 cont_args->process_data.fp_flags);
4455
4456 /*
4457 * If there isn't space for additional events, return
4458 * a harmless error to stop the processing here
4459 */
4460 if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
4461 error = EWOULDBLOCK;
4462 return (error);
4463 }
4464
4465 /*
4466 * kevent_description - format a description of a kevent for diagnostic output
4467 *
4468 * called with a 256-byte string buffer
4469 */
4470
4471 char *
4472 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
4473 {
4474 snprintf(s, n,
4475 "kevent="
4476 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
4477 kevp->ident,
4478 kevp->filter,
4479 kevp->flags,
4480 kevp->udata,
4481 kevp->fflags,
4482 kevp->data,
4483 kevp->ext[0],
4484 kevp->ext[1] );
4485
4486 return (s);
4487 }
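
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * Per the comment above, callers of kevent_description() supply a
 * 256-byte buffer. A hypothetical diagnostic call site could look like:
 */
#if 0	/* example only, never compiled */
static void
debug_log_kevent(struct kevent_internal_s *kevp)
{
	char buf[256];	/* sized per the "256-byte string buffer" convention above */

	printf("kevent debug: %s\n", kevent_description(kevp, buf, sizeof(buf)));
}
#endif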
4488
4489 /*
4490 * kevent_register - add a new event to a kqueue
4491 *
4492 * Creates a mapping between the event source and
4493 * the kqueue via a knote data structure.
4494 *
4495 * Because many/most of the event sources are file
4496 * descriptor related, the knote is linked off
4497 * the file descriptor table for quick access.
4498 *
4499 * called with nothing locked
4500 * caller holds a reference on the kqueue
4501 */
4502
4503 void
4504 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
4505 __unused struct proc *ctxp)
4506 {
4507 struct proc *p = kq->kq_p;
4508 const struct filterops *fops;
4509 struct knote *kn = NULL;
4510 int result = 0;
4511 int error = 0;
4512 unsigned short kev_flags = kev->flags;
4513 int knoteuse_flags = KNUSE_NONE;
4514
4515 if (kev->filter < 0) {
4516 if (kev->filter + EVFILT_SYSCOUNT < 0) {
4517 error = EINVAL;
4518 goto out;
4519 }
4520 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
4521 } else {
4522 error = EINVAL;
4523 goto out;
4524 }
4525
4526 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
4527 if ((kev->flags & EV_VANISHED) &&
4528 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
4529 error = EINVAL;
4530 goto out;
4531 }
4532
4533 /* Simplify the flags - delete and disable overrule */
4534 if (kev->flags & EV_DELETE)
4535 kev->flags &= ~EV_ADD;
4536 if (kev->flags & EV_DISABLE)
4537 kev->flags &= ~EV_ENABLE;
4538
4539 if (kq->kq_state & KQ_WORKLOOP) {
4540 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_REGISTER),
4541 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4542 kev->udata, kev->flags, kev->filter);
4543 } else if (kq->kq_state & KQ_WORKQ) {
4544 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_REGISTER),
4545 0, kev->udata, kev->flags, kev->filter);
4546 } else {
4547 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_REGISTER),
4548 VM_KERNEL_UNSLIDE_OR_PERM(kq),
4549 kev->udata, kev->flags, kev->filter);
4550 }
4551
4552 restart:
4553
4554 /* find the matching knote from the fd tables/hashes */
4555 kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
4556
4557 if (kn == NULL) {
4558 if (kev->flags & EV_ADD) {
4559 struct fileproc *knote_fp = NULL;
4560
4561 /* grab a file reference for the new knote */
4562 if (fops->f_isfd) {
4563 if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
4564 goto out;
4565 }
4566 }
4567
4568 kn = knote_alloc();
4569 if (kn == NULL) {
4570 error = ENOMEM;
4571 if (knote_fp != NULL)
4572 fp_drop(p, kev->ident, knote_fp, 0);
4573 goto out;
4574 }
4575
4576 kn->kn_fp = knote_fp;
4577 knote_set_kq(kn, kq);
4578 kqueue_retain(kq); /* retain a kq ref */
4579 kn->kn_filtid = ~kev->filter;
4580 kn->kn_inuse = 1; /* for f_attach() */
4581 kn->kn_status = KN_ATTACHING | KN_ATTACHED;
4582
4583 /* was vanish support requested */
4584 if (kev->flags & EV_VANISHED) {
4585 kev->flags &= ~EV_VANISHED;
4586 kn->kn_status |= KN_REQVANISH;
4587 }
4588
4589 /* snapshot matching/dispatching protocol flags into the knote */
4590 if (kev->flags & EV_DISPATCH)
4591 kn->kn_status |= KN_DISPATCH;
4592 if (kev->flags & EV_UDATA_SPECIFIC)
4593 kn->kn_status |= KN_UDATA_SPECIFIC;
4594
4595 /*
4596 * copy the kevent state into knote
4597 * protocol is that fflags and data
4598 * are saved off, and cleared before
4599 * calling the attach routine.
4600 */
4601 kn->kn_kevent = *kev;
4602 kn->kn_sfflags = kev->fflags;
4603 kn->kn_sdata = kev->data;
4604 kn->kn_fflags = 0;
4605 kn->kn_data = 0;
4606
4607 /* invoke pthread kext to convert kevent qos to thread qos */
4608 knote_canonicalize_kevent_qos(kn);
4609 knote_set_qos_index(kn, qos_index_from_qos(kn, kn->kn_qos, FALSE));
4610
4611 /* before anyone can find it */
4612 if (kev->flags & EV_DISABLE) {
4613 /*
4614 * do this before anyone can find it;
4615 * we can't call knote_disable() here because it expects
4616 * the kqlock to be held
4617 */
4618 kn->kn_status |= KN_DISABLED;
4619 }
4620
4621 /* Add the knote for lookup through the fd table */
4622 error = kq_add_knote(kq, kn, kev, p, &knoteuse_flags);
4623 if (error) {
4624 (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
4625 knote_free(kn);
4626 if (knote_fp != NULL)
4627 fp_drop(p, kev->ident, knote_fp, 0);
4628
4629 if (error == ERESTART) {
4630 error = 0;
4631 goto restart;
4632 }
4633 goto out;
4634 }
4635
4636 /* fp reference count now applies to knote */
4637 /* rwlock boost is now held */
4638
4639 /* call filter attach routine */
4640 result = fops->f_attach(kn, kev);
4641
4642 /*
4643 * Trade knote use count for kq lock.
4644 * Cannot be dropped because we held
4645 * KN_ATTACHING throughout.
4646 */
4647 knoteuse2kqlock(kq, kn, KNUSE_STEAL_DROP | knoteuse_flags);
4648
4649 if (kn->kn_flags & EV_ERROR) {
4650 /*
4651 * Failed to attach correctly, so drop.
4652 * All other possible users/droppers
4653 * have deferred to us. Save the error
4654 * to return to our caller.
4655 */
4656 kn->kn_status &= ~KN_ATTACHED;
4657 kn->kn_status |= KN_DROPPING;
4658 error = kn->kn_data;
4659 kqunlock(kq);
4660 knote_drop(kn, p);
4661 goto out;
4662 }
4663
4664 /* end "attaching" phase - now just attached */
4665 kn->kn_status &= ~KN_ATTACHING;
4666
4667 if (kn->kn_status & KN_DROPPING) {
4668 /*
4669 * Attach succeeded, but someone else
4670 * deferred their drop - now we have
4671 * to do it for them.
4672 */
4673 kqunlock(kq);
4674 knote_drop(kn, p);
4675 goto out;
4676 }
4677
4678 /* Mark the thread request overcommit, if appropriate */
4679 knote_set_qos_overcommit(kn);
4680
4681 /*
4682 * If the attach routine indicated that an
4683 * event is already fired, activate the knote.
4684 */
4685 if (result)
4686 knote_activate(kn);
4687
4688 if (knote_fops(kn)->f_post_attach) {
4689 error = knote_fops(kn)->f_post_attach(kn, kev);
4690 if (error) {
4691 kqunlock(kq);
4692 goto out;
4693 }
4694 }
4695
4696 } else {
4697 if ((kev_flags & (EV_ADD | EV_DELETE)) == (EV_ADD | EV_DELETE) &&
4698 (kq->kq_state & KQ_WORKLOOP)) {
4699 /*
4700 * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
4701 * that doesn't care about ENOENT, so just pretend the deletion
4702 * happened.
4703 */
4704 } else {
4705 error = ENOENT;
4706 }
4707 goto out;
4708 }
4709
4710 } else {
4711 /* existing knote: kqueue lock already taken by kq_find_knote_and_kq_lock */
4712
4713 if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
4714 /*
4715 * The knote is not in a stable state, wait for that
4716 * transition to complete and then redrive the lookup.
4717 */
4718 knoteusewait(kq, kn);
4719 goto restart;
4720 }
4721
4722 if (kev->flags & EV_DELETE) {
4723
4724 /*
4725 * If attempting to delete a disabled dispatch2 knote,
4726 * we must wait for the knote to be re-enabled (unless
4727 * it is being re-enabled atomically here).
4728 */
4729 if ((kev->flags & EV_ENABLE) == 0 &&
4730 (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
4731 (KN_DISPATCH2 | KN_DISABLED)) {
4732 kn->kn_status |= KN_DEFERDELETE;
4733 kqunlock(kq);
4734 error = EINPROGRESS;
4735 } else if (knote_fops(kn)->f_drop_and_unlock) {
4736 /*
4737 * The filter has requested to handle EV_DELETE events
4738 *
4739 * ERESTART means the kevent has to be re-evaluated
4740 */
4741 error = knote_fops(kn)->f_drop_and_unlock(kn, kev);
4742 if (error == ERESTART) {
4743 error = 0;
4744 goto restart;
4745 }
4746 } else if (kqlock2knotedrop(kq, kn)) {
4747 /* standard/default EV_DELETE path */
4748 knote_drop(kn, p);
4749 } else {
4750 /*
4751 * The kqueue is unlocked, it's not being
4752 * dropped, and kqlock2knotedrop returned 0:
4753 * this means that someone stole the drop of
4754 * the knote from us.
4755 */
4756 error = EINPROGRESS;
4757 }
4758 goto out;
4759 }
4760
4761 /*
4762 * If we are re-enabling a deferred-delete knote,
4763 * just enable it now and avoid calling the
4764 * filter touch routine (it has delivered its
4765 * last event already).
4766 */
4767 if ((kev->flags & EV_ENABLE) &&
4768 (kn->kn_status & KN_DEFERDELETE)) {
4769 assert(kn->kn_status & KN_DISABLED);
4770 knote_activate(kn);
4771 knote_enable(kn);
4772 kqunlock(kq);
4773 goto out;
4774 }
4775
4776 /*
4777 * If we are disabling, do it before unlocking and
4778 * calling the touch routine (so no processing can
4779 * see the new kevent state before the disable is
4780 * applied).
4781 */
4782 if (kev->flags & EV_DISABLE)
4783 knote_disable(kn);
4784
4785 /*
4786 * Convert the kqlock to a use reference on the
4787 * knote so we can call the filter touch routine.
4788 */
4789 if (knoteuse_needs_boost(kn, kev)) {
4790 knoteuse_flags |= KNUSE_BOOST;
4791 }
4792 if (kqlock2knoteuse(kq, kn, knoteuse_flags)) {
4793 /*
4794 * Call touch routine to notify filter of changes
4795 * in filter values (and to re-determine if any
4796 * events are fired).
4797 */
4798 result = knote_fops(kn)->f_touch(kn, kev);
4799
4800 /* Get the kq lock back (don't defer droppers). */
4801 if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
4802 kqunlock(kq);
4803 goto out;
4804 }
4805
4806 /* Handle errors during touch routine */
4807 if (kev->flags & EV_ERROR) {
4808 error = kev->data;
4809 kqunlock(kq);
4810 goto out;
4811 }
4812
4813 /* Activate it if the touch routine said to */
4814 if (result)
4815 knote_activate(kn);
4816 }
4817
4818 /* Enable the knote if called for */
4819 if (kev->flags & EV_ENABLE)
4820 knote_enable(kn);
4821
4822 }
4823
4824 /* still have kqlock held and knote is valid */
4825 kqunlock(kq);
4826
4827 out:
4828 /* output local errors through the kevent */
4829 if (error) {
4830 kev->flags |= EV_ERROR;
4831 kev->data = error;
4832 }
4833 }
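
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * From user space, the registration path handled by kevent_register()
 * above is reached through the public kevent(2) interface. A minimal
 * caller that creates a knote watching a socket for readability
 * (watch_readable() is a hypothetical helper name):
 */
#if 0	/* example only, never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
watch_readable(int sock)
{
	int kq = kqueue();
	struct kevent ev;

	/* EV_ADD creates (or updates) the knote; EV_CLEAR makes it edge-triggered */
	EV_SET(&ev, sock, EVFILT_READ, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1) {
		close(kq);
		return -1;
	}
	return kq;	/* later kevent() calls on kq report readability events */
}
#endif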
4834
4835
4836 /*
4837 * knote_process - process a triggered event
4838 *
4839 * Validate that it is really still a triggered event
4840 * by calling the filter routines (if necessary). Hold
4841 * a use reference on the knote to avoid it being detached.
4842 *
4843 * If it is still considered triggered, we will have taken
4844 * a copy of the state under the filter lock. We use that
4845 * snapshot to dispatch the knote for future processing (or
4846 * not, if this was a lost event).
4847 *
4848 * Our caller assures us that nobody else can be processing
4849 * events from this knote during the whole operation. But
4850 * others can be touching or posting events to the knote
4851 * interspersed with our processing it.
4852 *
4853 * caller holds a reference on the kqueue.
4854 * kqueue locked on entry and exit - but may be dropped
4855 */
4856 static int
4857 knote_process(struct knote *kn,
4858 kevent_callback_t callback,
4859 void *callback_data,
4860 struct filt_process_s *process_data,
4861 struct proc *p)
4862 {
4863 struct kevent_internal_s kev;
4864 struct kqueue *kq = knote_get_kq(kn);
4865 int result = 0;
4866 int error = 0;
4867
4868 bzero(&kev, sizeof(kev));
4869
4870 /*
4871 * Must be active or stayactive
4872 * Must be queued and not disabled/suppressed
4873 */
4874 assert(kn->kn_status & KN_QUEUED);
4875 assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE));
4876 assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING)));
4877
4878 if (kq->kq_state & KQ_WORKLOOP) {
4879 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS),
4880 ((struct kqworkloop *)kq)->kqwl_dynamicid,
4881 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4882 kn->kn_filtid);
4883 } else if (kq->kq_state & KQ_WORKQ) {
4884 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS),
4885 0, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
4886 kn->kn_filtid);
4887 } else {
4888 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS),
4889 VM_KERNEL_UNSLIDE_OR_PERM(kq), kn->kn_udata,
4890 kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
4891 }
4892
4893 /*
4894 * For deferred-drop or vanished events, we just create a fake
4895 * event to acknowledge end-of-life. Otherwise, we call the
4896 * filter's process routine to snapshot the kevent state under
4897 * the filter's locking protocol.
4898 */
4899 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
4900 /* create fake event */
4901 kev.filter = kn->kn_filter;
4902 kev.ident = kn->kn_id;
4903 kev.qos = kn->kn_qos;
4904 kev.flags = (kn->kn_status & KN_DEFERDELETE) ?
4905 EV_DELETE : EV_VANISHED;
4906 kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
4907 kev.udata = kn->kn_udata;
4908 result = 1;
4909
4910 knote_suppress(kn);
4911 } else {
4912 int flags = KNUSE_NONE;
4913 /* deactivate - so new activations indicate a wakeup */
4914 knote_deactivate(kn);
4915
4916 /* suppress knotes to avoid returning the same event multiple times in a single call. */
4917 knote_suppress(kn);
4918
4919 if (knoteuse_needs_boost(kn, NULL)) {
4920 flags |= KNUSE_BOOST;
4921 }
4922 /* convert lock to a knote use reference */
4923 if (!kqlock2knoteuse(kq, kn, flags))
4924 panic("dropping knote found on queue\n");
4925
4926 /* call out to the filter to process with just a ref */
4927 result = knote_fops(kn)->f_process(kn, process_data, &kev);
4928 if (result) flags |= KNUSE_STEAL_DROP;
4929
4930 /*
4931 * convert our reference back to a lock. accept drop
4932 * responsibility from others if we've committed to
4933 * delivering event data.
4934 */
4935 if (!knoteuse2kqlock(kq, kn, flags)) {
4936 /* knote dropped */
4937 kn = NULL;
4938 }
4939 }
4940
4941 if (kn != NULL) {
4942 /*
4943 * Determine how to dispatch the knote for future event handling.
4944 * not-fired: just return (do not call out, leave deactivated).
4945 * One-shot: If dispatch2, enter deferred-delete mode (unless this
4946 * is the deferred delete event delivery itself). Otherwise,
4947 * drop it.
4948 * stolendrop: We took responsibility for someone else's drop attempt.
4949 * treat this just like one-shot and prepare to turn it back
4950 * into a deferred delete if required.
4951 * Dispatch: don't clear state, just mark it disabled.
4952 * Cleared: just leave it deactivated.
4953 * Others: re-activate as there may be more events to handle.
4954 * This will not wake up more handlers right now, but
4955 * at the completion of handling events it may trigger
4956 * more handler threads (TODO: optimize based on more than
4957 * just this one event being detected by the filter).
4958 */
4959
4960 if (result == 0)
4961 return (EJUSTRETURN);
4962
4963 if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) {
4964 if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
4965 /* defer dropping non-delete oneshot dispatch2 events */
4966 kn->kn_status |= KN_DEFERDELETE;
4967 knote_disable(kn);
4968
4969 /* if we took over another's drop clear those flags here */
4970 if (kn->kn_status & KN_STOLENDROP) {
4971 assert(kn->kn_status & KN_DROPPING);
4972 /*
4973 * the knote will be dropped when the
4974 * deferred deletion occurs
4975 */
4976 kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP);
4977 }
4978 } else if (kn->kn_status & KN_STOLENDROP) {
4979 /* We now own the drop of the knote. */
4980 assert(kn->kn_status & KN_DROPPING);
4981 knote_unsuppress(kn);
4982 kqunlock(kq);
4983 knote_drop(kn, p);
4984 kqlock(kq);
4985 } else if (kqlock2knotedrop(kq, kn)) {
4986 /* just EV_ONESHOT, _not_ DISPATCH2 */
4987 knote_drop(kn, p);
4988 kqlock(kq);
4989 }
4990 } else if (kn->kn_status & KN_DISPATCH) {
4991 /* disable all dispatch knotes */
4992 knote_disable(kn);
4993 } else if ((kev.flags & EV_CLEAR) == 0) {
4994 /* re-activate in case there are more events */
4995 knote_activate(kn);
4996 }
4997 }
4998
4999 /*
5000 * callback to handle each event as we find it.
5001 * If we have to detach and drop the knote, do
5002 * it while we have the kq unlocked.
5003 */
5004 if (result) {
5005 kqunlock(kq);
5006 error = (callback)(kq, &kev, callback_data);
5007 kqlock(kq);
5008 }
5009 return (error);
5010 }
5011
5012
5013 /*
5014 * Return 0 to indicate that processing should proceed,
5015 * -1 if there is nothing to process.
5016 *
5017 * Called with kqueue locked and returns the same way,
5018 * but may drop lock temporarily.
5019 */
5020 static int
5021 kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
5022 {
5023 struct kqrequest *kqr;
5024 thread_t self = current_thread();
5025 __assert_only struct uthread *ut = get_bsdthread_info(self);
5026
5027 assert(kqwq->kqwq_state & KQ_WORKQ);
5028 assert(qos_index < KQWQ_NQOS);
5029
5030 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
5031 flags, qos_index);
5032
5033 kqwq_req_lock(kqwq);
5034
5035 kqr = kqworkq_get_request(kqwq, qos_index);
5036
5037 /* manager skips buckets that haven't asked for its help */
5038 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5039
5040 /* If nothing for manager to do, just return */
5041 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
5042 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
5043 0, kqr->kqr_state);
5044 kqwq_req_unlock(kqwq);
5045 return -1;
5046 }
5047 /* bind manager thread from this time on */
5048 kqworkq_bind_thread_impl(kqwq, qos_index, self, flags);
5049
5050 } else {
5051 /* We should already be bound to this kqueue */
5052 assert(kqr->kqr_state & KQR_BOUND);
5053 assert(kqr->kqr_thread == self);
5054 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
5055 assert(ut->uu_kqueue_qos_index == qos_index);
5056 assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags);
5057 }
5058
5059 /*
5060 * we should have been requested to be here
5061 * and nobody else should still be processing
5062 */
5063 assert(kqr->kqr_state & KQR_WAKEUP);
5064 assert(kqr->kqr_state & KQR_THREQUESTED);
5065 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
5066
5067 /* reset wakeup trigger to catch new events after we start processing */
5068 kqr->kqr_state &= ~KQR_WAKEUP;
5069
5070 /* convert to processing mode */
5071 kqr->kqr_state |= KQR_PROCESSING;
5072
5073 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
5074 kqr_thread_id(kqr), kqr->kqr_state);
5075
5076 kqwq_req_unlock(kqwq);
5077 return 0;
5078 }
5079
5080 static inline bool
5081 kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
5082 {
5083 struct kqueue *kq = &kqwl->kqwl_kqueue;
5084
5085 kqlock_held(kq);
5086
5087 if (kq->kq_state & KQ_PROCESSING) {
5088 /*
5089 * KQ_PROCESSING is unset with the kqlock held, and the kqr thread is
5090 * never modified while KQ_PROCESSING is set, meaning that peeking at
5091 * its value is safe from this context.
5092 */
5093 return kqwl->kqwl_request.kqr_thread == current_thread();
5094 }
5095 return false;
5096 }
5097
5098 static void
5099 kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_override)
5100 {
5101 struct kqrequest *kqr = &kqwl->kqwl_request;
5102 struct knote *kn, *tmp;
5103
5104 kqlock_held(&kqwl->kqwl_kqueue);
5105
5106 TAILQ_FOREACH_SAFE(kn, &kqr->kqr_suppressed, kn_tqe, tmp) {
5107 /*
5108 * If a knote that can adjust QoS is disabled because of the automatic
5109 * behavior of EV_DISPATCH, the knotes should stay suppressed so that
5110 * further overrides keep pushing.
5111 */
5112 if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
5113 (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
5114 (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
5115 /*
5116 * When called from unbind, clear the sync ipc override on the knote
5117 * for events which are delivered.
5118 */
5119 if (clear_ipc_override) {
5120 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
5121 }
5122 continue;
5123 }
5124 knote_unsuppress(kn);
5125 }
5126 }
5127
5128 static int
5129 kqworkloop_begin_processing(struct kqworkloop *kqwl,
5130 __assert_only unsigned int flags)
5131 {
5132 struct kqrequest *kqr = &kqwl->kqwl_request;
5133 struct kqueue *kq = &kqwl->kqwl_kqueue;
5134
5135 kqlock_held(kq);
5136
5137 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
5138 kqwl->kqwl_dynamicid, flags, 0);
5139
5140 kqwl_req_lock(kqwl);
5141
5142 /* nobody else should still be processing */
5143 assert((kqr->kqr_state & KQR_PROCESSING) == 0);
5144 assert((kq->kq_state & KQ_PROCESSING) == 0);
5145
5146 kqr->kqr_state |= KQR_PROCESSING | KQR_R2K_NOTIF_ARMED;
5147 kq->kq_state |= KQ_PROCESSING;
5148
5149 kqwl_req_unlock(kqwl);
5150
5151 kqworkloop_acknowledge_events(kqwl, FALSE);
5152
5153 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
5154 kqwl->kqwl_dynamicid, flags, 0);
5155
5156 return 0;
5157 }
5158
5159 /*
5160 * Return 0 to indicate that processing should proceed,
5161 * -1 if there is nothing to process.
5162 *
5163 * Called with kqueue locked and returns the same way,
5164 * but may drop lock temporarily.
5165 * May block.
5166 */
5167 static int
5168 kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
5169 {
5170 struct kqtailq *suppressq;
5171
5172 kqlock_held(kq);
5173
5174 if (kq->kq_state & KQ_WORKQ) {
5175 return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags);
5176 } else if (kq->kq_state & KQ_WORKLOOP) {
5177 return kqworkloop_begin_processing((struct kqworkloop*)kq, flags);
5178 }
5179
5180 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
5181 VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
5182
5183 assert(qos_index == QOS_INDEX_KQFILE);
5184
5185 /* wait to become the exclusive processing thread */
5186 for (;;) {
5187 if (kq->kq_state & KQ_DRAIN) {
5188 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5189 VM_KERNEL_UNSLIDE_OR_PERM(kq), 2);
5190 return -1;
5191 }
5192
5193 if ((kq->kq_state & KQ_PROCESSING) == 0)
5194 break;
5195
5196 /* if someone else is processing the queue, wait */
5197 kq->kq_state |= KQ_PROCWAIT;
5198 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5199 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5200 CAST_EVENT64_T(suppressq),
5201 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
5202
5203 kqunlock(kq);
5204 thread_block(THREAD_CONTINUE_NULL);
5205 kqlock(kq);
5206 }
5207
5208 /* Nobody else processing */
5209
5210 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
5211 waitq_set_clear_preposts(&kq->kq_wqs);
5212 kq->kq_state &= ~KQ_WAKEUP;
5213
5214 /* anything left to process? */
5215 if (kqueue_queue_empty(kq, qos_index)) {
5216 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5217 VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
5218 return -1;
5219 }
5220
5221 /* convert to processing mode */
5222 kq->kq_state |= KQ_PROCESSING;
5223
5224 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
5225 VM_KERNEL_UNSLIDE_OR_PERM(kq));
5226
5227 return 0;
5228 }
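
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * For plain kqueues, kqueue_begin_processing() above serializes
 * processors with the assert-wait-then-block idiom: mark that a waiter
 * exists, assert a wait, drop the lock, block, retake the lock, and
 * re-check in a loop. A condition-variable analogue of the same shape
 * (all names here are hypothetical):
 */
#if 0	/* example only, never compiled */
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cv   = PTHREAD_COND_INITIALIZER;
static bool processing;	/* analogue of KQ_PROCESSING */

static void
begin_processing(void)
{
	pthread_mutex_lock(&q_lock);
	while (processing) {
		/* analogue of KQ_PROCWAIT + waitq_assert_wait64() + thread_block() */
		pthread_cond_wait(&q_cv, &q_lock);
	}
	processing = true;	/* we are now the exclusive processor */
	pthread_mutex_unlock(&q_lock);
}

static void
end_processing(void)
{
	pthread_mutex_lock(&q_lock);
	processing = false;
	pthread_cond_broadcast(&q_cv);	/* analogue of waitq_wakeup64_all() */
	pthread_mutex_unlock(&q_lock);
}
#endif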
5229
5230 /*
5231 * kqworkq_end_processing - Complete the processing of a workq kqueue
5232 *
5233 * We may have to request new threads.
5234 * This can happen when there are no waiting processing threads and:
5235 * - there were active events we never got to (count > 0)
5236 * - we pended waitq hook callouts during processing
5237 * - we pended wakeups while processing (or unsuppressing)
5238 *
5239 * Called with kqueue lock held.
5240 */
5241 static void
5242 kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
5243 {
5244 #pragma unused(flags)
5245
5246 struct kqueue *kq = &kqwq->kqwq_kqueue;
5247 struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5248
5249 thread_t self = current_thread();
5250 struct uthread *ut = get_bsdthread_info(self);
5251 struct knote *kn;
5252 struct kqrequest *kqr;
5253 thread_t thread;
5254
5255 assert(kqwq->kqwq_state & KQ_WORKQ);
5256 assert(qos_index < KQWQ_NQOS);
5257
5258 /* Are we really bound to this kqueue? */
5259 if (ut->uu_kqueue_bound != kq) {
5260 assert(ut->uu_kqueue_bound == kq);
5261 return;
5262 }
5263
5264 kqr = kqworkq_get_request(kqwq, qos_index);
5265
5266 kqwq_req_lock(kqwq);
5267
5268 /* Do we claim to be manager? */
5269 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5270
5271 /* bail if not bound that way */
5272 if (ut->uu_kqueue_qos_index != KQWQ_QOS_MANAGER ||
5273 (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0) {
5274 assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
5275 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
5276 kqwq_req_unlock(kqwq);
5277 return;
5278 }
5279
5280 /* bail if this request wasn't already getting manager help */
5281 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0 ||
5282 (kqr->kqr_state & KQR_PROCESSING) == 0) {
5283 kqwq_req_unlock(kqwq);
5284 return;
5285 }
5286 } else {
5287 if (ut->uu_kqueue_qos_index != qos_index ||
5288 (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER)) {
5289 assert(ut->uu_kqueue_qos_index == qos_index);
5290 assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0);
5291 kqwq_req_unlock(kqwq);
5292 return;
5293 }
5294 }
5295
5296 assert(kqr->kqr_state & KQR_BOUND);
5297 thread = kqr->kqr_thread;
5298 assert(thread == self);
5299
5300 assert(kqr->kqr_state & KQR_PROCESSING);
5301
5302 /* If we didn't drain the whole queue, re-mark a wakeup being needed */
5303 if (!kqueue_queue_empty(kq, qos_index))
5304 kqr->kqr_state |= KQR_WAKEUP;
5305
5306 kqwq_req_unlock(kqwq);
5307
5308 /*
5309 * Return suppressed knotes to their original state.
5310 * For workq kqueues, suppressed ones that are still
5311 * truly active (not just forced into the queue) will
5312 * set flags we check below to see if anything got
5313 * woken up.
5314 */
5315 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
5316 assert(kn->kn_status & KN_SUPPRESSED);
5317 knote_unsuppress(kn);
5318 }
5319
5320 kqwq_req_lock(kqwq);
5321
5322 /* Indicate that we are done processing this request */
5323 kqr->kqr_state &= ~KQR_PROCESSING;
5324
5325 /*
5326 * Drop our association with this one request and its
5327 * override on us.
5328 */
5329 kqworkq_unbind_thread(kqwq, qos_index, thread, flags);
5330
5331 /*
5332 * request a new thread if we didn't process the whole
5333 * queue or real events have happened (not just putting
5334 * stay-active events back).
5335 */
5336 if (kqr->kqr_state & KQR_WAKEUP) {
5337 if (kqueue_queue_empty(kq, qos_index)) {
5338 kqr->kqr_state &= ~KQR_WAKEUP;
5339 } else {
5340 kqworkq_request_thread(kqwq, qos_index);
5341 }
5342 }
5343 kqwq_req_unlock(kqwq);
5344 }
5345
5346 static void
5347 kqworkloop_end_processing(struct kqworkloop *kqwl, int nevents,
5348 unsigned int flags)
5349 {
5350 struct kqrequest *kqr = &kqwl->kqwl_request;
5351 struct kqueue *kq = &kqwl->kqwl_kqueue;
5352
5353 kqlock_held(kq);
5354
5355 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
5356 kqwl->kqwl_dynamicid, flags, 0);
5357
5358 if ((kq->kq_state & KQ_NO_WQ_THREAD) && nevents == 0 &&
5359 (flags & KEVENT_FLAG_IMMEDIATE) == 0) {
5360 /*
5361 * <rdar://problem/31634014> We may soon block, but have returned no
5362 * kevents that need to be kept suppressed for overriding purposes.
5363 *
5364 * It is hence safe to acknowledge events and unsuppress everything, so
5365 * that if we block we can observe all events firing.
5366 */
5367 kqworkloop_acknowledge_events(kqwl, TRUE);
5368 }
5369
5370 kqwl_req_lock(kqwl);
5371
5372 assert(kqr->kqr_state & KQR_PROCESSING);
5373 assert(kq->kq_state & KQ_PROCESSING);
5374
5375 kq->kq_state &= ~KQ_PROCESSING;
5376 kqr->kqr_state &= ~KQR_PROCESSING;
5377 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
5378
5379 kqwl_req_unlock(kqwl);
5380
5381 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
5382 kqwl->kqwl_dynamicid, flags, 0);
5383 }
5384
5385 /*
5386 * Called with kqueue lock held.
5387 */
5388 static void
5389 kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index,
5390 int nevents, unsigned int flags)
5391 {
5392 struct knote *kn;
5393 struct kqtailq *suppressq;
5394 int procwait;
5395
5396 kqlock_held(kq);
5397
5398 assert((kq->kq_state & KQ_WORKQ) == 0);
5399
5400 if (kq->kq_state & KQ_WORKLOOP) {
5401 return kqworkloop_end_processing((struct kqworkloop *)kq, nevents, flags);
5402 }
5403
5404 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
5405 VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
5406
5407 assert(qos_index == QOS_INDEX_KQFILE);
5408
5409 /*
5410 * Return suppressed knotes to their original state.
5411 */
5412 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
5413 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
5414 assert(kn->kn_status & KN_SUPPRESSED);
5415 knote_unsuppress(kn);
5416 }
5417
5418 procwait = (kq->kq_state & KQ_PROCWAIT);
5419 kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
5420
5421 if (procwait) {
5422 /* first wake up any thread already waiting to process */
5423 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
5424 CAST_EVENT64_T(suppressq),
5425 THREAD_AWAKENED,
5426 WAITQ_ALL_PRIORITIES);
5427 }
5428 }
5429
5430 /*
5431 * kqwq_internal_bind - bind thread to processing workq kqueue
5432 *
5433 * Determines if the provided thread will be responsible for
5434 * servicing the particular QoS class index specified in the
5435 * parameters. Once the binding is done, any overrides that may
5436 * be associated with the cooresponding events can be applied.
5437 *
5438 * This should be called as soon as the thread identity is known,
5439 * preferably while still at high priority during creation.
5440 *
5441 * - caller holds a reference on the process (and workq kq)
5442 * - the thread MUST call kevent_qos_internal after being bound
5443 * or the bucket of events may never be delivered.
5444 * - Nothing locked
5445 * (unless this is a synchronous bind, then the request is locked)
5446 */
5447 static int
5448 kqworkq_internal_bind(
5449 struct proc *p,
5450 kq_index_t qos_index,
5451 thread_t thread,
5452 unsigned int flags)
5453 {
5454 struct kqueue *kq;
5455 struct kqworkq *kqwq;
5456 struct kqrequest *kqr;
5457 struct uthread *ut = get_bsdthread_info(thread);
5458
5459 /* If no process workq, can't be our thread. */
5460 kq = p->p_fd->fd_wqkqueue;
5461
5462 if (kq == NULL)
5463 return 0;
5464
5465 assert(kq->kq_state & KQ_WORKQ);
5466 kqwq = (struct kqworkq *)kq;
5467
5468 /*
5469 * No need to bind the manager thread to any specific
5470 * bucket, but still claim the thread.
5471 */
5472 if (qos_index == KQWQ_QOS_MANAGER) {
5473 assert(ut->uu_kqueue_bound == NULL);
5474 assert(flags & KEVENT_FLAG_WORKQ_MANAGER);
5475 ut->uu_kqueue_bound = kq;
5476 ut->uu_kqueue_qos_index = qos_index;
5477 ut->uu_kqueue_flags = flags;
5478
5479 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
5480 thread_tid(thread), flags, qos_index);
5481
5482 return 1;
5483 }
5484
5485 /*
5486 * If this is a synchronous bind callback, the request
5487 * lock is already held, so just do the bind.
5488 */
5489 if (flags & KEVENT_FLAG_SYNCHRONOUS_BIND) {
5490 kqwq_req_held(kqwq);
5491 /* strip out the synchronous bind flag */
5492 flags &= ~KEVENT_FLAG_SYNCHRONOUS_BIND;
5493 kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
5494 return 1;
5495 }
5496
5497 /*
5498 * check the request that corresponds to our qos_index
5499 * to see if there is an outstanding request.
5500 */
5501 kqr = kqworkq_get_request(kqwq, qos_index);
5502 assert(kqr->kqr_qos_index == qos_index);
5503 kqwq_req_lock(kqwq);
5504
5505 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
5506 thread_tid(thread), flags, qos_index, kqr->kqr_state);
5507
5508 if ((kqr->kqr_state & KQR_THREQUESTED) &&
5509 (kqr->kqr_state & KQR_PROCESSING) == 0) {
5510
5511 if ((kqr->kqr_state & KQR_BOUND) &&
5512 thread == kqr->kqr_thread) {
5513 /* duplicate bind - claim the thread */
5514 assert(ut->uu_kqueue_bound == kq);
5515 assert(ut->uu_kqueue_qos_index == qos_index);
5516 kqwq_req_unlock(kqwq);
5517 return 1;
5518 }
5519 if ((kqr->kqr_state & (KQR_BOUND | KQWQ_THMANAGER)) == 0) {
5520 /* ours to bind to */
5521 kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
5522 kqwq_req_unlock(kqwq);
5523 return 1;
5524 }
5525 }
5526 kqwq_req_unlock(kqwq);
5527 return 0;
5528 }
5529
5530 static void
5531 kqworkloop_bind_thread_impl(struct kqworkloop *kqwl,
5532 thread_t thread,
5533 __assert_only unsigned int flags)
5534 {
5535 assert(flags & KEVENT_FLAG_WORKLOOP);
5536
5537 /* the request object must be locked */
5538 kqwl_req_held(kqwl);
5539
5540 struct kqrequest *kqr = &kqwl->kqwl_request;
5541 struct uthread *ut = get_bsdthread_info(thread);
5542 boolean_t ipc_override_is_sync;
5543 kq_index_t qos_index = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
5544
5545 /* nobody else bound so finally bind (as a workloop) */
5546 assert(kqr->kqr_state & KQR_THREQUESTED);
5547 assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == 0);
5548 assert(thread != kqwl->kqwl_owner);
5549
5550 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND),
5551 kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
5552 qos_index,
5553 (uintptr_t)(((uintptr_t)kqr->kqr_override_index << 16) |
5554 (((uintptr_t)kqr->kqr_state) << 8) |
5555 ((uintptr_t)ipc_override_is_sync)));
5556
5557 kqr->kqr_state |= KQR_BOUND | KQR_R2K_NOTIF_ARMED;
5558 kqr->kqr_thread = thread;
5559
5560 /* bind the workloop to the uthread */
5561 ut->uu_kqueue_bound = (struct kqueue *)kqwl;
5562 ut->uu_kqueue_flags = flags;
5563 ut->uu_kqueue_qos_index = qos_index;
5564 assert(ut->uu_kqueue_override_is_sync == 0);
5565 ut->uu_kqueue_override_is_sync = ipc_override_is_sync;
5566 if (qos_index) {
5567 thread_add_ipc_override(thread, qos_index);
5568 }
5569 if (ipc_override_is_sync) {
5570 thread_add_sync_ipc_override(thread);
5571 }
5572 }
5573
5574 /*
5575 * workloop_fulfill_threadreq - bind thread to processing workloop
5576 *
5577 * The provided thread will be responsible for delivering events
5578 * associated with the given kqrequest. Bind it and get ready for
5579 * the thread to eventually arrive.
5580 *
5581 * If WORKLOOP_FULFILL_THREADREQ_SYNC is specified, the call is being
5582 * made within the context of the pthread_functions->workq_threadreq
5583 * callout. In this case, the request structure is already locked.
5584 */
5585 int
5586 workloop_fulfill_threadreq(struct proc *p,
5587 workq_threadreq_t req,
5588 thread_t thread,
5589 int flags)
5590 {
5591 int sync = (flags & WORKLOOP_FULFILL_THREADREQ_SYNC);
5592 int cancel = (flags & WORKLOOP_FULFILL_THREADREQ_CANCEL);
5593 struct kqrequest *kqr;
5594 struct kqworkloop *kqwl;
5595
5596 kqwl = (struct kqworkloop *)((uintptr_t)req -
5597 offsetof(struct kqworkloop, kqwl_request) -
5598 offsetof(struct kqrequest, kqr_req));
5599 kqr = &kqwl->kqwl_request;
5600
5601 /* sanity-check that we're looking at a valid workloop */
5602 if (kqwl->kqwl_p != p ||
5603 (kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
5604 assert(kqwl->kqwl_p == p);
5605 assert(kqwl->kqwl_state & KQ_WORKLOOP);
5606 return EINVAL;
5607 }
5608
5609 if (!sync)
5610 kqwl_req_lock(kqwl);
5611
5612 /* Should be a pending request */
5613 if ((kqr->kqr_state & KQR_BOUND) ||
5614 (kqr->kqr_state & KQR_THREQUESTED) == 0) {
5615
5616 assert((kqr->kqr_state & KQR_BOUND) == 0);
5617 assert(kqr->kqr_state & KQR_THREQUESTED);
5618 if (!sync)
5619 kqwl_req_unlock(kqwl);
5620 return EINPROGRESS;
5621 }
5622
5623 assert((kqr->kqr_state & KQR_DRAIN) == 0);
5624
5625 /*
5626 * Is this a cancel indication from pthread?
5627 * If so, we must be exiting/exec'ing. Forget
5628 * our pending request.
5629 */
5630 if (cancel) {
5631 kqr->kqr_state &= ~KQR_THREQUESTED;
5632 kqr->kqr_state |= KQR_DRAIN;
5633 } else {
5634 /* otherwise, do the actual bind */
5635 kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
5636 }
5637
5638 if (!sync)
5639 kqwl_req_unlock(kqwl);
5640
5641 if (cancel)
5642 kqueue_release_last(p, &kqwl->kqwl_kqueue); /* may dealloc kq */
5643
5644 return 0;
5645 }
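
/*
 * Editor's note -- illustrative sketch, not part of the original source.
 * workloop_fulfill_threadreq() above recovers the enclosing kqworkloop
 * from the embedded request pointer by subtracting the offsets of the
 * nested members -- the usual "container of" idiom. A generic form of
 * the same arithmetic (container_of here is a local illustrative macro,
 * not one defined in this file):
 */
#if 0	/* example only, never compiled */
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inner { int value; };
struct outer { int tag; struct inner in; };

static struct outer *
outer_from_inner(struct inner *ip)
{
	/* same arithmetic as the kqwl recovery above, with one level of nesting */
	return container_of(ip, struct outer, in);
}
#endif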
5646
5647
5648 /*
5649 * kevent_qos_internal_bind - bind thread to processing kqueue
5650 *
5651 * Indicates that the provided thread will be responsible for
5652 * servicing the particular QoS class index specified in the
5653 * parameters. Once the binding is done, any overrides that may
5654 * be associated with the corresponding events can be applied.
5655 *
5656 * This should be called as soon as the thread identity is known,
5657 * preferably while still at high priority during creation.
5658 *
5659 * - caller holds a reference on the kqueue.
5660 * - the thread MUST call kevent_qos_internal after being bound
5661 * or the bucket of events may never be delivered.
5662 * - Nothing locked (may take mutex or block).
5663 */
5664
5665 int
5666 kevent_qos_internal_bind(
5667 struct proc *p,
5668 int qos_class,
5669 thread_t thread,
5670 unsigned int flags)
5671 {
5672 kq_index_t qos_index;
5673
5674 assert(flags & KEVENT_FLAG_WORKQ);
5675
5676 if (thread == THREAD_NULL || (flags & KEVENT_FLAG_WORKQ) == 0) {
5677 return EINVAL;
5678 }
5679
5680 /* get the qos index we're going to service */
5681 qos_index = qos_index_for_servicer(qos_class, thread, flags);
5682
5683 if (kqworkq_internal_bind(p, qos_index, thread, flags))
5684 return 0;
5685
5686 return EINPROGRESS;
5687 }
5688
5689
5690 static void
5691 kqworkloop_internal_unbind(
5692 struct proc *p,
5693 thread_t thread,
5694 unsigned int flags)
5695 {
5696 struct kqueue *kq;
5697 struct kqworkloop *kqwl;
5698 struct uthread *ut = get_bsdthread_info(thread);
5699
5700 assert(ut->uu_kqueue_bound != NULL);
5701 kq = ut->uu_kqueue_bound;
5702 assert(kq->kq_state & KQ_WORKLOOP);
5703 kqwl = (struct kqworkloop *)kq;
5704
5705 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND),
5706 kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
5707 flags, 0);
5708
5709 if (!(kq->kq_state & KQ_NO_WQ_THREAD)) {
5710 assert(is_workqueue_thread(thread));
5711
5712 kqlock(kq);
5713 kqworkloop_unbind_thread(kqwl, thread, flags);
5714 kqunlock(kq);
5715
5716 /* If last reference, dealloc the workloop kq */
5717 kqueue_release_last(p, kq);
5718 } else {
5719 assert(!is_workqueue_thread(thread));
5720 kevent_servicer_detach_thread(p, kqwl->kqwl_dynamicid, thread, flags, kq);
5721 }
5722 }
5723
5724 static void
5725 kqworkq_internal_unbind(
5726 struct proc *p,
5727 kq_index_t qos_index,
5728 thread_t thread,
5729 unsigned int flags)
5730 {
5731 struct kqueue *kq;
5732 struct kqworkq *kqwq;
5733 struct uthread *ut;
5734 kq_index_t end_index;
5735
5736 assert(thread == current_thread());
5737 ut = get_bsdthread_info(thread);
5738
5739 kq = p->p_fd->fd_wqkqueue;
5740 assert(kq->kq_state & KQ_WORKQ);
5741 assert(ut->uu_kqueue_bound == kq);
5742
5743 kqwq = (struct kqworkq *)kq;
5744
5745 /* end servicing any requests we might own */
5746 end_index = (qos_index == KQWQ_QOS_MANAGER) ?
5747 0 : qos_index;
5748 kqlock(kq);
5749
5750 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND),
5751 (uintptr_t)thread_tid(thread), flags, qos_index);
5752
5753 do {
5754 kqworkq_end_processing(kqwq, qos_index, flags);
5755 } while (qos_index-- > end_index);
5756
5757 ut->uu_kqueue_bound = NULL;
5758 ut->uu_kqueue_qos_index = 0;
5759 ut->uu_kqueue_flags = 0;
5760
5761 kqunlock(kq);
5762 }
5763
5764 /*
5765 * kevent_qos_internal_unbind - unbind thread from processing kqueue
5766 *
5767 * End processing the per-QoS bucket of events and allow other threads
5768 * to be requested for future servicing.
5769 *
5770 * caller holds a reference on the kqueue.
5771 * thread is the current thread.
5772 */
5773
5774 int
5775 kevent_qos_internal_unbind(
5776 struct proc *p,
5777 int qos_class,
5778 thread_t thread,
5779 unsigned int flags)
5780 {
5781 #pragma unused(qos_class)
5782
5783 struct uthread *ut;
5784 struct kqueue *kq;
5785 unsigned int bound_flags;
5786 bool check_flags;
5787
5788 ut = get_bsdthread_info(thread);
5789 if (ut->uu_kqueue_bound == NULL) {
5790 /* early out if we are already unbound */
5791 assert(ut->uu_kqueue_flags == 0);
5792 assert(ut->uu_kqueue_qos_index == 0);
5793 assert(ut->uu_kqueue_override_is_sync == 0);
5794 return EALREADY;
5795 }
5796
5797 assert(flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP));
5798 assert(thread == current_thread());
5799
5800 check_flags = flags & KEVENT_FLAG_UNBIND_CHECK_FLAGS;
5801
5802 /* Get the kqueue we started with */
5803 kq = ut->uu_kqueue_bound;
5804 assert(kq != NULL);
5805 assert(kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
5806
5807 /* get flags and QoS parameters we started with */
5808 bound_flags = ut->uu_kqueue_flags;
5809
5810 /* Unbind from the class of workq */
5811 if (kq->kq_state & KQ_WORKQ) {
5812 if (check_flags && !(flags & KEVENT_FLAG_WORKQ)) {
5813 return EINVAL;
5814 }
5815
5816 kqworkq_internal_unbind(p, ut->uu_kqueue_qos_index, thread, bound_flags);
5817 } else {
5818 if (check_flags && !(flags & KEVENT_FLAG_WORKLOOP)) {
5819 return EINVAL;
5820 }
5821
5822 kqworkloop_internal_unbind(p, thread, bound_flags);
5823 }
5824
5825 return 0;
5826 }
5827
5828 /*
5829 * kqueue_process - process the triggered events in a kqueue
5830 *
5831 * Walk the queued knotes and validate that they are
5832 * really still triggered events by calling the filter
5833 * routines (if necessary). Hold a use reference on
5834 * the knote to avoid it being detached. For each event
5835 * that is still considered triggered, invoke the
5836 * callback routine provided.
5837 *
5838 * caller holds a reference on the kqueue.
5839 * kqueue locked on entry and exit - but may be dropped
5840 * kqueue list locked (held for duration of call)
5841 */
5842
5843 static int
5844 kqueue_process(struct kqueue *kq,
5845 kevent_callback_t callback,
5846 void *callback_data,
5847 struct filt_process_s *process_data,
5848 int *countp,
5849 struct proc *p)
5850 {
5851 unsigned int flags = process_data ? process_data->fp_flags : 0;
5852 struct uthread *ut = get_bsdthread_info(current_thread());
5853 kq_index_t start_index, end_index, i;
5854 struct knote *kn;
5855 int nevents = 0;
5856 int error = 0;
5857
5858 /*
5859 * Based on the mode of the kqueue and the bound QoS of the servicer,
5860 * determine the range of thread requests that need checking
5861 */
5862 if (kq->kq_state & KQ_WORKQ) {
5863 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
5864 start_index = KQWQ_QOS_MANAGER;
5865 } else if (ut->uu_kqueue_bound != kq) {
5866 return EJUSTRETURN;
5867 } else {
5868 start_index = ut->uu_kqueue_qos_index;
5869 }
5870
5871 /* manager services every request in a workq kqueue */
5872 assert(start_index > 0 && start_index <= KQWQ_QOS_MANAGER);
5873 end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index;
5874
5875 } else if (kq->kq_state & KQ_WORKLOOP) {
5876 if (ut->uu_kqueue_bound != kq)
5877 return EJUSTRETURN;
5878
5879 /*
5880 * Single request servicing:
5881 * we want to deliver all events, regardless of the QoS
5882 */
5883 start_index = end_index = THREAD_QOS_UNSPECIFIED;
5884 } else {
5885 start_index = end_index = QOS_INDEX_KQFILE;
5886 }
5887
5888 i = start_index;
5889
5890 do {
5891 if (kqueue_begin_processing(kq, i, flags) == -1) {
5892 *countp = 0;
5893 /* Nothing to process */
5894 continue;
5895 }
5896
5897 /*
5898 * loop through the enqueued knotes associated with this request,
5899 * processing each one. Each request may have several queues
5900 * of knotes to process (depending on the type of kqueue) so we
5901 * have to loop through all the queues as long as we have additional
5902 * space.
5903 */
5904 error = 0;
5905
5906 struct kqtailq *base_queue = kqueue_get_base_queue(kq, i);
5907 struct kqtailq *queue = kqueue_get_high_queue(kq, i);
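/*
 * 'queue' starts at the highest queue for this request and the loop
 * condition below walks it down to 'base_queue', so knotes carrying a
 * higher override QoS in the same band are handled first.
 */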
5908 do {
5909 while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
5910 error = knote_process(kn, callback, callback_data, process_data, p);
5911 if (error == EJUSTRETURN) {
5912 error = 0;
5913 } else {
5914 nevents++;
5915 }
5916 /* error is EWOULDBLOCK when the out event array is full */
5917 }
5918 } while (error == 0 && queue-- > base_queue);
5919
5920 if ((kq->kq_state & KQ_WORKQ) == 0) {
5921 kqueue_end_processing(kq, i, nevents, flags);
5922 }
5923
5924 if (error == EWOULDBLOCK) {
5925 /* break out if no more space for additional events */
5926 error = 0;
5927 break;
5928 }
5929 } while (i-- > end_index);
5930
5931 *countp = nevents;
5932 return (error);
5933 }
5934
5935 static void
5936 kqueue_scan_continue(void *data, wait_result_t wait_result)
5937 {
5938 thread_t self = current_thread();
5939 uthread_t ut = (uthread_t)get_bsdthread_info(self);
5940 struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
5941 struct kqueue *kq = (struct kqueue *)data;
5942 struct filt_process_s *process_data = cont_args->process_data;
5943 int error;
5944 int count;
5945
5946 /* convert the (previous) wait_result to a proper error */
5947 switch (wait_result) {
5948 case THREAD_AWAKENED: {
5949 kqlock(kq);
5950 retry:
5951 error = kqueue_process(kq, cont_args->call, cont_args->data,
5952 process_data, &count, current_proc());
5953 if (error == 0 && count == 0) {
5954 if (kq->kq_state & KQ_DRAIN) {
5955 kqunlock(kq);
5956 goto drain;
5957 }
5958
5959 if (kq->kq_state & KQ_WAKEUP)
5960 goto retry;
5961
5962 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
5963 KQ_EVENT, THREAD_ABORTSAFE,
5964 cont_args->deadline);
5965 kq->kq_state |= KQ_SLEEP;
5966 kqunlock(kq);
5967 thread_block_parameter(kqueue_scan_continue, kq);
5968 /* NOTREACHED */
5969 }
5970 kqunlock(kq);
5971 } break;
5972 case THREAD_TIMED_OUT:
5973 error = EWOULDBLOCK;
5974 break;
5975 case THREAD_INTERRUPTED:
5976 error = EINTR;
5977 break;
5978 case THREAD_RESTART:
5979 drain:
5980 error = EBADF;
5981 break;
5982 default:
5983 panic("%s: - invalid wait_result (%d)", __func__,
5984 wait_result);
5985 error = 0;
5986 }
5987
5988 /* call the continuation with the results */
5989 assert(cont_args->cont != NULL);
5990 (cont_args->cont)(kq, cont_args->data, error);
5991 }
5992
5993
5994 /*
5995 * kqueue_scan - scan and wait for events in a kqueue
5996 *
5997 * Process the triggered events in a kqueue.
5998 *
5999 * If there are no events triggered, arrange to
6000 * wait for them. If the caller provided a
6001 * continuation routine, the wait and the delivery
6002 * of results happen through that continuation.
6003 *
6004 * The callback routine must be valid.
6005 * The caller must hold a use-count reference on the kq.
6006 */
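
/*
 * Illustrative call shape only; the callback/continuation names and their
 * data below are hypothetical (the kevent paths supply their own):
 *
 *	struct timeval atv = { .tv_sec = 0, .tv_usec = 0 };	// wait forever
 *	error = kqueue_scan(kq, my_callback, my_continuation,
 *	    my_callback_data, &process_data, &atv, p);
 *
 * When a continuation is supplied and the scan has to block, results are
 * delivered through kqueue_scan_continue() and control never returns here.
 */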
6007
6008 int
6009 kqueue_scan(struct kqueue *kq,
6010 kevent_callback_t callback,
6011 kqueue_continue_t continuation,
6012 void *callback_data,
6013 struct filt_process_s *process_data,
6014 struct timeval *atvp,
6015 struct proc *p)
6016 {
6017 thread_continue_t cont = THREAD_CONTINUE_NULL;
6018 unsigned int flags;
6019 uint64_t deadline;
6020 int error;
6021 int first;
6022 int fd;
6023
6024 assert(callback != NULL);
6025
6026 /*
6027 * Determine which QoS index we are servicing
6028 */
6029 flags = (process_data) ? process_data->fp_flags : 0;
6030 fd = (process_data) ? process_data->fp_fd : -1;
6031
6032 first = 1;
6033 for (;;) {
6034 wait_result_t wait_result;
6035 int count;
6036
6037 /*
6038 * Make a pass through the kq to find events already
6039 * triggered.
6040 */
6041 kqlock(kq);
6042 error = kqueue_process(kq, callback, callback_data,
6043 process_data, &count, p);
6044 if (error || count)
6045 break; /* lock still held */
6046
6047 /* looks like we have to consider blocking */
6048 if (first) {
6049 first = 0;
6050 /* convert the timeout to a deadline once */
6051 if (atvp->tv_sec || atvp->tv_usec) {
6052 uint64_t now;
6053
6054 clock_get_uptime(&now);
6055 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
6056 atvp->tv_usec * (long)NSEC_PER_USEC,
6057 &deadline);
6058 if (now >= deadline) {
6059 /* non-blocking call */
6060 error = EWOULDBLOCK;
6061 break; /* lock still held */
6062 }
6063 deadline -= now;
6064 clock_absolutetime_interval_to_deadline(deadline, &deadline);
6065 } else {
6066 deadline = 0; /* block forever */
6067 }
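/*
 * e.g. { .tv_sec = 1, .tv_usec = 500000 } yields a deadline 1.5s from
 * now, expressed in mach absolute time units; a zero timeval leaves
 * deadline at 0, meaning wait with no timeout.
 */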
6068
6069 if (continuation) {
6070 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
6071 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
6072
6073 cont_args->call = callback;
6074 cont_args->cont = continuation;
6075 cont_args->deadline = deadline;
6076 cont_args->data = callback_data;
6077 cont_args->process_data = process_data;
6078 cont = kqueue_scan_continue;
6079 }
6080 }
6081
6082 if (kq->kq_state & KQ_DRAIN) {
6083 kqunlock(kq);
6084 return EBADF;
6085 }
6086
6087 /* If awakened during processing, try again */
6088 if (kq->kq_state & KQ_WAKEUP) {
6089 kqunlock(kq);
6090 continue;
6091 }
6092
6093 /* go ahead and wait */
6094 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
6095 KQ_EVENT, THREAD_ABORTSAFE,
6096 TIMEOUT_URGENCY_USER_NORMAL,
6097 deadline, TIMEOUT_NO_LEEWAY);
6098 kq->kq_state |= KQ_SLEEP;
6099 kqunlock(kq);
6100 wait_result = thread_block_parameter(cont, kq);
6101 /* NOTREACHED if (continuation != NULL) */
6102
6103 switch (wait_result) {
6104 case THREAD_AWAKENED:
6105 continue;
6106 case THREAD_TIMED_OUT:
6107 return EWOULDBLOCK;
6108 case THREAD_INTERRUPTED:
6109 return EINTR;
6110 case THREAD_RESTART:
6111 return EBADF;
6112 default:
6113 panic("%s: - bad wait_result (%d)", __func__,
6114 wait_result);
6115 error = 0;
6116 }
6117 }
6118 kqunlock(kq);
6119 return (error);
6120 }
6121
6122
6123 /*
6124 * XXX
6125 * This could be expanded to call kqueue_scan, if desired.
6126 */
6127 /*ARGSUSED*/
6128 static int
6129 kqueue_read(__unused struct fileproc *fp,
6130 __unused struct uio *uio,
6131 __unused int flags,
6132 __unused vfs_context_t ctx)
6133 {
6134 return (ENXIO);
6135 }
6136
6137 /*ARGSUSED*/
6138 static int
6139 kqueue_write(__unused struct fileproc *fp,
6140 __unused struct uio *uio,
6141 __unused int flags,
6142 __unused vfs_context_t ctx)
6143 {
6144 return (ENXIO);
6145 }
6146
6147 /*ARGSUSED*/
6148 static int
6149 kqueue_ioctl(__unused struct fileproc *fp,
6150 __unused u_long com,
6151 __unused caddr_t data,
6152 __unused vfs_context_t ctx)
6153 {
6154 return (ENOTTY);
6155 }
6156
6157 /*ARGSUSED*/
6158 static int
6159 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
6160 __unused vfs_context_t ctx)
6161 {
6162 struct kqueue *kq = (struct kqueue *)fp->f_data;
6163 struct kqtailq *queue;
6164 struct kqtailq *suppressq;
6165 struct knote *kn;
6166 int retnum = 0;
6167
6168 if (which != FREAD)
6169 return (0);
6170
6171 kqlock(kq);
6172
6173 assert((kq->kq_state & KQ_WORKQ) == 0);
6174
6175 /*
6176 * If this is the first pass, link the wait queue associated with the
6177 * kqueue onto the wait queue set for the select(). Normally we
6178 * use selrecord() for this, but it uses the wait queue within the
6179 * selinfo structure and we need to use the main one for the kqueue to
6180 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
6181 * (The select() call will unlink them when it ends).
6182 */
6183 if (wq_link_id != NULL) {
6184 thread_t cur_act = current_thread();
6185 struct uthread * ut = get_bsdthread_info(cur_act);
6186
6187 kq->kq_state |= KQ_SEL;
6188 waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
6189 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
6190
6191 /* always consume the reserved link object */
6192 waitq_link_release(*(uint64_t *)wq_link_id);
6193 *(uint64_t *)wq_link_id = 0;
6194
6195 /*
6196 * selprocess() is expecting that we send it back the waitq
6197 * that was just added to the thread's waitq set. In order
6198 * to not change the selrecord() API (which is exported to
6199 * kexts), we pass this value back through the
6200 * void *wq_link_id pointer we were passed. We need to use
6201 * memcpy here because the pointer may not be properly aligned
6202 * on 32-bit systems.
6203 */
6204 void *wqptr = &kq->kq_wqs;
6205 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
6206 }
6207
6208 if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) {
6209 kqunlock(kq);
6210 return (0);
6211 }
6212
6213 queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE);
6214 if (!TAILQ_EMPTY(queue)) {
6215 /*
6216 * there is something queued - but it might be a
6217 * KN_STAYACTIVE knote, which may or may not have
6218 * any events pending. So we have to walk
6219 * the list of knotes to see, and peek at the
6220 * (non-vanished) stay-active ones to be really sure.
6221 */
6222 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
6223 if (kn->kn_status & KN_ACTIVE) {
6224 retnum = 1;
6225 goto out;
6226 }
6227 assert(kn->kn_status & KN_STAYACTIVE);
6228 knote_suppress(kn);
6229 }
6230
6231 /*
6232 * There were no regular events on the queue, so take
6233 * a deeper look at the stay-queued ones we suppressed.
6234 */
6235 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
6236 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
6237 unsigned peek = 1;
6238
6239 assert(!knoteuse_needs_boost(kn, NULL));
6240
6241 /* If didn't vanish while suppressed - peek at it */
6242 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
6243 peek = knote_fops(kn)->f_peek(kn);
6244
6245 /* if it dropped while getting lock - move on */
6246 if (!knoteuse2kqlock(kq, kn, KNUSE_NONE))
6247 continue;
6248 }
6249
6250 /* unsuppress it */
6251 knote_unsuppress(kn);
6252
6253 /* has data or it has to report a vanish */
6254 if (peek > 0) {
6255 retnum = 1;
6256 goto out;
6257 }
6258 }
6259 }
6260
6261 out:
6262 kqueue_end_processing(kq, QOS_INDEX_KQFILE, retnum, 0);
6263 kqunlock(kq);
6264 return (retnum);
6265 }
6266
6267 /*
6268 * kqueue_close -
6269 */
6270 /*ARGSUSED*/
6271 static int
6272 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
6273 {
6274 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
6275
6276 assert((kqf->kqf_state & KQ_WORKQ) == 0);
6277 kqueue_dealloc(&kqf->kqf_kqueue);
6278 fg->fg_data = NULL;
6279 return (0);
6280 }
6281
6282 /*ARGSUSED*/
6283 /*
6284 * The caller has taken a use-count reference on this kqueue and will donate it
6285 * to the kqueue we are being added to. This keeps the kqueue from closing until
6286 * that relationship is torn down.
6287 */
6288 static int
6289 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
6290 __unused struct kevent_internal_s *kev, __unused vfs_context_t ctx)
6291 {
6292 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
6293 struct kqueue *kq = &kqf->kqf_kqueue;
6294 struct kqueue *parentkq = knote_get_kq(kn);
6295
6296 assert((kqf->kqf_state & KQ_WORKQ) == 0);
6297
6298 if (parentkq == kq ||
6299 kn->kn_filter != EVFILT_READ) {
6300 kn->kn_flags = EV_ERROR;
6301 kn->kn_data = EINVAL;
6302 return 0;
6303 }
6304
6305 /*
6306 * We have to avoid creating a cycle when nesting kqueues
6307 * inside another. Rather than trying to walk the whole
6308 * potential DAG of nested kqueues, we just use a simple
6309 * ceiling protocol. When a kqueue is inserted into another,
6310 * we check that the (future) parent is not already nested
6311 * into another kqueue at a lower level than the potential
6312 * child (because it could indicate a cycle). If that test
6313 * passes, we just mark the nesting levels accordingly.
6314 */
6315
6316 kqlock(parentkq);
6317 if (parentkq->kq_level > 0 &&
6318 parentkq->kq_level < kq->kq_level)
6319 {
6320 kqunlock(parentkq);
6321 kn->kn_flags = EV_ERROR;
6322 kn->kn_data = EINVAL;
6323 return 0;
6324 } else {
6325 /* set parent level appropriately */
6326 if (parentkq->kq_level == 0)
6327 parentkq->kq_level = 2;
6328 if (parentkq->kq_level < kq->kq_level + 1)
6329 parentkq->kq_level = kq->kq_level + 1;
6330 kqunlock(parentkq);
6331
6332 kn->kn_filtid = EVFILTID_KQREAD;
6333 kqlock(kq);
6334 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
6335 /* indicate nesting in child, if needed */
6336 if (kq->kq_level == 0)
6337 kq->kq_level = 1;
6338
6339 int count = kq->kq_count;
6340 kqunlock(kq);
6341 return (count > 0);
6342 }
6343 }
6344
6345 /*
6346 * kqueue_drain - called when kq is closed
6347 */
6348 /*ARGSUSED*/
6349 static int
6350 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
6351 {
6352 struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
6353
6354 assert((kq->kq_state & KQ_WORKQ) == 0);
6355
6356 kqlock(kq);
6357 kq->kq_state |= KQ_DRAIN;
6358 kqueue_interrupt(kq);
6359 kqunlock(kq);
6360 return (0);
6361 }
6362
6363 /*ARGSUSED*/
6364 int
6365 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
6366 {
6367 assert((kq->kq_state & KQ_WORKQ) == 0);
6368
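/*
 * st_size reports the number of pending events and st_blksize the size
 * of the kevent structure flavor this kqueue uses (QoS, 64-bit, or the
 * user32/user64 legacy layouts).
 */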
6369 kqlock(kq);
6370 if (isstat64 != 0) {
6371 struct stat64 *sb64 = (struct stat64 *)ub;
6372
6373 bzero((void *)sb64, sizeof(*sb64));
6374 sb64->st_size = kq->kq_count;
6375 if (kq->kq_state & KQ_KEV_QOS)
6376 sb64->st_blksize = sizeof(struct kevent_qos_s);
6377 else if (kq->kq_state & KQ_KEV64)
6378 sb64->st_blksize = sizeof(struct kevent64_s);
6379 else if (IS_64BIT_PROCESS(p))
6380 sb64->st_blksize = sizeof(struct user64_kevent);
6381 else
6382 sb64->st_blksize = sizeof(struct user32_kevent);
6383 sb64->st_mode = S_IFIFO;
6384 } else {
6385 struct stat *sb = (struct stat *)ub;
6386
6387 bzero((void *)sb, sizeof(*sb));
6388 sb->st_size = kq->kq_count;
6389 if (kq->kq_state & KQ_KEV_QOS)
6390 sb->st_blksize = sizeof(struct kevent_qos_s);
6391 else if (kq->kq_state & KQ_KEV64)
6392 sb->st_blksize = sizeof(struct kevent64_s);
6393 else if (IS_64BIT_PROCESS(p))
6394 sb->st_blksize = sizeof(struct user64_kevent);
6395 else
6396 sb->st_blksize = sizeof(struct user32_kevent);
6397 sb->st_mode = S_IFIFO;
6398 }
6399 kqunlock(kq);
6400 return (0);
6401 }
6402
6403 /*
6404 * Interact with the pthread kext to request a servicing there.
6405 * Eventually, this will request threads at specific QoS levels.
6406 * For now, it only requests a dispatch-manager-QoS thread, and
6407 * only one-at-a-time.
6408 *
6409 * - Caller holds the workq request lock
6410 *
6411 * - May be called with the kqueue's wait queue set locked,
6412 * so cannot do anything that could recurse on that.
6413 */
6414 static void
6415 kqworkq_request_thread(
6416 struct kqworkq *kqwq,
6417 kq_index_t qos_index)
6418 {
6419 struct kqrequest *kqr;
6420
6421 assert(kqwq->kqwq_state & KQ_WORKQ);
6422 assert(qos_index < KQWQ_NQOS);
6423
6424 kqr = kqworkq_get_request(kqwq, qos_index);
6425
6426 assert(kqr->kqr_state & KQR_WAKEUP);
6427
6428 /*
6429 * If we have already requested a thread, and it hasn't
6430 * started processing yet, there's no use hammering away
6431 * on the pthread kext.
6432 */
6433 if (kqr->kqr_state & KQR_THREQUESTED)
6434 return;
6435
6436 assert((kqr->kqr_state & KQR_BOUND) == 0);
6437
6438 /* request additional workq threads if appropriate */
6439 if (pthread_functions != NULL &&
6440 pthread_functions->workq_reqthreads != NULL) {
6441 unsigned int flags = KEVENT_FLAG_WORKQ;
6442 unsigned long priority;
6443 thread_t wqthread;
6444
6445 /* Compute the appropriate pthread priority */
6446 priority = qos_from_qos_index(qos_index);
6447
6448 #if 0
6449 /* JMM - for now remain compatible with old invocations */
6450 /* set the over-commit flag on the request if needed */
6451 if (kqr->kqr_state & KQR_THOVERCOMMIT)
6452 priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
6453 #endif /* 0 */
6454
6455 /* Compute a priority based on qos_index. */
6456 struct workq_reqthreads_req_s request = {
6457 .priority = priority,
6458 .count = 1
6459 };
6460
6461 /* mark that we are making a request */
6462 kqr->kqr_state |= KQR_THREQUESTED;
6463 if (qos_index == KQWQ_QOS_MANAGER)
6464 kqr->kqr_state |= KQWQ_THMANAGER;
6465
6466 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
6467 0, qos_index,
6468 (((uintptr_t)kqr->kqr_override_index << 8) |
6469 (uintptr_t)kqr->kqr_state));
6470 wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request);
6471
6472 /* We've been switched to the emergency/manager thread */
6473 if (wqthread == (thread_t)-1) {
6474 assert(qos_index != KQWQ_QOS_MANAGER);
6475 kqr->kqr_state |= KQWQ_THMANAGER;
6476 return;
6477 }
6478
6479 /*
6480 * bind the returned thread identity
6481 * This goes away when we switch to synchronous callback
6482 * binding from the pthread kext.
6483 */
6484 if (wqthread != NULL) {
6485 kqworkq_bind_thread_impl(kqwq, qos_index, wqthread, flags);
6486 }
6487 }
6488 }
6489
6490 /*
6491 * If we aren't already busy processing events [for this QoS],
6492 * request workq thread support as appropriate.
6493 *
6494 * TBD - for now, we don't segregate out processing by QoS.
6495 *
6496 * - May be called with the kqueue's wait queue set locked,
6497 * so cannot do anything that could recurse on that.
6498 */
6499 static void
6500 kqworkq_request_help(
6501 struct kqworkq *kqwq,
6502 kq_index_t qos_index)
6503 {
6504 struct kqrequest *kqr;
6505
6506 /* convert to thread qos value */
6507 assert(qos_index < KQWQ_NQOS);
6508
6509 kqwq_req_lock(kqwq);
6510 kqr = kqworkq_get_request(kqwq, qos_index);
6511
6512 if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
6513 /* Indicate that we needed help from this request */
6514 kqr->kqr_state |= KQR_WAKEUP;
6515
6516 /* Go assure a thread request has been made */
6517 kqworkq_request_thread(kqwq, qos_index);
6518 }
6519 kqwq_req_unlock(kqwq);
6520 }
6521
6522 static void
6523 kqworkloop_threadreq_impl(struct kqworkloop *kqwl, kq_index_t qos_index)
6524 {
6525 struct kqrequest *kqr = &kqwl->kqwl_request;
6526 unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
6527 int op, ret;
6528
6529 assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
6530
6531 /*
6532 * New-style thread request supported. Provide
6533 * the pthread kext a pointer to a workq_threadreq_s
6534 * structure for its use until a corresponding
6535 * workloop_fulfill_threadreq callback.
6536 */
6537 if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
6538 op = WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL;
6539 } else {
6540 op = WORKQ_THREADREQ_WORKLOOP;
6541 }
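/*
 * The _NO_THREAD_CALL variant is chosen when we are already running in
 * the context of the target process; if pthread cannot honor the request
 * immediately it returns EAGAIN and we redrive it from the AST below,
 * while ENOTSUP makes us retry with the plain WORKLOOP op.
 */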
6542 again:
6543 ret = (*pthread_functions->workq_threadreq)(kqwl->kqwl_p, &kqr->kqr_req,
6544 op, pri, 0);
6545 switch (ret) {
6546 case ENOTSUP:
6547 assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
6548 op = WORKQ_THREADREQ_WORKLOOP;
6549 goto again;
6550
6551 case ECANCELED:
6552 case EINVAL:
6553 /*
6554 * Process is shutting down or exec'ing.
6555 * All the kqueues are going to be cleaned up
6556 * soon. Forget we even asked for a thread -
6557 * and make sure we don't ask for more.
6558 */
6559 kqueue_release((struct kqueue *)kqwl, KQUEUE_CANT_BE_LAST_REF);
6560 kqr->kqr_state &= ~KQR_THREQUESTED;
6561 kqr->kqr_state |= KQR_DRAIN;
6562 break;
6563
6564 case EAGAIN:
6565 assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
6566 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
6567 break;
6568
6569 default:
6570 assert(ret == 0);
6571 }
6572 }
6573
6574 static void
6575 kqworkloop_threadreq_modify(struct kqworkloop *kqwl, kq_index_t qos_index)
6576 {
6577 struct kqrequest *kqr = &kqwl->kqwl_request;
6578 unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
6579 int ret, op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
6580
6581 assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
6582
6583 if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
6584 op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
6585 } else {
6586 op = WORKQ_THREADREQ_CHANGE_PRI;
6587 }
6588 again:
6589 ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
6590 &kqr->kqr_req, op, pri, 0);
6591 switch (ret) {
6592 case ENOTSUP:
6593 assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL);
6594 op = WORKQ_THREADREQ_CHANGE_PRI;
6595 goto again;
6596
6597 case EAGAIN:
6598 assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL);
6599 act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
6600 break;
6601
6602 case ECANCELED:
6603 case EINVAL:
6604 case 0:
6605 break;
6606
6607 default:
6608 assert(ret == 0);
6609 }
6610 }
6611
6612 /*
6613 * Interact with the pthread kext to request a servicing thread.
6614 * This will request a single thread at the highest QoS level
6615 * for which there is work (whether that was the requested QoS
6616 * for an event or an override applied to a lower-QoS request).
6617 *
6618 * - Caller holds the workloop request lock
6619 *
6620 * - May be called with the kqueue's wait queue set locked,
6621 * so cannot do anything that could recurse on that.
6622 */
6623 static void
6624 kqworkloop_request_thread(struct kqworkloop *kqwl, kq_index_t qos_index)
6625 {
6626 struct kqrequest *kqr;
6627
6628 assert(kqwl->kqwl_state & KQ_WORKLOOP);
6629
6630 kqr = &kqwl->kqwl_request;
6631
6632 assert(kqwl->kqwl_owner == THREAD_NULL);
6633 assert((kqr->kqr_state & KQR_BOUND) == 0);
6634 assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
6635 assert(!(kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD));
6636
6637 /* If we're draining thread requests, just bail */
6638 if (kqr->kqr_state & KQR_DRAIN)
6639 return;
6640
6641 if (pthread_functions != NULL &&
6642 pthread_functions->workq_threadreq != NULL) {
6643 /*
6644 * set request state flags, etc... before calling pthread
6645 * This assures they are set before a possible synchronous
6646 * callback to workloop_fulfill_threadreq().
6647 */
6648 kqr->kqr_state |= KQR_THREQUESTED;
6649
6650 /* Add a thread request reference on the kqueue. */
6651 kqueue_retain((struct kqueue *)kqwl);
6652
6653 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
6654 kqwl->kqwl_dynamicid,
6655 0, qos_index, kqr->kqr_state);
6656 kqworkloop_threadreq_impl(kqwl, qos_index);
6657 } else {
6658 panic("kqworkloop_request_thread");
6659 return;
6660 }
6661 }
6662
6663 static void
6664 kqworkloop_update_sync_override_state(struct kqworkloop *kqwl, boolean_t sync_ipc_override)
6665 {
6666 struct kqrequest *kqr = &kqwl->kqwl_request;
6667 kqwl_req_lock(kqwl);
6668 kqr->kqr_has_sync_override = sync_ipc_override;
6669 kqwl_req_unlock(kqwl);
6670
6671 }
6672
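/*
 * kqworkloop_combined_qos - compute the effective QoS for this workloop
 *
 * Returns the max of kqr_qos_index, kqr_override_index and
 * kqr_dsync_waiters_qos. If there are suppressed knotes holding a sync IPC
 * override, or a sync override is explicitly set, the result is promoted to
 * THREAD_QOS_USER_INTERACTIVE and *ipc_override_is_sync is set.
 */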
6673 static inline kq_index_t
6674 kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *ipc_override_is_sync)
6675 {
6676 struct kqrequest *kqr = &kqwl->kqwl_request;
6677 kq_index_t override;
6678
6679 *ipc_override_is_sync = FALSE;
6680 override = MAX(MAX(kqr->kqr_qos_index, kqr->kqr_override_index),
6681 kqr->kqr_dsync_waiters_qos);
6682
6683 if (kqr->kqr_sync_suppress_count > 0 || kqr->kqr_has_sync_override) {
6684 *ipc_override_is_sync = TRUE;
6685 override = THREAD_QOS_USER_INTERACTIVE;
6686 }
6687 return override;
6688 }
6689
6690 static inline void
6691 kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
6692 {
6693 struct kqrequest *kqr = &kqwl->kqwl_request;
6694
6695 kqwl_req_held(kqwl);
6696
6697 if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
6698 assert(kqr->kqr_state & KQR_BOUND);
6699 assert(kqr->kqr_thread);
6700
6701 kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
6702 act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
6703 }
6704 }
6705
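/*
 * kqworkloop_update_threads_qos - central QoS bookkeeping for a workloop
 *
 * Update the request's wakeup/override state according to 'op', recompute
 * the combined QoS, and apply the result: adjust the IPC overrides on the
 * owner and/or bound servicer, wake a statically bound thread, issue a new
 * thread request, or modify/cancel one already in flight.
 */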
6706 static void
6707 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
6708 {
6709 const uint8_t KQWL_STAYACTIVE_FIRED_BIT = (1 << 0);
6710
6711 struct kqrequest *kqr = &kqwl->kqwl_request;
6712 boolean_t old_ipc_override_is_sync = FALSE;
6713 kq_index_t old_qos = kqworkloop_combined_qos(kqwl, &old_ipc_override_is_sync);
6714 struct kqueue *kq = &kqwl->kqwl_kqueue;
6715 bool static_thread = (kq->kq_state & KQ_NO_WQ_THREAD);
6716 kq_index_t i;
6717
6718 /* must hold the kqr lock */
6719 kqwl_req_held(kqwl);
6720
6721 switch (op) {
6722 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
6723 if (qos == KQWL_BUCKET_STAYACTIVE) {
6724 /*
6725 * the KQWL_BUCKET_STAYACTIVE is not a QoS bucket, we only remember
6726 * a high watermark (kqr_stayactive_qos) of any stay active knote
6727 * that was ever registered with this workloop.
6728 *
6729 * When waitq_set__CALLING_PREPOST_HOOK__() wakes up any stay active
6730 * knote, we use this high-watermark as a wakeup-index, and also set
6731 * the magic KQWL_BUCKET_STAYACTIVE bit to make sure we remember
6732 * there is at least one stay active knote fired until the next full
6733 * processing of this bucket.
6734 */
6735 kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
6736 qos = kqr->kqr_stayactive_qos;
6737 assert(qos);
6738 assert(!static_thread);
6739 }
6740 if (kqr->kqr_wakeup_indexes & (1 << qos)) {
6741 assert(kqr->kqr_state & KQR_WAKEUP);
6742 break;
6743 }
6744
6745 kqr->kqr_wakeup_indexes |= (1 << qos);
6746 kqr->kqr_state |= KQR_WAKEUP;
6747 kqworkloop_request_fire_r2k_notification(kqwl);
6748 goto recompute_async;
6749
6750 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
6751 assert(qos);
6752 if (kqr->kqr_stayactive_qos < qos) {
6753 kqr->kqr_stayactive_qos = qos;
6754 if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
6755 assert(kqr->kqr_state & KQR_WAKEUP);
6756 kqr->kqr_wakeup_indexes |= (1 << qos);
6757 goto recompute_async;
6758 }
6759 }
6760 break;
6761
6762 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
6763 kqlock_held(kq); // to look at kq_queues
6764 kqr->kqr_has_sync_override = FALSE;
6765 i = KQWL_BUCKET_STAYACTIVE;
6766 if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
6767 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6768 }
6769 if (!TAILQ_EMPTY(&kq->kq_queue[i]) &&
6770 (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
6771 /*
6772 * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
6773 * knote may have fired, so we need to merge in kqr_stayactive_qos.
6774 *
6775 * Unlike other buckets, this one is never empty but could be idle.
6776 */
6777 kqr->kqr_wakeup_indexes &= KQWL_STAYACTIVE_FIRED_BIT;
6778 kqr->kqr_wakeup_indexes |= (1 << kqr->kqr_stayactive_qos);
6779 } else {
6780 kqr->kqr_wakeup_indexes = 0;
6781 }
6782 for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
6783 if (!TAILQ_EMPTY(&kq->kq_queue[i])) {
6784 kqr->kqr_wakeup_indexes |= (1 << i);
6785 struct knote *kn = TAILQ_FIRST(&kqwl->kqwl_kqueue.kq_queue[i]);
6786 if (i == THREAD_QOS_USER_INTERACTIVE &&
6787 kn->kn_qos_override_is_sync) {
6788 kqr->kqr_has_sync_override = TRUE;
6789 }
6790 }
6791 }
6792 if (kqr->kqr_wakeup_indexes) {
6793 kqr->kqr_state |= KQR_WAKEUP;
6794 kqworkloop_request_fire_r2k_notification(kqwl);
6795 } else {
6796 kqr->kqr_state &= ~KQR_WAKEUP;
6797 }
6798 assert(qos == THREAD_QOS_UNSPECIFIED);
6799 goto recompute_async;
6800
6801 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
6802 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
6803 assert(qos == THREAD_QOS_UNSPECIFIED);
6804 goto recompute_async;
6805
6806 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
6807 recompute_async:
6808 /*
6809 * When modifying the wakeup QoS or the async override QoS, we always
6810 * need to maintain our invariant that kqr_override_index is at least as
6811 * large as the highest QoS for which an event is fired.
6812 *
6813 * However, this override index can be larger when there is an overridden
6814 * suppressed knote pushing on the kqueue.
6815 */
6816 if (kqr->kqr_wakeup_indexes > (1 << qos)) {
6817 qos = fls(kqr->kqr_wakeup_indexes) - 1; /* fls is 1-based */
6818 }
6819 if (kqr->kqr_override_index < qos) {
6820 kqr->kqr_override_index = qos;
6821 }
6822 break;
6823
6824 case KQWL_UTQ_REDRIVE_EVENTS:
6825 break;
6826
6827 case KQWL_UTQ_SET_ASYNC_QOS:
6828 filt_wlheld(kqwl);
6829 kqr->kqr_qos_index = qos;
6830 break;
6831
6832 case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
6833 filt_wlheld(kqwl);
6834 kqr->kqr_dsync_waiters_qos = qos;
6835 break;
6836
6837 default:
6838 panic("unknown kqwl thread qos update operation: %d", op);
6839 }
6840
6841 boolean_t new_ipc_override_is_sync = FALSE;
6842 kq_index_t new_qos = kqworkloop_combined_qos(kqwl, &new_ipc_override_is_sync);
6843 thread_t kqwl_owner = kqwl->kqwl_owner;
6844 thread_t servicer = kqr->kqr_thread;
6845 __assert_only int ret;
6846
6847 /*
6848 * Apply the diffs to the owner if applicable
6849 */
6850 if (filt_wlowner_is_valid(kqwl_owner)) {
6851 #if 0
6852 /* JMM - need new trace hooks for owner overrides */
6853 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
6854 kqwl->kqwl_dynamicid,
6855 (kqr->kqr_state & KQR_BOUND) ? thread_tid(kqwl_owner) : 0,
6856 (kqr->kqr_qos_index << 8) | new_qos,
6857 (kqr->kqr_override_index << 8) | kqr->kqr_state);
6858 #endif
6859 if (new_qos == kqr->kqr_dsync_owner_qos) {
6860 // nothing to do
6861 } else if (kqr->kqr_dsync_owner_qos == THREAD_QOS_UNSPECIFIED) {
6862 thread_add_ipc_override(kqwl_owner, new_qos);
6863 } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
6864 thread_drop_ipc_override(kqwl_owner);
6865 } else /* kqr->kqr_dsync_owner_qos != new_qos */ {
6866 thread_update_ipc_override(kqwl_owner, new_qos);
6867 }
6868 kqr->kqr_dsync_owner_qos = new_qos;
6869
6870 if (new_ipc_override_is_sync &&
6871 !kqr->kqr_owner_override_is_sync) {
6872 thread_add_sync_ipc_override(kqwl_owner);
6873 } else if (!new_ipc_override_is_sync &&
6874 kqr->kqr_owner_override_is_sync) {
6875 thread_drop_sync_ipc_override(kqwl_owner);
6876 }
6877 kqr->kqr_owner_override_is_sync = new_ipc_override_is_sync;
6878 }
6879
6880 /*
6881 * apply the diffs to the servicer
6882 */
6883 if (static_thread) {
6884 /*
6885 * Statically bound thread
6886 *
6887 * These threads don't participate in QoS overrides today; just wake up
6888 * the thread blocked on this kqueue if a new event arrived.
6889 */
6890
6891 switch (op) {
6892 case KQWL_UTQ_UPDATE_WAKEUP_QOS:
6893 case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
6894 case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
6895 break;
6896
6897 case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
6898 case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
6899 case KQWL_UTQ_REDRIVE_EVENTS:
6900 case KQWL_UTQ_SET_ASYNC_QOS:
6901 case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
6902 panic("should never be called");
6903 break;
6904 }
6905
6906 kqlock_held(kq);
6907
6908 if ((kqr->kqr_state & KQR_BOUND) && (kqr->kqr_state & KQR_WAKEUP)) {
6909 assert(servicer && !is_workqueue_thread(servicer));
6910 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
6911 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
6912 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
6913 THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
6914 }
6915 }
6916 } else if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
6917 /*
6918 * No servicer, nor thread-request
6919 *
6920 * Make a new thread request, unless there is an owner (or the workloop
6921 * is suspended in userland) or if there is no asynchronous work in the
6922 * first place.
6923 */
6924
6925 if (kqwl_owner == THREAD_NULL && (kqr->kqr_state & KQR_WAKEUP)) {
6926 kqworkloop_request_thread(kqwl, new_qos);
6927 }
6928 } else if ((kqr->kqr_state & KQR_BOUND) == 0 &&
6929 (kqwl_owner || (kqr->kqr_state & KQR_WAKEUP) == 0)) {
6930 /*
6931 * No servicer, but a thread request in flight that we want to cancel
6932 *
6933 * We just got rid of the last knote of the kqueue, or noticed an owner,
6934 * while a thread request is still in flight; take it back.
6935 */
6936 ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
6937 &kqr->kqr_req, WORKQ_THREADREQ_CANCEL, 0, 0);
6938 if (ret == 0) {
6939 kqr->kqr_state &= ~KQR_THREQUESTED;
6940 kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
6941 }
6942 } else {
6943 boolean_t qos_changed = FALSE;
6944
6945 /*
6946 * Servicer or request is in flight
6947 *
6948 * Just apply the diff to the servicer or the thread request
6949 */
6950 if (kqr->kqr_state & KQR_BOUND) {
6951 servicer = kqr->kqr_thread;
6952 struct uthread *ut = get_bsdthread_info(servicer);
6953 if (ut->uu_kqueue_qos_index != new_qos) {
6954 if (ut->uu_kqueue_qos_index == THREAD_QOS_UNSPECIFIED) {
6955 thread_add_ipc_override(servicer, new_qos);
6956 } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
6957 thread_drop_ipc_override(servicer);
6958 } else /* ut->uu_kqueue_qos_index != new_qos */ {
6959 thread_update_ipc_override(servicer, new_qos);
6960 }
6961 ut->uu_kqueue_qos_index = new_qos;
6962 qos_changed = TRUE;
6963 }
6964
6965 if (new_ipc_override_is_sync != ut->uu_kqueue_override_is_sync) {
6966 if (new_ipc_override_is_sync &&
6967 !ut->uu_kqueue_override_is_sync) {
6968 thread_add_sync_ipc_override(servicer);
6969 } else if (!new_ipc_override_is_sync &&
6970 ut->uu_kqueue_override_is_sync) {
6971 thread_drop_sync_ipc_override(servicer);
6972 }
6973 ut->uu_kqueue_override_is_sync = new_ipc_override_is_sync;
6974 qos_changed = TRUE;
6975 }
6976 } else if (old_qos != new_qos) {
6977 assert(new_qos);
6978 kqworkloop_threadreq_modify(kqwl, new_qos);
6979 qos_changed = TRUE;
6980 }
6981 if (qos_changed) {
6982 servicer = kqr->kqr_thread;
6983 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
6984 kqwl->kqwl_dynamicid,
6985 (kqr->kqr_state & KQR_BOUND) ? thread_tid(servicer) : 0,
6986 (kqr->kqr_qos_index << 16) | (new_qos << 8) | new_ipc_override_is_sync,
6987 (kqr->kqr_override_index << 8) | kqr->kqr_state);
6988 }
6989 }
6990 }
6991
6992 static void
6993 kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
6994 {
6995 /* convert to thread qos value */
6996 assert(qos_index < KQWL_NBUCKETS);
6997
6998 kqwl_req_lock(kqwl);
6999 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
7000 kqwl_req_unlock(kqwl);
7001 }
7002
7003 /*
7004 * These arrays describe the low and high qindexes for a given qos_index.
7005 * The values come from the chart in <sys/eventvar.h> (must stay in sync).
7006 */
7007 static kq_index_t _kqwq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21};
7008 static kq_index_t _kqwq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21};
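/*
 * Worked example: a knote requested at qos_index 2 is queued on
 * kq_queue[6] when un-overridden, and on kq_queue[7..10] when overridden
 * to qos 3..6 respectively (see knote_get_queue_index() below); the
 * KQWQ_QOS_MANAGER bucket owns the single queue 21.
 */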
7009
7010 static struct kqtailq *
7011 kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index)
7012 {
7013 if (kq->kq_state & KQ_WORKQ) {
7014 assert(qos_index < KQWQ_NQOS);
7015 return &kq->kq_queue[_kqwq_base_index[qos_index]];
7016 } else if (kq->kq_state & KQ_WORKLOOP) {
7017 assert(qos_index < KQWL_NBUCKETS);
7018 return &kq->kq_queue[qos_index];
7019 } else {
7020 assert(qos_index == QOS_INDEX_KQFILE);
7021 return &kq->kq_queue[QOS_INDEX_KQFILE];
7022 }
7023 }
7024
7025 static struct kqtailq *
7026 kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index)
7027 {
7028 if (kq->kq_state & KQ_WORKQ) {
7029 assert(qos_index < KQWQ_NQOS);
7030 return &kq->kq_queue[_kqwq_high_index[qos_index]];
7031 } else if (kq->kq_state & KQ_WORKLOOP) {
7032 assert(qos_index < KQWL_NBUCKETS);
7033 return &kq->kq_queue[KQWL_BUCKET_STAYACTIVE];
7034 } else {
7035 assert(qos_index == QOS_INDEX_KQFILE);
7036 return &kq->kq_queue[QOS_INDEX_KQFILE];
7037 }
7038 }
7039
7040 static int
7041 kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
7042 {
7043 struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index);
7044 struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index);
7045
7046 do {
7047 if (!TAILQ_EMPTY(queue))
7048 return 0;
7049 } while (queue-- > base_queue);
7050 return 1;
7051 }
7052
7053 static struct kqtailq *
7054 kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index)
7055 {
7056 struct kqtailq *res;
7057 struct kqrequest *kqr;
7058
7059 if (kq->kq_state & KQ_WORKQ) {
7060 struct kqworkq *kqwq = (struct kqworkq *)kq;
7061
7062 kqr = kqworkq_get_request(kqwq, qos_index);
7063 res = &kqr->kqr_suppressed;
7064 } else if (kq->kq_state & KQ_WORKLOOP) {
7065 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7066
7067 kqr = &kqwl->kqwl_request;
7068 res = &kqr->kqr_suppressed;
7069 } else {
7070 struct kqfile *kqf = (struct kqfile *)kq;
7071 res = &kqf->kqf_suppressed;
7072 }
7073 return res;
7074 }
7075
7076 static kq_index_t
7077 knote_get_queue_index(struct knote *kn)
7078 {
7079 kq_index_t override_index = knote_get_qos_override_index(kn);
7080 kq_index_t qos_index = knote_get_qos_index(kn);
7081 struct kqueue *kq = knote_get_kq(kn);
7082 kq_index_t res;
7083
7084 if (kq->kq_state & KQ_WORKQ) {
7085 res = _kqwq_base_index[qos_index];
7086 if (override_index > qos_index)
7087 res += override_index - qos_index;
7088 assert(res <= _kqwq_high_index[qos_index]);
7089 } else if (kq->kq_state & KQ_WORKLOOP) {
7090 res = MAX(override_index, qos_index);
7091 assert(res < KQWL_NBUCKETS);
7092 } else {
7093 assert(qos_index == QOS_INDEX_KQFILE);
7094 assert(override_index == QOS_INDEX_KQFILE);
7095 res = QOS_INDEX_KQFILE;
7096 }
7097 return res;
7098 }
7099
7100 static struct kqtailq *
7101 knote_get_queue(struct knote *kn)
7102 {
7103 kq_index_t qindex = knote_get_queue_index(kn);
7104
7105 return &(knote_get_kq(kn))->kq_queue[qindex];
7106 }
7107
7108 static kq_index_t
7109 knote_get_req_index(struct knote *kn)
7110 {
7111 return kn->kn_req_index;
7112 }
7113
7114 static kq_index_t
7115 knote_get_qos_index(struct knote *kn)
7116 {
7117 return kn->kn_qos_index;
7118 }
7119
7120 static void
7121 knote_set_qos_index(struct knote *kn, kq_index_t qos_index)
7122 {
7123 struct kqueue *kq = knote_get_kq(kn);
7124
7125 assert(qos_index < KQWQ_NQOS);
7126 assert((kn->kn_status & KN_QUEUED) == 0);
7127
7128 if (kq->kq_state & KQ_WORKQ) {
7129 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7130 } else if (kq->kq_state & KQ_WORKLOOP) {
7131 /* XXX this policy decision shouldn't be here */
7132 if (qos_index == THREAD_QOS_UNSPECIFIED)
7133 qos_index = THREAD_QOS_LEGACY;
7134 } else
7135 qos_index = QOS_INDEX_KQFILE;
7136
7137 /* always set requested */
7138 kn->kn_req_index = qos_index;
7139
7140 /* only adjust in-use qos index when not suppressed */
7141 if ((kn->kn_status & KN_SUPPRESSED) == 0)
7142 kn->kn_qos_index = qos_index;
7143 }
7144
7145 static void
7146 knote_set_qos_overcommit(struct knote *kn)
7147 {
7148 struct kqueue *kq = knote_get_kq(kn);
7149 struct kqrequest *kqr;
7150
7151 /* turn overcommit on for the appropriate thread request? */
7152 if (kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
7153 if (kq->kq_state & KQ_WORKQ) {
7154 kq_index_t qos_index = knote_get_qos_index(kn);
7155 struct kqworkq *kqwq = (struct kqworkq *)kq;
7156
7157 kqr = kqworkq_get_request(kqwq, qos_index);
7158
7159 kqwq_req_lock(kqwq);
7160 kqr->kqr_state |= KQR_THOVERCOMMIT;
7161 kqwq_req_unlock(kqwq);
7162 } else if (kq->kq_state & KQ_WORKLOOP) {
7163 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7164
7165 kqr = &kqwl->kqwl_request;
7166
7167 kqwl_req_lock(kqwl);
7168 kqr->kqr_state |= KQR_THOVERCOMMIT;
7169 kqwl_req_unlock(kqwl);
7170 }
7171 }
7172 }
7173
7174 static kq_index_t
7175 knote_get_qos_override_index(struct knote *kn)
7176 {
7177 return kn->kn_qos_override;
7178 }
7179
7180 static void
7181 knote_set_qos_override_index(struct knote *kn, kq_index_t override_index,
7182 boolean_t override_is_sync)
7183 {
7184 struct kqueue *kq = knote_get_kq(kn);
7185 kq_index_t qos_index = knote_get_qos_index(kn);
7186 kq_index_t old_override_index = knote_get_qos_override_index(kn);
7187 boolean_t old_override_is_sync = kn->kn_qos_override_is_sync;
7188 uint32_t flags = 0;
7189
7190 assert((kn->kn_status & KN_QUEUED) == 0);
7191
7192 if (override_index == KQWQ_QOS_MANAGER) {
7193 assert(qos_index == KQWQ_QOS_MANAGER);
7194 } else {
7195 assert(override_index < KQWQ_QOS_MANAGER);
7196 }
7197
7198 kn->kn_qos_override = override_index;
7199 kn->kn_qos_override_is_sync = override_is_sync;
7200
7201 /*
7202 * If this is a workq/workloop kqueue, apply the override to the
7203 * servicing thread.
7204 */
7205 if (kq->kq_state & KQ_WORKQ) {
7206 struct kqworkq *kqwq = (struct kqworkq *)kq;
7207
7208 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7209 kqworkq_update_override(kqwq, qos_index, override_index);
7210 } else if (kq->kq_state & KQ_WORKLOOP) {
7211 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7212
7213 if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
7214 flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
7215
7216 if (override_index == THREAD_QOS_USER_INTERACTIVE
7217 && override_is_sync) {
7218 flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
7219 }
7220
7221 if (old_override_index == THREAD_QOS_USER_INTERACTIVE
7222 && old_override_is_sync) {
7223 flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
7224 }
7225 }
7226
7227 assert(qos_index > THREAD_QOS_UNSPECIFIED);
7228 kqworkloop_update_override(kqwl, qos_index, override_index, flags);
7229 }
7230 }
7231
7232 static kq_index_t
7233 knote_get_sync_qos_override_index(struct knote *kn)
7234 {
7235 return kn->kn_qos_sync_override;
7236 }
7237
7238 static void
7239 kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index)
7240 {
7241 struct kqrequest *kqr;
7242 kq_index_t old_override_index;
7243
7244 if (override_index <= qos_index) {
7245 return;
7246 }
7247
7248 kqr = kqworkq_get_request(kqwq, qos_index);
7249
7250 kqwq_req_lock(kqwq);
7251 old_override_index = kqr->kqr_override_index;
7252 if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
7253 kqr->kqr_override_index = override_index;
7254
7255 /* apply the override to [incoming?] servicing thread */
7256 if (kqr->kqr_state & KQR_BOUND) {
7257 thread_t wqthread = kqr->kqr_thread;
7258
7259 /* only apply if non-manager */
7260 assert(wqthread);
7261 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
7262 if (old_override_index)
7263 thread_update_ipc_override(wqthread, override_index);
7264 else
7265 thread_add_ipc_override(wqthread, override_index);
7266 }
7267 }
7268 }
7269 kqwq_req_unlock(kqwq);
7270 }
7271
7272 /* called with the kqworkq lock held */
7273 static void
7274 kqworkq_bind_thread_impl(
7275 struct kqworkq *kqwq,
7276 kq_index_t qos_index,
7277 thread_t thread,
7278 unsigned int flags)
7279 {
7280 /* request lock must be held */
7281 kqwq_req_held(kqwq);
7282
7283 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
7284 assert(kqr->kqr_state & KQR_THREQUESTED);
7285
7286 if (qos_index == KQWQ_QOS_MANAGER)
7287 flags |= KEVENT_FLAG_WORKQ_MANAGER;
7288
7289 struct uthread *ut = get_bsdthread_info(thread);
7290
7291 /*
7292 * If this is a manager, and the manager request bit is
7293 * not set, assure no other thread is bound. If the bit
7294 * is set, make sure the old thread is us (or not set).
7295 */
7296 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
7297 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7298 kqr->kqr_state |= (KQR_BOUND | KQWQ_THMANAGER);
7299 TAILQ_INIT(&kqr->kqr_suppressed);
7300 kqr->kqr_thread = thread;
7301 ut->uu_kqueue_bound = (struct kqueue *)kqwq;
7302 ut->uu_kqueue_qos_index = KQWQ_QOS_MANAGER;
7303 ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ |
7304 KEVENT_FLAG_WORKQ_MANAGER);
7305 } else {
7306 assert(kqr->kqr_state & KQR_BOUND);
7307 assert(thread == kqr->kqr_thread);
7308 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
7309 assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
7310 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
7311 }
7312 return;
7313 }
7314
7315 /* Just a normal one-queue servicing thread */
7316 assert(kqr->kqr_state & KQR_THREQUESTED);
7317 assert(kqr->kqr_qos_index == qos_index);
7318
7319 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7320 kqr->kqr_state |= KQR_BOUND;
7321 TAILQ_INIT(&kqr->kqr_suppressed);
7322 kqr->kqr_thread = thread;
7323
7324 /* apply an ipc QoS override if one is needed */
7325 if (kqr->kqr_override_index) {
7326 assert(kqr->kqr_qos_index);
7327 assert(kqr->kqr_override_index > kqr->kqr_qos_index);
7328 assert(thread_get_ipc_override(thread) == THREAD_QOS_UNSPECIFIED);
7329 thread_add_ipc_override(thread, kqr->kqr_override_index);
7330 }
7331
7332 /* indicate that we are processing in the uthread */
7333 ut->uu_kqueue_bound = (struct kqueue *)kqwq;
7334 ut->uu_kqueue_qos_index = qos_index;
7335 ut->uu_kqueue_flags = flags;
7336 } else {
7337 /*
7338 * probably synchronously bound AND post-request bound;
7339 * this logic can go away when we get rid of post-request bind
7340 */
7341 assert(kqr->kqr_state & KQR_BOUND);
7342 assert(thread == kqr->kqr_thread);
7343 assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
7344 assert(ut->uu_kqueue_qos_index == qos_index);
7345 assert((ut->uu_kqueue_flags & flags) == flags);
7346 }
7347 }
7348
7349 static void
7350 kqworkloop_update_override(
7351 struct kqworkloop *kqwl,
7352 kq_index_t qos_index,
7353 kq_index_t override_index,
7354 uint32_t flags)
7355 {
7356 struct kqrequest *kqr = &kqwl->kqwl_request;
7357
7358 kqwl_req_lock(kqwl);
7359
7360 /* Do not override on attached threads */
7361 if (kqr->kqr_state & KQR_BOUND) {
7362 assert(kqr->kqr_thread);
7363
7364 if (kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD) {
7365 kqwl_req_unlock(kqwl);
7366 assert(!is_workqueue_thread(kqr->kqr_thread));
7367 return;
7368 }
7369 }
7370
7371 /* Update sync ipc counts on kqr for suppressed knotes */
7372 if (flags & KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS) {
7373 kqworkloop_update_suppress_sync_count(kqr, flags);
7374 }
7375
7376 if ((flags & KQWL_UO_UPDATE_OVERRIDE_LAZY) == 0) {
7377 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
7378 MAX(qos_index, override_index));
7379 }
7380 kqwl_req_unlock(kqwl);
7381 }
7382
7383 static void
7384 kqworkloop_update_suppress_sync_count(
7385 struct kqrequest *kqr,
7386 uint32_t flags)
7387 {
7388 if (flags & KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI) {
7389 kqr->kqr_sync_suppress_count++;
7390 }
7391
7392 if (flags & KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI) {
7393 assert(kqr->kqr_sync_suppress_count > 0);
7394 kqr->kqr_sync_suppress_count--;
7395 }
7396 }
7397
7398 /*
7399 * kqworkloop_unbind_thread - Unbind the servicer thread of a workloop kqueue
7400 *
7401 * It will end the processing phase in case it was still processing:
7402 *
7403 * We may have to request a new thread for workloops that are not
7404 * KQ_NO_WQ_THREAD. This can happen if:
7405 * - there were active events at or above our QoS we never got to (count > 0)
7406 * - we pended waitq hook callouts during processing
7407 * - we pended wakeups while processing (or unsuppressing)
7408 *
7409 * Called with kqueue lock held.
7410 */
7411
7412 static void
7413 kqworkloop_unbind_thread(
7414 struct kqworkloop *kqwl,
7415 thread_t thread,
7416 __unused unsigned int flags)
7417 {
7418 struct kqueue *kq = &kqwl->kqwl_kqueue;
7419 struct kqrequest *kqr = &kqwl->kqwl_request;
7420
7421 kqlock_held(kq);
7422
7423 assert((kq->kq_state & KQ_PROCESSING) == 0);
7424 if (kq->kq_state & KQ_PROCESSING) {
7425 return;
7426 }
7427
7428 /*
7429 * Forcing the KQ_PROCESSING flag ensures that the QoS updates caused by
7430 * unsuppressing knotes are not applied until the eventual call to
7431 * kqworkloop_update_threads_qos() below.
7432 */
7433 kq->kq_state |= KQ_PROCESSING;
7434 kqworkloop_acknowledge_events(kqwl, TRUE);
7435 kq->kq_state &= ~KQ_PROCESSING;
7436
7437 kqwl_req_lock(kqwl);
7438
7439 /* deal with extraneous unbinds in release kernels */
7440 assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == KQR_BOUND);
7441 if ((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) != KQR_BOUND) {
7442 kqwl_req_unlock(kqwl);
7443 return;
7444 }
7445
7446 assert(thread == current_thread());
7447 assert(kqr->kqr_thread == thread);
7448 if (kqr->kqr_thread != thread) {
7449 kqwl_req_unlock(kqwl);
7450 return;
7451 }
7452
7453 struct uthread *ut = get_bsdthread_info(thread);
7454 kq_index_t old_qos_index = ut->uu_kqueue_qos_index;
7455 boolean_t ipc_override_is_sync = ut->uu_kqueue_override_is_sync;
7456 ut->uu_kqueue_bound = NULL;
7457 ut->uu_kqueue_qos_index = 0;
7458 ut->uu_kqueue_override_is_sync = 0;
7459 ut->uu_kqueue_flags = 0;
7460
7461 /* unbind the servicer thread, drop overrides */
7462 kqr->kqr_thread = NULL;
7463 kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
7464 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
7465
7466 kqwl_req_unlock(kqwl);
7467
7468 /*
7469 * Drop the override on the current thread last, after the call to
7470 * kqworkloop_update_threads_qos above.
7471 */
7472 if (old_qos_index) {
7473 thread_drop_ipc_override(thread);
7474 }
7475 if (ipc_override_is_sync) {
7476 thread_drop_sync_ipc_override(thread);
7477 }
7478 }
7479
7480 /* called with the kqworkq lock held */
7481 static void
7482 kqworkq_unbind_thread(
7483 struct kqworkq *kqwq,
7484 kq_index_t qos_index,
7485 thread_t thread,
7486 __unused unsigned int flags)
7487 {
7488 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
7489 kq_index_t override_index = 0;
7490
7491 /* request lock must be held */
7492 kqwq_req_held(kqwq);
7493
7494 assert(thread == current_thread());
7495
7496 if ((kqr->kqr_state & KQR_BOUND) == 0) {
7497 assert(kqr->kqr_state & KQR_BOUND);
7498 return;
7499 }
7500
7501 assert(kqr->kqr_thread == thread);
7502 assert(TAILQ_EMPTY(&kqr->kqr_suppressed));
7503
7504 /*
7505 * If there is an override, drop it from the current thread
7506 * and then we are free to recompute (a potentially lower)
7507 * minimum override to apply to the next thread request.
7508 */
7509 if (kqr->kqr_override_index) {
7510 struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index);
7511 struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index);
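/*
 * In the recompute loop below, the offset of the highest non-empty queue
 * from base_queue, added to qos_index, gives the override that must carry
 * over to the next thread request for this bucket.
 */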
7512
7513 /* if not bound to a manager thread, drop the current ipc override */
7514 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
7515 thread_drop_ipc_override(thread);
7516 }
7517
7518 /* recompute the new override */
7519 do {
7520 if (!TAILQ_EMPTY(queue)) {
7521 override_index = queue - base_queue + qos_index;
7522 break;
7523 }
7524 } while (queue-- > base_queue);
7525 }
7526
7527 /* Mark it unbound */
7528 kqr->kqr_thread = NULL;
7529 kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQWQ_THMANAGER);
7530
7531 /* apply the new override */
7532 if (override_index > kqr->kqr_qos_index) {
7533 kqr->kqr_override_index = override_index;
7534 } else {
7535 kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
7536 }
7537 }
7538
7539 struct kqrequest *
7540 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
7541 {
7542 assert(qos_index < KQWQ_NQOS);
7543 return &kqwq->kqwq_request[qos_index];
7544 }
7545
7546 void
7547 knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override, kq_index_t sync_override_index)
7548 {
7549 struct kqueue *kq = knote_get_kq(kn);
7550 boolean_t override_is_sync = FALSE;
7551
7552 if (kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) {
7553 kq_index_t new_qos_index;
7554 kq_index_t new_override_index;
7555 kq_index_t servicer_qos_index;
7556
7557 new_qos_index = qos_index_from_qos(kn, new_qos, FALSE);
7558 new_override_index = qos_index_from_qos(kn, new_override, TRUE);
7559
7560 /* make sure the servicer qos acts as a floor */
7561 servicer_qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
7562 if (servicer_qos_index > new_qos_index)
7563 new_qos_index = servicer_qos_index;
7564 if (servicer_qos_index > new_override_index)
7565 new_override_index = servicer_qos_index;
7566 if (sync_override_index >= new_override_index) {
7567 new_override_index = sync_override_index;
7568 override_is_sync = TRUE;
7569 }
7570
7571 kqlock(kq);
7572 if (new_qos_index != knote_get_req_index(kn) ||
7573 new_override_index != knote_get_qos_override_index(kn) ||
7574 override_is_sync != kn->kn_qos_override_is_sync) {
7575 if (kn->kn_status & KN_QUEUED) {
7576 knote_dequeue(kn);
7577 knote_set_qos_index(kn, new_qos_index);
7578 knote_set_qos_override_index(kn, new_override_index, override_is_sync);
7579 knote_enqueue(kn);
7580 knote_wakeup(kn);
7581 } else {
7582 knote_set_qos_index(kn, new_qos_index);
7583 knote_set_qos_override_index(kn, new_override_index, override_is_sync);
7584 }
7585 }
7586 kqunlock(kq);
7587 }
7588 }
7589
7590 void
7591 knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq)
7592 {
7593 struct kqueue *kq = knote_get_kq(kn);
7594 kq_index_t old_sync_override;
7595 kq_index_t qos_index = knote_get_qos_index(kn);
7596 uint32_t flags = 0;
7597
7598 /* Tracking only happens for UI qos */
7599 if (sync_qos != THREAD_QOS_USER_INTERACTIVE &&
7600 sync_qos != THREAD_QOS_UNSPECIFIED) {
7601 return;
7602 }
7603
7604 if (lock_kq)
7605 kqlock(kq);
7606
7607 if (kq->kq_state & KQ_WORKLOOP) {
7608 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7609
7610 old_sync_override = knote_get_sync_qos_override_index(kn);
7611 if (old_sync_override != sync_qos) {
7612 kn->kn_qos_sync_override = sync_qos;
7613
7614 /* update sync ipc counters for suppressed knotes */
7615 if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
7616 flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
7617
7618 /* Do not recalculate the kqwl override; it will be done later */
7619 flags = flags | KQWL_UO_UPDATE_OVERRIDE_LAZY;
7620
7621 if (sync_qos == THREAD_QOS_USER_INTERACTIVE) {
7622 flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
7623 }
7624
7625 if (old_sync_override == THREAD_QOS_USER_INTERACTIVE) {
7626 flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
7627 }
7628
7629 kqworkloop_update_override(kqwl, qos_index, sync_qos,
7630 flags);
7631 }
7632
7633 }
7634 }
7635 if (lock_kq)
7636 kqunlock(kq);
7637 }
7638
7639 static void
7640 knote_wakeup(struct knote *kn)
7641 {
7642 struct kqueue *kq = knote_get_kq(kn);
7643 kq_index_t qos_index = knote_get_qos_index(kn);
7644
7645 kqlock_held(kq);
7646
7647 if (kq->kq_state & KQ_WORKQ) {
7648 /* request a servicing thread */
7649 struct kqworkq *kqwq = (struct kqworkq *)kq;
7650
7651 kqworkq_request_help(kqwq, qos_index);
7652
7653 } else if (kq->kq_state & KQ_WORKLOOP) {
7654 /* request a servicing thread */
7655 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7656
7657 if (kqworkloop_is_processing_on_current_thread(kqwl)) {
7658 /*
7659 * kqworkloop_end_processing() will perform the required QoS
7660 * computations when it unsets the processing mode.
7661 */
7662 return;
7663 }
7664 kqworkloop_request_help(kqwl, qos_index);
7665 } else {
7666 struct kqfile *kqf = (struct kqfile *)kq;
7667
7668 /* flag wakeups during processing */
7669 if (kq->kq_state & KQ_PROCESSING)
7670 kq->kq_state |= KQ_WAKEUP;
7671
7672 /* wakeup a thread waiting on this queue */
7673 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
7674 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
7675 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7676 KQ_EVENT,
7677 THREAD_AWAKENED,
7678 WAITQ_ALL_PRIORITIES);
7679 }
7680
7681 /* wakeup other kqueues/select sets we're inside */
7682 KNOTE(&kqf->kqf_sel.si_note, 0);
7683 }
7684 }
7685
7686 /*
7687 * Called with the kqueue locked
7688 */
7689 static void
7690 kqueue_interrupt(struct kqueue *kq)
7691 {
7692 assert((kq->kq_state & KQ_WORKQ) == 0);
7693
7694 /* wakeup sleeping threads */
7695 if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
7696 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
7697 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7698 KQ_EVENT,
7699 THREAD_RESTART,
7700 WAITQ_ALL_PRIORITIES);
7701 }
7702
7703 /* wakeup threads waiting their turn to process */
7704 if (kq->kq_state & KQ_PROCWAIT) {
7705 struct kqtailq *suppressq;
7706
7707 assert(kq->kq_state & KQ_PROCESSING);
7708
7709 kq->kq_state &= ~KQ_PROCWAIT;
7710 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
7711 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
7712 CAST_EVENT64_T(suppressq),
7713 THREAD_RESTART,
7714 WAITQ_ALL_PRIORITIES);
7715 }
7716 }
7717
7718 /*
7719 * Called back from waitq code when no threads are waiting and the hook was set.
7720 *
7721 * Interrupts are likely disabled and spin locks are held - minimal work
7722 * can be done in this context!!!
7723 *
7724 * JMM - in the future, this will try to determine which knotes match the
7725 * wait queue wakeup and apply these wakeups against those knotes themselves.
7726 * For now, all the events dispatched this way are dispatch-manager handled,
7727 * so hard-code that for now.
7728 */
7729 void
7730 waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
7731 {
7732 #pragma unused(knote_hook, qos)
7733
7734 struct kqueue *kq = (struct kqueue *)kq_hook;
7735
7736 if (kq->kq_state & KQ_WORKQ) {
7737 struct kqworkq *kqwq = (struct kqworkq *)kq;
7738
7739 kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
7740
7741 } else if (kq->kq_state & KQ_WORKLOOP) {
7742 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
7743
7744 kqworkloop_request_help(kqwl, KQWL_BUCKET_STAYACTIVE);
7745 }
7746 }
7747
7748 void
7749 klist_init(struct klist *list)
7750 {
7751 SLIST_INIT(list);
7752 }
7753
7754
7755 /*
7756 * Query/Post each knote in the object's list
7757 *
7758 * The object lock protects the list. It is assumed
7759 * that the filter/event routine for the object can
7760 * determine that the object is already locked (via
7761 * the hint) and not deadlock itself.
7762 *
7763 * The object lock should also hold off pending
7764 * detach/drop operations. But we'll prevent it here
7765 * too (by taking a use reference) - just in case.
7766 */
7767 void
7768 knote(struct klist *list, long hint)
7769 {
7770 struct knote *kn;
7771
7772 SLIST_FOREACH(kn, list, kn_selnext) {
7773 struct kqueue *kq = knote_get_kq(kn);
7774
7775 kqlock(kq);
7776
7777 assert(!knoteuse_needs_boost(kn, NULL));
7778
7779 /* If we can get a use reference - deliver event */
7780 if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
7781 int result;
7782
7783 /* call the event with only a use count */
7784 result = knote_fops(kn)->f_event(kn, hint);
7785
7786 /* if it's not going away and it triggered */
7787 if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
7788 knote_activate(kn);
7789 /* kq lock held */
7790 }
7791 kqunlock(kq);
7792 }
7793 }
7794
7795 /*
7796 * attach a knote to the specified list. Return true if this is the first entry.
7797 * The list is protected by whatever lock the object it is associated with uses.
7798 */
7799 int
7800 knote_attach(struct klist *list, struct knote *kn)
7801 {
7802 int ret = SLIST_EMPTY(list);
7803 SLIST_INSERT_HEAD(list, kn, kn_selnext);
7804 return (ret);
7805 }
7806
7807 /*
7808 * detach a knote from the specified list. Return true if that was the last entry.
7809 * The list is protected by whatever lock the object it is associated with uses.
7810 */
7811 int
7812 knote_detach(struct klist *list, struct knote *kn)
7813 {
7814 SLIST_REMOVE(list, kn, knote, kn_selnext);
7815 return (SLIST_EMPTY(list));
7816 }
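
/*
 * Hedged usage sketch (not part of the original file): an event source
 * typically embeds a struct klist, protects it with its own lock, links
 * knotes from its filter attach routine with knote_attach(), unlinks them
 * on detach with knote_detach(), and posts events through the KNOTE()
 * macro, which expands to knote().  The "mydev" object, its lock, and the
 * filter routine below are hypothetical.
 *
 *	klist_init(&mydev.klist);			(once, at init time)
 *
 *	static int
 *	filt_mydevattach(struct knote *kn)
 *	{
 *		lck_mtx_lock(&mydev.lock);
 *		(void)knote_attach(&mydev.klist, kn);	(returns 1 if first entry)
 *		lck_mtx_unlock(&mydev.lock);
 *		return (0);				(attach return conventions
 *							 belong to the filter framework)
 *	}
 *
 *	On each state change of the source:
 *
 *	lck_mtx_lock(&mydev.lock);
 *	KNOTE(&mydev.klist, 0);				(calls knote(list, hint))
 *	lck_mtx_unlock(&mydev.lock);
 */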
7817
7818 /*
7819 * knote_vanish - Indicate that the source has vanished
7820 *
7821 * If the knote has requested EV_VANISHED delivery,
7822 * arrange for that. Otherwise, deliver a NOTE_REVOKE
7823 * event for backward compatibility.
7824 *
7825 * The knote is marked as having vanished, but is not
7826 * actually detached from the source in this instance.
7827 * The actual detach is deferred until the knote drop.
7828 *
7829 * Our caller already has the object lock held. Calling
7830 * the detach routine would try to take that lock
7831 * recursively - which likely is not supported.
7832 */
7833 void
7834 knote_vanish(struct klist *list)
7835 {
7836 struct knote *kn;
7837 struct knote *kn_next;
7838
7839 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
7840 struct kqueue *kq = knote_get_kq(kn);
7841 int result;
7842
7843 kqlock(kq);
7844
7845 assert(!knoteuse_needs_boost(kn, NULL));
7846
7847 if ((kn->kn_status & KN_DROPPING) == 0) {
7848 /* If EV_VANISHED delivery was requested - prepare to deliver one */
7849 if (kn->kn_status & KN_REQVANISH) {
7850 kn->kn_status |= KN_VANISHED;
7851 knote_activate(kn);
7852
7853 } else if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
7854 /* call the event with only a use count */
7855 result = knote_fops(kn)->f_event(kn, NOTE_REVOKE);
7856
7857 /* if it's not going away and it triggered */
7858 if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
7859 knote_activate(kn);
7860 /* lock held again */
7861 }
7862 }
7863 kqunlock(kq);
7864 }
7865 }
7866
7867 /*
7868 * For a given knote, link a provided wait queue directly with the kqueue.
7869 * Wakeups will happen via recursive wait queue support. But nothing will move
7870 * the knote to the active list at wakeup (nothing calls knote()). Instead,
7871 * we permanently enqueue them here.
7872 *
7873 * kqueue and knote references are held by caller.
7874 * waitq locked by caller.
7875 *
7876 * caller provides the wait queue link structure.
7877 */
7878 int
7879 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
7880 {
7881 struct kqueue *kq = knote_get_kq(kn);
7882 kern_return_t kr;
7883
7884 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
7885 if (kr == KERN_SUCCESS) {
7886 knote_markstayactive(kn);
7887 return (0);
7888 } else {
7889 return (EINVAL);
7890 }
7891 }
7892
7893 /*
7894 * Unlink the provided wait queue from the kqueue associated with a knote.
7895 * Also remove it from the magic list of directly attached knotes.
7896 *
7897 * Note that the unlink may have already happened from the other side, so
7898 * ignore any failures to unlink and just remove it from the kqueue list.
7899 *
7900 * On success, caller is responsible for the link structure
7901 */
7902 int
7903 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
7904 {
7905 struct kqueue *kq = knote_get_kq(kn);
7906 kern_return_t kr;
7907
7908 kr = waitq_unlink(wq, &kq->kq_wqs);
7909 knote_clearstayactive(kn);
7910 return ((kr != KERN_SUCCESS) ? EINVAL : 0);
7911 }
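
/*
 * Hedged usage sketch (not part of the original file): a filter whose
 * source is already backed by a struct waitq can wire that waitq into the
 * kqueue's wait queue set instead of calling knote() on every wakeup.
 * Only knote_link_waitq() and knote_unlink_waitq() are defined here; the
 * waitq_link_reserve()/waitq_link_release() and waitq_lock()/waitq_unlock()
 * calls below are assumptions about the waitq KPI.
 *
 *	uint64_t link = waitq_link_reserve(wq);		(pre-allocate the link)
 *	waitq_lock(wq);
 *	error = knote_link_waitq(kn, wq, &link);	(marks the knote stay-active)
 *	waitq_unlock(wq);
 *	if (link != 0)
 *		waitq_link_release(link);		(if the reservation was not consumed)
 *
 *	and on detach:
 *
 *	error = knote_unlink_waitq(kn, wq);		(clears stay-active)
 */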
7912
7913 /*
7914 * remove all knotes referencing a specified fd
7915 *
7916 * Essentially an inlined knote_remove & knote_drop
7917 * when we know for sure that the thing is a file
7918 *
7919 * Entered with the proc_fd lock already held.
7920 * It returns the same way, but may drop it temporarily.
7921 */
7922 void
7923 knote_fdclose(struct proc *p, int fd, int force)
7924 {
7925 struct klist *list;
7926 struct knote *kn;
7927
7928 restart:
7929 list = &p->p_fd->fd_knlist[fd];
7930 SLIST_FOREACH(kn, list, kn_link) {
7931 struct kqueue *kq = knote_get_kq(kn);
7932
7933 kqlock(kq);
7934
7935 if (kq->kq_p != p)
7936 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
7937 __func__, kq->kq_p, p);
7938
7939 /*
7940 * If the knote supports EV_VANISHED delivery,
7941 * transition it to vanished mode (or skip over
7942 * it if already vanished).
7943 */
7944 if (!force && (kn->kn_status & KN_REQVANISH)) {
7945
7946 if ((kn->kn_status & KN_VANISHED) == 0) {
7947 proc_fdunlock(p);
7948
7949 assert(!knoteuse_needs_boost(kn, NULL));
7950
7951 /* get detach reference (also marks vanished) */
7952 if (kqlock2knotedetach(kq, kn, KNUSE_NONE)) {
7953 /* detach knote and drop fp use reference */
7954 knote_fops(kn)->f_detach(kn);
7955 if (knote_fops(kn)->f_isfd)
7956 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
7957
7958 /* activate it if it's still in existence */
7959 if (knoteuse2kqlock(kq, kn, KNUSE_NONE)) {
7960 knote_activate(kn);
7961 }
7962 kqunlock(kq);
7963 }
7964 proc_fdlock(p);
7965 goto restart;
7966 } else {
7967 kqunlock(kq);
7968 continue;
7969 }
7970 }
7971
7972 proc_fdunlock(p);
7973
7974 /*
7975 * Convert the kq lock to a drop ref.
7976 * If we get it, go ahead and drop it.
7977 * Otherwise, we waited for the blocking
7978 * condition to complete. Either way,
7979 * we dropped the fdlock so start over.
7980 */
7981 if (kqlock2knotedrop(kq, kn)) {
7982 knote_drop(kn, p);
7983 }
7984
7985 proc_fdlock(p);
7986 goto restart;
7987 }
7988 }
7989
7990 /*
7991 * knote_fdfind - lookup a knote in the fd table for process
7992 *
7993 * If the filter is file-based, lookup based on fd index.
7994 * Otherwise use a hash based on the ident.
7995 *
7996 * Matching is based on kq, filter, and ident. Optionally,
7997 * it may also be based on the udata field in the kevent -
7998 * allowing multiple event registration for the file object
7999 * per kqueue.
8000 *
8001 * fd_knhashlock or fdlock held on entry (and exit)
8002 */
8003 static struct knote *
8004 knote_fdfind(struct kqueue *kq,
8005 struct kevent_internal_s *kev,
8006 bool is_fd,
8007 struct proc *p)
8008 {
8009 struct filedesc *fdp = p->p_fd;
8010 struct klist *list = NULL;
8011 struct knote *kn = NULL;
8012
8013 /*
8014 * determine where to look for the knote
8015 */
8016 if (is_fd) {
8017 /* fd-based knotes are linked off the fd table */
8018 if (kev->ident < (u_int)fdp->fd_knlistsize) {
8019 list = &fdp->fd_knlist[kev->ident];
8020 }
8021 } else if (fdp->fd_knhashmask != 0) {
8022 /* hash non-fd knotes here too */
8023 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
8024 }
8025
8026 /*
8027 * scan the selected list looking for a match
8028 */
8029 if (list != NULL) {
8030 SLIST_FOREACH(kn, list, kn_link) {
8031 if (kq == knote_get_kq(kn) &&
8032 kev->ident == kn->kn_id &&
8033 kev->filter == kn->kn_filter) {
8034 if (kev->flags & EV_UDATA_SPECIFIC) {
8035 if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
8036 kev->udata == kn->kn_udata) {
8037 break; /* matching udata-specific knote */
8038 }
8039 } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
8040 break; /* matching non-udata-specific knote */
8041 }
8042 }
8043 }
8044 }
8045 return kn;
8046 }
8047
8048 /*
8049 * kq_add_knote - Add a knote to the fd table for the process
8050 * while checking for duplicates.
8051 *
8052 * All file-based filters associate a list of knotes by file
8053 * descriptor index. All other filters hash the knote by ident.
8054 *
8055 * May have to grow the table of knote lists to cover the
8056 * file descriptor index presented.
8057 *
8058 * fd_knhashlock and fdlock unheld on entry (and exit).
8059 *
8060 * Takes a rwlock boost if inserting the knote is successful.
8061 */
8062 static int
8063 kq_add_knote(struct kqueue *kq, struct knote *kn,
8064 struct kevent_internal_s *kev,
8065 struct proc *p, int *knoteuse_flags)
8066 {
8067 struct filedesc *fdp = p->p_fd;
8068 struct klist *list = NULL;
8069 int ret = 0;
8070 bool is_fd = knote_fops(kn)->f_isfd;
8071
8072 if (is_fd)
8073 proc_fdlock(p);
8074 else
8075 knhash_lock(p);
8076
8077 if (knote_fdfind(kq, kev, is_fd, p) != NULL) {
8078 /* found an existing knote: we can't add this one */
8079 ret = ERESTART;
8080 goto out_locked;
8081 }
8082
8083 /* knote was not found: add it now */
8084 if (!is_fd) {
8085 if (fdp->fd_knhashmask == 0) {
8086 u_long size = 0;
8087
8088 list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
8089 &size);
8090 if (list == NULL) {
8091 ret = ENOMEM;
8092 goto out_locked;
8093 }
8094
8095 fdp->fd_knhash = list;
8096 fdp->fd_knhashmask = size;
8097 }
8098
8099 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
8100 SLIST_INSERT_HEAD(list, kn, kn_link);
8101 ret = 0;
8102 goto out_locked;
8103
8104 } else {
8105 /* knote is fd based */
8106
8107 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
8108 u_int size = 0;
8109
8110 if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
8111 || kn->kn_id >= (uint64_t)maxfiles) {
8112 ret = EINVAL;
8113 goto out_locked;
8114 }
8115 /* have to grow the fd_knlist */
8116 size = fdp->fd_knlistsize;
8117 while (size <= kn->kn_id)
8118 size += KQEXTENT;
8119
8120 if (size >= (UINT_MAX/sizeof(struct klist *))) {
8121 ret = EINVAL;
8122 goto out_locked;
8123 }
8124
8125 MALLOC(list, struct klist *,
8126 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
8127 if (list == NULL) {
8128 ret = ENOMEM;
8129 goto out_locked;
8130 }
8131
8132 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
8133 fdp->fd_knlistsize * sizeof(struct klist *));
8134 bzero((caddr_t)list +
8135 fdp->fd_knlistsize * sizeof(struct klist *),
8136 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
8137 FREE(fdp->fd_knlist, M_KQUEUE);
8138 fdp->fd_knlist = list;
8139 fdp->fd_knlistsize = size;
8140 }
8141
8142 list = &fdp->fd_knlist[kn->kn_id];
8143 SLIST_INSERT_HEAD(list, kn, kn_link);
8144 ret = 0;
8145 goto out_locked;
8146
8147 }
8148
8149 out_locked:
8150 if (ret == 0 && knoteuse_needs_boost(kn, kev)) {
8151 set_thread_rwlock_boost();
8152 *knoteuse_flags = KNUSE_BOOST;
8153 } else {
8154 *knoteuse_flags = KNUSE_NONE;
8155 }
8156 if (is_fd)
8157 proc_fdunlock(p);
8158 else
8159 knhash_unlock(p);
8160
8161 return ret;
8162 }
8163
8164 /*
8165 * kq_remove_knote - remove a knote from the fd table for the process
8166 * and copy kn_status and kq_state while holding the kqlock and
8167 * fd table locks.
8168 *
8169 * If the filter is file-based, remove based on fd index.
8170 * Otherwise remove from the hash based on the ident.
8171 *
8172 * fd_knhashlock and fdlock unheld on entry (and exit).
8173 */
8174 static void
8175 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
8176 kn_status_t *kn_status, uint16_t *kq_state)
8177 {
8178 struct filedesc *fdp = p->p_fd;
8179 struct klist *list = NULL;
8180 bool is_fd;
8181
8182 is_fd = knote_fops(kn)->f_isfd;
8183
8184 if (is_fd)
8185 proc_fdlock(p);
8186 else
8187 knhash_lock(p);
8188
8189 if (is_fd) {
8190 assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
8191 list = &fdp->fd_knlist[kn->kn_id];
8192 } else {
8193 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
8194 }
8195 SLIST_REMOVE(list, kn, knote, kn_link);
8196
8197 kqlock(kq);
8198 *kn_status = kn->kn_status;
8199 *kq_state = kq->kq_state;
8200 kqunlock(kq);
8201
8202 if (is_fd)
8203 proc_fdunlock(p);
8204 else
8205 knhash_unlock(p);
8206 }
8207
8208 /*
8209 * kq_find_knote_and_kq_lock - lookup a knote in the fd table for process
8210 * and, if the knote is found, acquires the kqlock while holding the fd table lock/spinlock.
8211 *
8212 * fd_knhashlock or fdlock unheld on entry (and exit)
8213 */
8214
8215 static struct knote *
8216 kq_find_knote_and_kq_lock(struct kqueue *kq,
8217 struct kevent_internal_s *kev,
8218 bool is_fd,
8219 struct proc *p)
8220 {
8221 struct knote * ret;
8222
8223 if (is_fd)
8224 proc_fdlock(p);
8225 else
8226 knhash_lock(p);
8227
8228 ret = knote_fdfind(kq, kev, is_fd, p);
8229
8230 if (ret) {
8231 kqlock(kq);
8232 }
8233
8234 if (is_fd)
8235 proc_fdunlock(p);
8236 else
8237 knhash_unlock(p);
8238
8239 return ret;
8240 }
8241 /*
8242 * knote_drop - disconnect and drop the knote
8243 *
8244 * Called with the kqueue unlocked and holding a
8245 * "drop reference" on the knote in question.
8246 * This reference is most often acquired through a call
8247 * to kqlock2knotedrop(). But it can also be acquired
8248 * through stealing a drop reference via a call to
8249 * knoteuse2knotedrop() or during the initial attach
8250 * of the knote.
8251 *
8252 * The knote may have already been detached from
8253 * (or not yet attached to) its source object.
8254 */
8255 static void
8256 knote_drop(struct knote *kn, __unused struct proc *ctxp)
8257 {
8258 struct kqueue *kq = knote_get_kq(kn);
8259 struct proc *p = kq->kq_p;
8260 kn_status_t kn_status;
8261 uint16_t kq_state;
8262
8263 /* If we are attached, disconnect from the source first */
8264 if (kn->kn_status & KN_ATTACHED) {
8265 knote_fops(kn)->f_detach(kn);
8266 }
8267
8268 /* Remove the source from the appropriate hash */
8269 kq_remove_knote(kq, kn, p, &kn_status, &kq_state);
8270
8271 /*
8272 * If a kqueue_dealloc is happening in parallel for the kq
8273 * pointed to by the knote, the kq could already be deallocated
8274 * at this point.
8275 * Do not access the kq after the kq_remove_knote if it is
8276 * not a KQ_DYNAMIC.
8277 */
8278
8279 /* determine if anyone needs to know about the drop */
8280 assert((kn_status & (KN_DROPPING | KN_SUPPRESSED | KN_QUEUED)) == KN_DROPPING);
8281
8282 /*
8283 * If KN_USEWAIT is set, some other thread was trying to drop the kn.
8284 * Either that thread was in kqueue_dealloc, in which case the dealloc
8285 * has not happened yet because it is waiting on this wakeup, or the
8286 * drop came from a kevent_register that holds a reference on the kq,
8287 * so the kq cannot be deallocated in parallel.
8288 *
8289 * Either way, it is safe to access kq->kq_wqs when KN_USEWAIT was set.
8290 */
8291 if (kn_status & KN_USEWAIT)
8292 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
8293 CAST_EVENT64_T(&kn->kn_status),
8294 THREAD_RESTART,
8295 WAITQ_ALL_PRIORITIES);
8296
8297 if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
8298 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
8299
8300 knote_free(kn);
8301
8302 /*
8303 * release reference on dynamic kq (and free if last).
8304 * Will only be last if this is from fdfree, etc...
8305 * because otherwise processing thread has reference.
8306 */
8307 if (kq_state & KQ_DYNAMIC)
8308 kqueue_release_last(p, kq);
8309 }
8310
8311 /* called with kqueue lock held */
8312 static void
8313 knote_activate(struct knote *kn)
8314 {
8315 if (kn->kn_status & KN_ACTIVE)
8316 return;
8317
8318 KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_ACTIVATE),
8319 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
8320 kn->kn_filtid);
8321
8322 kn->kn_status |= KN_ACTIVE;
8323 if (knote_enqueue(kn))
8324 knote_wakeup(kn);
8325 }
8326
8327 /* called with kqueue lock held */
8328 static void
8329 knote_deactivate(struct knote *kn)
8330 {
8331 kn->kn_status &= ~KN_ACTIVE;
8332 if ((kn->kn_status & KN_STAYACTIVE) == 0)
8333 knote_dequeue(kn);
8334 }
8335
8336 /* called with kqueue lock held */
8337 static void
8338 knote_enable(struct knote *kn)
8339 {
8340 if ((kn->kn_status & KN_DISABLED) == 0)
8341 return;
8342
8343 kn->kn_status &= ~KN_DISABLED;
8344
8345 if (kn->kn_status & KN_SUPPRESSED) {
8346 /* Clear the sync qos on the knote */
8347 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
8348
8349 /*
8350 * it is possible for userland to have knotes registered for a given
8351 * workloop `wl_orig` but really handled on another workloop `wl_new`.
8352 *
8353 * In that case, rearming will happen from the servicer thread of
8354 * `wl_new` which if `wl_orig` is no longer being serviced, would cause
8355 * this knote to stay suppressed forever if we only relied on
8356 * kqworkloop_acknowledge_events to be called by `wl_orig`.
8357 *
8358 * However, if we see the KQ_PROCESSING bit set on `wl_orig`, we can't
8359 * unsuppress because that would mess with the processing phase of
8360 * `wl_orig`; it also means kqworkloop_acknowledge_events() will be
8361 * called there, which takes care of unsuppressing this knote.
8362 */
8363 struct kqueue *kq = knote_get_kq(kn);
8364 if ((kq->kq_state & KQ_PROCESSING) == 0) {
8365 knote_unsuppress(kn);
8366 }
8367 } else if (knote_enqueue(kn)) {
8368 knote_wakeup(kn);
8369 }
8370 }
8371
8372 /* called with kqueue lock held */
8373 static void
8374 knote_disable(struct knote *kn)
8375 {
8376 if (kn->kn_status & KN_DISABLED)
8377 return;
8378
8379 kn->kn_status |= KN_DISABLED;
8380 knote_dequeue(kn);
8381 }
8382
8383 /* called with kqueue lock held */
8384 static void
8385 knote_suppress(struct knote *kn)
8386 {
8387 struct kqtailq *suppressq;
8388 struct kqueue *kq = knote_get_kq(kn);
8389
8390 kqlock_held(kq);
8391
8392 if (kn->kn_status & KN_SUPPRESSED)
8393 return;
8394
8395 knote_dequeue(kn);
8396 kn->kn_status |= KN_SUPPRESSED;
8397 suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
8398 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
8399
8400 if ((kq->kq_state & KQ_WORKLOOP) &&
8401 knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
8402 kn->kn_qos_override_is_sync) {
8403 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8404 /* update the sync qos override counter for suppressed knotes */
8405 kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
8406 knote_get_qos_override_index(kn),
8407 (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI));
8408 }
8409 }
8410
8411 /* called with kqueue lock held */
8412 static void
8413 knote_unsuppress(struct knote *kn)
8414 {
8415 struct kqtailq *suppressq;
8416 struct kqueue *kq = knote_get_kq(kn);
8417
8418 kqlock_held(kq);
8419
8420 if ((kn->kn_status & KN_SUPPRESSED) == 0)
8421 return;
8422
8423 /* Clear the sync qos on the knote */
8424 knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
8425
8426 kn->kn_status &= ~KN_SUPPRESSED;
8427 suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
8428 TAILQ_REMOVE(suppressq, kn, kn_tqe);
8429
8430 /* update the in-use qos to equal the requested qos */
8431 kn->kn_qos_index = kn->kn_req_index;
8432
8433 /* don't wakeup if unsuppressing just a stay-active knote */
8434 if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
8435 knote_wakeup(kn);
8436 }
8437
8438 if ((kq->kq_state & KQ_WORKLOOP) && !(kq->kq_state & KQ_NO_WQ_THREAD) &&
8439 knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
8440 kn->kn_qos_override_is_sync) {
8441 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8442
8443 /* update the sync qos override counter for suppressed knotes */
8444 kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
8445 knote_get_qos_override_index(kn),
8446 (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI));
8447 }
8448
8449 if (TAILQ_EMPTY(suppressq) && (kq->kq_state & KQ_WORKLOOP) &&
8450 !(kq->kq_state & KQ_NO_WQ_THREAD)) {
8451 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8452 if (kqworkloop_is_processing_on_current_thread(kqwl)) {
8453 /*
8454 * kqworkloop_end_processing() will perform the required QoS
8455 * computations when it unsets the processing mode.
8456 */
8457 } else {
8458 kqwl_req_lock(kqwl);
8459 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0);
8460 kqwl_req_unlock(kqwl);
8461 }
8462 }
8463 }
8464
8465 /* called with kqueue lock held */
8466 static void
8467 knote_update_sync_override_state(struct knote *kn)
8468 {
8469 struct kqtailq *queue = knote_get_queue(kn);
8470 struct kqueue *kq = knote_get_kq(kn);
8471
8472 if (!(kq->kq_state & KQ_WORKLOOP) ||
8473 knote_get_queue_index(kn) != THREAD_QOS_USER_INTERACTIVE)
8474 return;
8475
8476 /* Update the sync ipc state on workloop */
8477 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
8478 boolean_t sync_ipc_override = FALSE;
8479 if (!TAILQ_EMPTY(queue)) {
8480 struct knote *kn_head = TAILQ_FIRST(queue);
8481 if (kn_head->kn_qos_override_is_sync)
8482 sync_ipc_override = TRUE;
8483 }
8484 kqworkloop_update_sync_override_state(kqwl, sync_ipc_override);
8485 }
8486
8487 /* called with kqueue lock held */
8488 static int
8489 knote_enqueue(struct knote *kn)
8490 {
8491 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
8492 (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)))
8493 return 0;
8494
8495 if ((kn->kn_status & KN_QUEUED) == 0) {
8496 struct kqtailq *queue = knote_get_queue(kn);
8497 struct kqueue *kq = knote_get_kq(kn);
8498
8499 kqlock_held(kq);
8500 /* insert at head for sync ipc waiters */
8501 if (kn->kn_qos_override_is_sync) {
8502 TAILQ_INSERT_HEAD(queue, kn, kn_tqe);
8503 } else {
8504 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
8505 }
8506 kn->kn_status |= KN_QUEUED;
8507 kq->kq_count++;
8508 knote_update_sync_override_state(kn);
8509 return 1;
8510 }
8511 return ((kn->kn_status & KN_STAYACTIVE) != 0);
8512 }
8513
8514
8515 /* called with kqueue lock held */
8516 static void
8517 knote_dequeue(struct knote *kn)
8518 {
8519 struct kqueue *kq = knote_get_kq(kn);
8520 struct kqtailq *queue;
8521
8522 kqlock_held(kq);
8523
8524 if ((kn->kn_status & KN_QUEUED) == 0)
8525 return;
8526
8527 queue = knote_get_queue(kn);
8528 TAILQ_REMOVE(queue, kn, kn_tqe);
8529 kn->kn_status &= ~KN_QUEUED;
8530 kq->kq_count--;
8531 knote_update_sync_override_state(kn);
8532 }
8533
8534 void
8535 knote_init(void)
8536 {
8537 knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
8538 8192, "knote zone");
8539
8540 kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile),
8541 8192, "kqueue file zone");
8542
8543 kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq),
8544 8192, "kqueue workq zone");
8545
8546 kqworkloop_zone = zinit(sizeof(struct kqworkloop), 8192*sizeof(struct kqworkloop),
8547 8192, "kqueue workloop zone");
8548
8549 /* allocate kq lock group attribute and group */
8550 kq_lck_grp_attr = lck_grp_attr_alloc_init();
8551
8552 kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
8553
8554 /* Allocate kq lock attribute */
8555 kq_lck_attr = lck_attr_alloc_init();
8556
8557 /* Initialize the timer filter lock */
8558 lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
8559
8560 /* Initialize the user filter lock */
8561 lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr);
8562
8563 #if CONFIG_MEMORYSTATUS
8564 /* Initialize the memorystatus list lock */
8565 memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
8566 #endif
8567 }
8568 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
8569
8570 const struct filterops *
8571 knote_fops(struct knote *kn)
8572 {
8573 return sysfilt_ops[kn->kn_filtid];
8574 }
8575
8576 static struct knote *
8577 knote_alloc(void)
8578 {
8579 struct knote *kn;
8580 kn = ((struct knote *)zalloc(knote_zone));
8581 *kn = (struct knote) { .kn_qos_override = 0, .kn_qos_sync_override = 0, .kn_qos_override_is_sync = 0 };
8582 return kn;
8583 }
8584
8585 static void
8586 knote_free(struct knote *kn)
8587 {
8588 zfree(knote_zone, kn);
8589 }
8590
8591 #if SOCKETS
8592 #include <sys/param.h>
8593 #include <sys/socket.h>
8594 #include <sys/protosw.h>
8595 #include <sys/domain.h>
8596 #include <sys/mbuf.h>
8597 #include <sys/kern_event.h>
8598 #include <sys/malloc.h>
8599 #include <sys/sys_domain.h>
8600 #include <sys/syslog.h>
8601
8602 #ifndef ROUNDUP64
8603 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
8604 #endif
8605
8606 #ifndef ADVANCE64
8607 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
8608 #endif
8609
8610 static lck_grp_attr_t *kev_lck_grp_attr;
8611 static lck_attr_t *kev_lck_attr;
8612 static lck_grp_t *kev_lck_grp;
8613 static decl_lck_rw_data(,kev_lck_data);
8614 static lck_rw_t *kev_rwlock = &kev_lck_data;
8615
8616 static int kev_attach(struct socket *so, int proto, struct proc *p);
8617 static int kev_detach(struct socket *so);
8618 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
8619 struct ifnet *ifp, struct proc *p);
8620 static lck_mtx_t * event_getlock(struct socket *, int);
8621 static int event_lock(struct socket *, int, void *);
8622 static int event_unlock(struct socket *, int, void *);
8623
8624 static int event_sofreelastref(struct socket *);
8625 static void kev_delete(struct kern_event_pcb *);
8626
8627 static struct pr_usrreqs event_usrreqs = {
8628 .pru_attach = kev_attach,
8629 .pru_control = kev_control,
8630 .pru_detach = kev_detach,
8631 .pru_soreceive = soreceive,
8632 };
8633
8634 static struct protosw eventsw[] = {
8635 {
8636 .pr_type = SOCK_RAW,
8637 .pr_protocol = SYSPROTO_EVENT,
8638 .pr_flags = PR_ATOMIC,
8639 .pr_usrreqs = &event_usrreqs,
8640 .pr_lock = event_lock,
8641 .pr_unlock = event_unlock,
8642 .pr_getlock = event_getlock,
8643 }
8644 };
8645
8646 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
8647 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
8648
8649 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
8650 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
8651
8652 struct kevtstat kevtstat;
8653 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
8654 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8655 kevt_getstat, "S,kevtstat", "");
8656
8657 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
8658 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
8659 kevt_pcblist, "S,xkevtpcb", "");
8660
8661 static lck_mtx_t *
8662 event_getlock(struct socket *so, int flags)
8663 {
8664 #pragma unused(flags)
8665 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8666
8667 if (so->so_pcb != NULL) {
8668 if (so->so_usecount < 0)
8669 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8670 so, so->so_usecount, solockhistory_nr(so));
8671 /* NOTREACHED */
8672 } else {
8673 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
8674 so, solockhistory_nr(so));
8675 /* NOTREACHED */
8676 }
8677 return (&ev_pcb->evp_mtx);
8678 }
8679
8680 static int
8681 event_lock(struct socket *so, int refcount, void *lr)
8682 {
8683 void *lr_saved;
8684
8685 if (lr == NULL)
8686 lr_saved = __builtin_return_address(0);
8687 else
8688 lr_saved = lr;
8689
8690 if (so->so_pcb != NULL) {
8691 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8692 } else {
8693 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
8694 so, lr_saved, solockhistory_nr(so));
8695 /* NOTREACHED */
8696 }
8697
8698 if (so->so_usecount < 0) {
8699 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
8700 so, so->so_pcb, lr_saved, so->so_usecount,
8701 solockhistory_nr(so));
8702 /* NOTREACHED */
8703 }
8704
8705 if (refcount)
8706 so->so_usecount++;
8707
8708 so->lock_lr[so->next_lock_lr] = lr_saved;
8709 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
8710 return (0);
8711 }
8712
8713 static int
8714 event_unlock(struct socket *so, int refcount, void *lr)
8715 {
8716 void *lr_saved;
8717 lck_mtx_t *mutex_held;
8718
8719 if (lr == NULL)
8720 lr_saved = __builtin_return_address(0);
8721 else
8722 lr_saved = lr;
8723
8724 if (refcount) {
8725 so->so_usecount--;
8726 }
8727 if (so->so_usecount < 0) {
8728 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
8729 so, so->so_usecount, solockhistory_nr(so));
8730 /* NOTREACHED */
8731 }
8732 if (so->so_pcb == NULL) {
8733 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
8734 so, so->so_usecount, (void *)lr_saved,
8735 solockhistory_nr(so));
8736 /* NOTREACHED */
8737 }
8738 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
8739
8740 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
8741 so->unlock_lr[so->next_unlock_lr] = lr_saved;
8742 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
8743
8744 if (so->so_usecount == 0) {
8745 VERIFY(so->so_flags & SOF_PCBCLEARING);
8746 event_sofreelastref(so);
8747 } else {
8748 lck_mtx_unlock(mutex_held);
8749 }
8750
8751 return (0);
8752 }
8753
8754 static int
8755 event_sofreelastref(struct socket *so)
8756 {
8757 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
8758
8759 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
8760
8761 so->so_pcb = NULL;
8762
8763 /*
8764 * Disable upcall in the event another thread is in kev_post_msg()
8765 * appending a record to the receive socket buffer, since sbwakeup()
8766 * may release the socket lock otherwise.
8767 */
8768 so->so_rcv.sb_flags &= ~SB_UPCALL;
8769 so->so_snd.sb_flags &= ~SB_UPCALL;
8770 so->so_event = sonullevent;
8771 lck_mtx_unlock(&(ev_pcb->evp_mtx));
8772
8773 LCK_MTX_ASSERT(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
8774 lck_rw_lock_exclusive(kev_rwlock);
8775 LIST_REMOVE(ev_pcb, evp_link);
8776 kevtstat.kes_pcbcount--;
8777 kevtstat.kes_gencnt++;
8778 lck_rw_done(kev_rwlock);
8779 kev_delete(ev_pcb);
8780
8781 sofreelastref(so, 1);
8782 return (0);
8783 }
8784
8785 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
8786
8787 static
8788 struct kern_event_head kern_event_head;
8789
8790 static u_int32_t static_event_id = 0;
8791
8792 #define EVPCB_ZONE_MAX 65536
8793 #define EVPCB_ZONE_NAME "kerneventpcb"
8794 static struct zone *ev_pcb_zone;
8795
8796 /*
8797 * Install the protosw's for the NKE manager. Invoked at extension load time
8798 */
8799 void
8800 kern_event_init(struct domain *dp)
8801 {
8802 struct protosw *pr;
8803 int i;
8804
8805 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
8806 VERIFY(dp == systemdomain);
8807
8808 kev_lck_grp_attr = lck_grp_attr_alloc_init();
8809 if (kev_lck_grp_attr == NULL) {
8810 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
8811 /* NOTREACHED */
8812 }
8813
8814 kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
8815 kev_lck_grp_attr);
8816 if (kev_lck_grp == NULL) {
8817 panic("%s: lck_grp_alloc_init failed\n", __func__);
8818 /* NOTREACHED */
8819 }
8820
8821 kev_lck_attr = lck_attr_alloc_init();
8822 if (kev_lck_attr == NULL) {
8823 panic("%s: lck_attr_alloc_init failed\n", __func__);
8824 /* NOTREACHED */
8825 }
8826
8827 lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
8828 if (kev_rwlock == NULL) {
8829 panic("%s: lck_rw_init failed\n", __func__);
8830 /* NOTREACHED */
8831 }
8832
8833 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
8834 net_add_proto(pr, dp, 1);
8835
8836 ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
8837 EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
8838 if (ev_pcb_zone == NULL) {
8839 panic("%s: failed allocating ev_pcb_zone", __func__);
8840 /* NOTREACHED */
8841 }
8842 zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
8843 zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
8844 }
8845
8846 static int
8847 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
8848 {
8849 int error = 0;
8850 struct kern_event_pcb *ev_pcb;
8851
8852 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
8853 if (error != 0)
8854 return (error);
8855
8856 if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
8857 return (ENOBUFS);
8858 }
8859 bzero(ev_pcb, sizeof(struct kern_event_pcb));
8860 lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
8861
8862 ev_pcb->evp_socket = so;
8863 ev_pcb->evp_vendor_code_filter = 0xffffffff;
8864
8865 so->so_pcb = (caddr_t) ev_pcb;
8866 lck_rw_lock_exclusive(kev_rwlock);
8867 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
8868 kevtstat.kes_pcbcount++;
8869 kevtstat.kes_gencnt++;
8870 lck_rw_done(kev_rwlock);
8871
8872 return (error);
8873 }
8874
8875 static void
8876 kev_delete(struct kern_event_pcb *ev_pcb)
8877 {
8878 VERIFY(ev_pcb != NULL);
8879 lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
8880 zfree(ev_pcb_zone, ev_pcb);
8881 }
8882
8883 static int
8884 kev_detach(struct socket *so)
8885 {
8886 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
8887
8888 if (ev_pcb != NULL) {
8889 soisdisconnected(so);
8890 so->so_flags |= SOF_PCBCLEARING;
8891 }
8892
8893 return (0);
8894 }
8895
8896 /*
8897 * For now, kev_vendor_code and mbuf_tags use the same
8898 * mechanism.
8899 */
8900 errno_t kev_vendor_code_find(
8901 const char *string,
8902 u_int32_t *out_vendor_code)
8903 {
8904 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
8905 return (EINVAL);
8906 }
8907 return (net_str_id_find_internal(string, out_vendor_code,
8908 NSI_VENDOR_CODE, 1));
8909 }
8910
8911 errno_t
8912 kev_msg_post(struct kev_msg *event_msg)
8913 {
8914 mbuf_tag_id_t min_vendor, max_vendor;
8915
8916 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
8917
8918 if (event_msg == NULL)
8919 return (EINVAL);
8920
8921 /*
8922 * Limit third parties to posting events for registered vendor codes
8923 * only
8924 */
8925 if (event_msg->vendor_code < min_vendor ||
8926 event_msg->vendor_code > max_vendor) {
8927 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
8928 return (EINVAL);
8929 }
8930 return (kev_post_msg(event_msg));
8931 }
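
/*
 * Hedged usage sketch (not part of the original file): an in-kernel client
 * typically resolves its vendor code once with kev_vendor_code_find() and
 * then fills in a struct kev_msg before calling kev_msg_post().  The vendor
 * string, class/subclass/event values and payload below are hypothetical;
 * note that the header plus all data vectors must fit in a single mbuf
 * (see the MLEN check in kev_post_msg() below).
 *
 *	struct kev_msg ev_msg;
 *	u_int32_t vendor_code;
 *	struct my_payload payload;			(hypothetical payload)
 *
 *	if (kev_vendor_code_find("com.example.driver", &vendor_code) != 0)
 *		return;
 *
 *	bzero(&ev_msg, sizeof(ev_msg));
 *	ev_msg.vendor_code = vendor_code;
 *	ev_msg.kev_class = 1;				(vendor-defined class)
 *	ev_msg.kev_subclass = 1;			(vendor-defined subclass)
 *	ev_msg.event_code = 1;				(vendor-defined event)
 *	ev_msg.dv[0].data_length = sizeof(payload);	(up to 5 vectors,
 *	ev_msg.dv[0].data_ptr = &payload;		 terminated by length 0)
 *	(void)kev_msg_post(&ev_msg);
 */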
8932
8933 int
8934 kev_post_msg(struct kev_msg *event_msg)
8935 {
8936 struct mbuf *m, *m2;
8937 struct kern_event_pcb *ev_pcb;
8938 struct kern_event_msg *ev;
8939 char *tmp;
8940 u_int32_t total_size;
8941 int i;
8942
8943 /* Verify the message is small enough to fit in one mbuf w/o cluster */
8944 total_size = KEV_MSG_HEADER_SIZE;
8945
8946 for (i = 0; i < 5; i++) {
8947 if (event_msg->dv[i].data_length == 0)
8948 break;
8949 total_size += event_msg->dv[i].data_length;
8950 }
8951
8952 if (total_size > MLEN) {
8953 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
8954 return (EMSGSIZE);
8955 }
8956
8957 m = m_get(M_WAIT, MT_DATA);
8958 if (m == 0) {
8959 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
8960 return (ENOMEM);
8961 }
8962 ev = mtod(m, struct kern_event_msg *);
8963 total_size = KEV_MSG_HEADER_SIZE;
8964
8965 tmp = (char *) &ev->event_data[0];
8966 for (i = 0; i < 5; i++) {
8967 if (event_msg->dv[i].data_length == 0)
8968 break;
8969
8970 total_size += event_msg->dv[i].data_length;
8971 bcopy(event_msg->dv[i].data_ptr, tmp,
8972 event_msg->dv[i].data_length);
8973 tmp += event_msg->dv[i].data_length;
8974 }
8975
8976 ev->id = ++static_event_id;
8977 ev->total_size = total_size;
8978 ev->vendor_code = event_msg->vendor_code;
8979 ev->kev_class = event_msg->kev_class;
8980 ev->kev_subclass = event_msg->kev_subclass;
8981 ev->event_code = event_msg->event_code;
8982
8983 m->m_len = total_size;
8984 lck_rw_lock_shared(kev_rwlock);
8985 for (ev_pcb = LIST_FIRST(&kern_event_head);
8986 ev_pcb;
8987 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
8988 lck_mtx_lock(&ev_pcb->evp_mtx);
8989 if (ev_pcb->evp_socket->so_pcb == NULL) {
8990 lck_mtx_unlock(&ev_pcb->evp_mtx);
8991 continue;
8992 }
8993 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
8994 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
8995 lck_mtx_unlock(&ev_pcb->evp_mtx);
8996 continue;
8997 }
8998
8999 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
9000 if (ev_pcb->evp_class_filter != ev->kev_class) {
9001 lck_mtx_unlock(&ev_pcb->evp_mtx);
9002 continue;
9003 }
9004
9005 if ((ev_pcb->evp_subclass_filter !=
9006 KEV_ANY_SUBCLASS) &&
9007 (ev_pcb->evp_subclass_filter !=
9008 ev->kev_subclass)) {
9009 lck_mtx_unlock(&ev_pcb->evp_mtx);
9010 continue;
9011 }
9012 }
9013 }
9014
9015 m2 = m_copym(m, 0, m->m_len, M_WAIT);
9016 if (m2 == 0) {
9017 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
9018 m_free(m);
9019 lck_mtx_unlock(&ev_pcb->evp_mtx);
9020 lck_rw_done(kev_rwlock);
9021 return (ENOMEM);
9022 }
9023 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
9024 /*
9025 * We use "m" for the socket stats as it would be
9026 * unsafe to use "m2"
9027 */
9028 so_inc_recv_data_stat(ev_pcb->evp_socket,
9029 1, m->m_len, MBUF_TC_BE);
9030
9031 sorwakeup(ev_pcb->evp_socket);
9032 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
9033 } else {
9034 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
9035 }
9036 lck_mtx_unlock(&ev_pcb->evp_mtx);
9037 }
9038 m_free(m);
9039 lck_rw_done(kev_rwlock);
9040
9041 return (0);
9042 }
9043
9044 static int
9045 kev_control(struct socket *so,
9046 u_long cmd,
9047 caddr_t data,
9048 __unused struct ifnet *ifp,
9049 __unused struct proc *p)
9050 {
9051 struct kev_request *kev_req = (struct kev_request *) data;
9052 struct kern_event_pcb *ev_pcb;
9053 struct kev_vendor_code *kev_vendor;
9054 u_int32_t *id_value = (u_int32_t *) data;
9055
9056 switch (cmd) {
9057 case SIOCGKEVID:
9058 *id_value = static_event_id;
9059 break;
9060 case SIOCSKEVFILT:
9061 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9062 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
9063 ev_pcb->evp_class_filter = kev_req->kev_class;
9064 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
9065 break;
9066 case SIOCGKEVFILT:
9067 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
9068 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
9069 kev_req->kev_class = ev_pcb->evp_class_filter;
9070 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
9071 break;
9072 case SIOCGKEVVENDOR:
9073 kev_vendor = (struct kev_vendor_code *)data;
9074 /* Make sure string is NULL terminated */
9075 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
9076 return (net_str_id_find_internal(kev_vendor->vendor_string,
9077 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
9078 default:
9079 return (ENOTSUP);
9080 }
9081
9082 return (0);
9083 }
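
/*
 * Hedged usage sketch (not part of the original file): a userspace client
 * receives these events over a PF_SYSTEM/SYSPROTO_EVENT socket and narrows
 * what it sees with SIOCSKEVFILT (handled by kev_control() above).  The
 * PF_SYSTEM constant and header locations are assumptions about the
 * userspace-visible definitions in <sys/kern_event.h>.
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_ANY_VENDOR,
 *		.kev_class    = KEV_ANY_CLASS,
 *		.kev_subclass = KEV_ANY_SUBCLASS,
 *	};
 *	ioctl(fd, SIOCSKEVFILT, &req);
 *
 *	char buf[1024];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *	struct kern_event_msg *msg = (struct kern_event_msg *)(void *)buf;
 *	(msg->vendor_code, msg->kev_class, msg->event_code and the payload
 *	 at msg->event_data are now usable if n > 0)
 */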
9084
9085 int
9086 kevt_getstat SYSCTL_HANDLER_ARGS
9087 {
9088 #pragma unused(oidp, arg1, arg2)
9089 int error = 0;
9090
9091 lck_rw_lock_shared(kev_rwlock);
9092
9093 if (req->newptr != USER_ADDR_NULL) {
9094 error = EPERM;
9095 goto done;
9096 }
9097 if (req->oldptr == USER_ADDR_NULL) {
9098 req->oldidx = sizeof(struct kevtstat);
9099 goto done;
9100 }
9101
9102 error = SYSCTL_OUT(req, &kevtstat,
9103 MIN(sizeof(struct kevtstat), req->oldlen));
9104 done:
9105 lck_rw_done(kev_rwlock);
9106
9107 return (error);
9108 }
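
/*
 * Hedged usage sketch (not part of the original file): the counters exported
 * by kevt_getstat() can be read from userspace with sysctlbyname().  The MIB
 * name "net.systm.kevt.stats" is inferred from the SYSCTL_NODE/SYSCTL_PROC
 * declarations above; the userspace visibility of struct kevtstat is an
 * assumption.
 *
 *	struct kevtstat st;
 *	size_t len = sizeof(st);
 *	if (sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0) == 0)
 *		printf("posted %llu, dropped on full socket %llu\n",
 *		    (unsigned long long)st.kes_posted,
 *		    (unsigned long long)st.kes_fullsock);
 */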
9109
9110 __private_extern__ int
9111 kevt_pcblist SYSCTL_HANDLER_ARGS
9112 {
9113 #pragma unused(oidp, arg1, arg2)
9114 int error = 0;
9115 int n, i;
9116 struct xsystmgen xsg;
9117 void *buf = NULL;
9118 size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
9119 ROUNDUP64(sizeof (struct xsocket_n)) +
9120 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
9121 ROUNDUP64(sizeof (struct xsockstat_n));
9122 struct kern_event_pcb *ev_pcb;
9123
9124 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
9125 if (buf == NULL)
9126 return (ENOMEM);
9127
9128 lck_rw_lock_shared(kev_rwlock);
9129
9130 n = kevtstat.kes_pcbcount;
9131
9132 if (req->oldptr == USER_ADDR_NULL) {
9133 req->oldidx = (n + n/8) * item_size;
9134 goto done;
9135 }
9136 if (req->newptr != USER_ADDR_NULL) {
9137 error = EPERM;
9138 goto done;
9139 }
9140 bzero(&xsg, sizeof (xsg));
9141 xsg.xg_len = sizeof (xsg);
9142 xsg.xg_count = n;
9143 xsg.xg_gen = kevtstat.kes_gencnt;
9144 xsg.xg_sogen = so_gencnt;
9145 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
9146 if (error) {
9147 goto done;
9148 }
9149 /*
9150 * We are done if there is no pcb
9151 */
9152 if (n == 0) {
9153 goto done;
9154 }
9155
9156 i = 0;
9157 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
9158 i < n && ev_pcb != NULL;
9159 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
9160 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
9161 struct xsocket_n *xso = (struct xsocket_n *)
9162 ADVANCE64(xk, sizeof (*xk));
9163 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
9164 ADVANCE64(xso, sizeof (*xso));
9165 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
9166 ADVANCE64(xsbrcv, sizeof (*xsbrcv));
9167 struct xsockstat_n *xsostats = (struct xsockstat_n *)
9168 ADVANCE64(xsbsnd, sizeof (*xsbsnd));
9169
9170 bzero(buf, item_size);
9171
9172 lck_mtx_lock(&ev_pcb->evp_mtx);
9173
9174 xk->kep_len = sizeof(struct xkevtpcb);
9175 xk->kep_kind = XSO_EVT;
9176 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
9177 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
9178 xk->kep_class_filter = ev_pcb->evp_class_filter;
9179 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
9180
9181 sotoxsocket_n(ev_pcb->evp_socket, xso);
9182 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9183 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
9184 sbtoxsockbuf_n(ev_pcb->evp_socket ?
9185 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
9186 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
9187
9188 lck_mtx_unlock(&ev_pcb->evp_mtx);
9189
9190 error = SYSCTL_OUT(req, buf, item_size);
9191 }
9192
9193 if (error == 0) {
9194 /*
9195 * Give the user an updated idea of our state.
9196 * If the generation differs from what we told
9197 * her before, she knows that something happened
9198 * while we were processing this request, and it
9199 * might be necessary to retry.
9200 */
9201 bzero(&xsg, sizeof (xsg));
9202 xsg.xg_len = sizeof (xsg);
9203 xsg.xg_count = n;
9204 xsg.xg_gen = kevtstat.kes_gencnt;
9205 xsg.xg_sogen = so_gencnt;
9206 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
9207 if (error) {
9208 goto done;
9209 }
9210 }
9211
9212 done:
9213 lck_rw_done(kev_rwlock);
9214
9215 return (error);
9216 }
9217
9218 #endif /* SOCKETS */
9219
9220
9221 int
9222 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
9223 {
9224 struct vinfo_stat * st;
9225
9226 st = &kinfo->kq_stat;
9227
9228 st->vst_size = kq->kq_count;
9229 if (kq->kq_state & KQ_KEV_QOS)
9230 st->vst_blksize = sizeof(struct kevent_qos_s);
9231 else if (kq->kq_state & KQ_KEV64)
9232 st->vst_blksize = sizeof(struct kevent64_s);
9233 else
9234 st->vst_blksize = sizeof(struct kevent);
9235 st->vst_mode = S_IFIFO;
9236 st->vst_ino = (kq->kq_state & KQ_DYNAMIC) ?
9237 ((struct kqworkloop *)kq)->kqwl_dynamicid : 0;
9238
9239 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
9240 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS|KQ_WORKQ|KQ_WORKLOOP)
9241 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
9242
9243 return (0);
9244 }
9245
9246 static int
9247 fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
9248 {
9249 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
9250 struct kqrequest *kqr = &kqwl->kqwl_request;
9251 int err;
9252
9253 if ((kq->kq_state & KQ_WORKLOOP) == 0) {
9254 return EINVAL;
9255 }
9256
9257 if ((err = fill_kqueueinfo(kq, &kqdi->kqdi_info))) {
9258 return err;
9259 }
9260
9261 kqwl_req_lock(kqwl);
9262
9263 if (kqr->kqr_thread) {
9264 kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
9265 }
9266
9267 if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
9268 kqdi->kqdi_owner = ~0ull;
9269 } else {
9270 kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
9271 }
9272
9273 kqdi->kqdi_request_state = kqr->kqr_state;
9274 kqdi->kqdi_async_qos = kqr->kqr_qos_index;
9275 kqdi->kqdi_events_qos = kqr->kqr_override_index;
9276 kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
9277 kqdi->kqdi_sync_waiter_qos = kqr->kqr_dsync_waiters_qos;
9278
9279 kqwl_req_unlock(kqwl);
9280
9281 return 0;
9282 }
9283
9284
9285 void
9286 knote_markstayactive(struct knote *kn)
9287 {
9288 struct kqueue *kq = knote_get_kq(kn);
9289
9290 kqlock(kq);
9291 kn->kn_status |= KN_STAYACTIVE;
9292
9293 /*
9294 * Making a knote stay active is a property of the knote that must be
9295 * established before it is fully attached.
9296 */
9297 assert(kn->kn_status & KN_ATTACHING);
9298
9299 /* handle all stayactive knotes on the (appropriate) manager */
9300 if (kq->kq_state & KQ_WORKQ) {
9301 knote_set_qos_index(kn, KQWQ_QOS_MANAGER);
9302 } else if (kq->kq_state & KQ_WORKLOOP) {
9303 struct kqworkloop *kqwl = (struct kqworkloop *)kq;
9304 kqwl_req_lock(kqwl);
9305 assert(kn->kn_req_index && kn->kn_req_index < THREAD_QOS_LAST);
9306 kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
9307 kn->kn_req_index);
9308 kqwl_req_unlock(kqwl);
9309 knote_set_qos_index(kn, KQWL_BUCKET_STAYACTIVE);
9310 }
9311
9312 knote_activate(kn);
9313 kqunlock(kq);
9314 }
9315
9316 void
9317 knote_clearstayactive(struct knote *kn)
9318 {
9319 kqlock(knote_get_kq(kn));
9320 kn->kn_status &= ~KN_STAYACTIVE;
9321 knote_deactivate(kn);
9322 kqunlock(knote_get_kq(kn));
9323 }
9324
9325 static unsigned long
9326 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
9327 unsigned long buflen, unsigned long nknotes)
9328 {
9329 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
9330 if (kq == knote_get_kq(kn)) {
9331 if (nknotes < buflen) {
9332 struct kevent_extinfo *info = &buf[nknotes];
9333 struct kevent_internal_s *kevp = &kn->kn_kevent;
9334
9335 kqlock(kq);
9336
9337 info->kqext_kev = (struct kevent_qos_s){
9338 .ident = kevp->ident,
9339 .filter = kevp->filter,
9340 .flags = kevp->flags,
9341 .fflags = kevp->fflags,
9342 .data = (int64_t)kevp->data,
9343 .udata = kevp->udata,
9344 .ext[0] = kevp->ext[0],
9345 .ext[1] = kevp->ext[1],
9346 .ext[2] = kevp->ext[2],
9347 .ext[3] = kevp->ext[3],
9348 .qos = kn->kn_req_index,
9349 };
9350 info->kqext_sdata = kn->kn_sdata;
9351 info->kqext_status = kn->kn_status;
9352 info->kqext_sfflags = kn->kn_sfflags;
9353
9354 kqunlock(kq);
9355 }
9356
9357 /* we return total number of knotes, which may be more than requested */
9358 nknotes++;
9359 }
9360 }
9361
9362 return nknotes;
9363 }
9364
9365 int
9366 kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf, uint32_t ubufsize,
9367 int32_t *nkqueues_out)
9368 {
9369 proc_t p = (proc_t)proc;
9370 struct filedesc *fdp = p->p_fd;
9371 unsigned int nkqueues = 0;
9372 unsigned long ubuflen = ubufsize / sizeof(kqueue_id_t);
9373 size_t buflen, bufsize;
9374 kqueue_id_t *kq_ids = NULL;
9375 int err = 0;
9376
9377 assert(p != NULL);
9378
9379 if (ubuf == USER_ADDR_NULL && ubufsize != 0) {
9380 err = EINVAL;
9381 goto out;
9382 }
9383
9384 buflen = min(ubuflen, PROC_PIDDYNKQUEUES_MAX);
9385
9386 if (ubuflen != 0) {
9387 if (os_mul_overflow(sizeof(kqueue_id_t), buflen, &bufsize)) {
9388 err = ERANGE;
9389 goto out;
9390 }
9391 kq_ids = kalloc(bufsize);
9392 assert(kq_ids != NULL);
9393 }
9394
9395 kqhash_lock(p);
9396
9397 if (fdp->fd_kqhashmask > 0) {
9398 for (uint32_t i = 0; i < fdp->fd_kqhashmask + 1; i++) {
9399 struct kqworkloop *kqwl;
9400
9401 SLIST_FOREACH(kqwl, &fdp->fd_kqhash[i], kqwl_hashlink) {
9402 /* report the number of kqueues, even if they don't all fit */
9403 if (nkqueues < buflen) {
9404 kq_ids[nkqueues] = kqwl->kqwl_dynamicid;
9405 }
9406 nkqueues++;
9407 }
9408 }
9409 }
9410
9411 kqhash_unlock(p);
9412
9413 if (kq_ids) {
9414 size_t copysize;
9415 if (os_mul_overflow(sizeof(kqueue_id_t), min(ubuflen, nkqueues), &copysize)) {
9416 err = ERANGE;
9417 goto out;
9418 }
9419
9420 assert(ubufsize >= copysize);
9421 err = copyout(kq_ids, ubuf, copysize);
9422 }
9423
9424 out:
9425 if (kq_ids) {
9426 kfree(kq_ids, bufsize);
9427 }
9428
9429 if (!err) {
9430 *nkqueues_out = (int)min(nkqueues, PROC_PIDDYNKQUEUES_MAX);
9431 }
9432 return err;
9433 }
9434
9435 int
9436 kevent_copyout_dynkqinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9437 uint32_t ubufsize, int32_t *size_out)
9438 {
9439 proc_t p = (proc_t)proc;
9440 struct kqueue *kq;
9441 int err = 0;
9442 struct kqueue_dyninfo kqdi = { };
9443
9444 assert(p != NULL);
9445
9446 if (ubufsize < sizeof(struct kqueue_info)) {
9447 return ENOBUFS;
9448 }
9449
9450 kqhash_lock(p);
9451 kq = kqueue_hash_lookup(p, kq_id);
9452 if (!kq) {
9453 kqhash_unlock(p);
9454 return ESRCH;
9455 }
9456 kqueue_retain(kq);
9457 kqhash_unlock(p);
9458
9459 /*
9460 * backward compatibility: allow the argument to this call to only be
9461 * a struct kqueue_info
9462 */
9463 if (ubufsize >= sizeof(struct kqueue_dyninfo)) {
9464 ubufsize = sizeof(struct kqueue_dyninfo);
9465 err = fill_kqueue_dyninfo(kq, &kqdi);
9466 } else {
9467 ubufsize = sizeof(struct kqueue_info);
9468 err = fill_kqueueinfo(kq, &kqdi.kqdi_info);
9469 }
9470 if (err == 0 && (err = copyout(&kqdi, ubuf, ubufsize)) == 0) {
9471 *size_out = ubufsize;
9472 }
9473 kqueue_release_last(p, kq);
9474 return err;
9475 }
9476
9477 int
9478 kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
9479 uint32_t ubufsize, int32_t *nknotes_out)
9480 {
9481 proc_t p = (proc_t)proc;
9482 struct kqueue *kq;
9483 int err;
9484
9485 assert(p != NULL);
9486
9487 kqhash_lock(p);
9488 kq = kqueue_hash_lookup(p, kq_id);
9489 if (!kq) {
9490 kqhash_unlock(p);
9491 return ESRCH;
9492 }
9493 kqueue_retain(kq);
9494 kqhash_unlock(p);
9495
9496 err = pid_kqueue_extinfo(p, kq, ubuf, ubufsize, nknotes_out);
9497 kqueue_release_last(p, kq);
9498 return err;
9499 }
9500
9501 int
9502 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
9503 uint32_t bufsize, int32_t *retval)
9504 {
9505 struct knote *kn;
9506 int i;
9507 int err = 0;
9508 struct filedesc *fdp = p->p_fd;
9509 unsigned long nknotes = 0;
9510 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
9511 struct kevent_extinfo *kqext = NULL;
9512
9513 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
9514 buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
9515
9516 kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
9517 if (kqext == NULL) {
9518 err = ENOMEM;
9519 goto out;
9520 }
9521 bzero(kqext, buflen * sizeof(struct kevent_extinfo));
9522
9523 proc_fdlock(p);
9524 for (i = 0; i < fdp->fd_knlistsize; i++) {
9525 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
9526 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9527 }
9528 proc_fdunlock(p);
9529
9530 if (fdp->fd_knhashmask != 0) {
9531 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
9532 kqhash_lock(p);
9533 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
9534 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
9535 kqhash_unlock(p);
9536 }
9537 }
9538
9539 assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
9540 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
9541
9542 out:
9543 if (kqext) {
9544 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
9545 kqext = NULL;
9546 }
9547
9548 if (!err) {
9549 *retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
9550 }
9551 return err;
9552 }
9553
9554 static unsigned int
9555 klist_copy_udata(struct klist *list, uint64_t *buf,
9556 unsigned int buflen, unsigned int nknotes)
9557 {
9558 struct kevent_internal_s *kev;
9559 struct knote *kn;
9560 SLIST_FOREACH(kn, list, kn_link) {
9561 if (nknotes < buflen) {
9562 struct kqueue *kq = knote_get_kq(kn);
9563 kqlock(kq);
9564 kev = &(kn->kn_kevent);
9565 buf[nknotes] = kev->udata;
9566 kqunlock(kq);
9567 }
9568 /* we return total number of knotes, which may be more than requested */
9569 nknotes++;
9570 }
9571
9572 return nknotes;
9573 }
9574
9575 static unsigned int
9576 kqlist_copy_dynamicids(__assert_only proc_t p, struct kqlist *list,
9577 uint64_t *buf, unsigned int buflen, unsigned int nids)
9578 {
9579 kqhash_lock_held(p);
9580 struct kqworkloop *kqwl;
9581 SLIST_FOREACH(kqwl, list, kqwl_hashlink) {
9582 if (nids < buflen) {
9583 buf[nids] = kqwl->kqwl_dynamicid;
9584 }
9585 nids++;
9586 }
9587 return nids;
9588 }
9589
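/*
 * Gather the user-visible pointers associated with the process's kevent
 * state: the udata of every knote (fd-indexed and hashed) followed by the
 * dynamic IDs of its workloop kqueues.  Returns the total number of values
 * found, which may exceed the number that fit in `buf`.
 */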
9590 int
9591 kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
9592 {
9593 proc_t p = (proc_t)proc;
9594 struct filedesc *fdp = p->p_fd;
9595 unsigned int nuptrs = 0;
9596 unsigned long buflen = bufsize / sizeof(uint64_t);
9597
9598 if (buflen > 0) {
9599 assert(buf != NULL);
9600 }
9601
9602 proc_fdlock(p);
9603 for (int i = 0; i < fdp->fd_knlistsize; i++) {
9604 nuptrs = klist_copy_udata(&fdp->fd_knlist[i], buf, buflen, nuptrs);
9605 }
9606 knhash_lock(p);
9607 proc_fdunlock(p);
9608 if (fdp->fd_knhashmask != 0) {
9609 for (int i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
9610 nuptrs = klist_copy_udata(&fdp->fd_knhash[i], buf, buflen, nuptrs);
9611 }
9612 }
9613 knhash_unlock(p);
9614
9615 kqhash_lock(p);
9616 if (fdp->fd_kqhashmask != 0) {
9617 for (int i = 0; i < (int)fdp->fd_kqhashmask + 1; i++) {
9618 nuptrs = kqlist_copy_dynamicids(p, &fdp->fd_kqhash[i], buf, buflen,
9619 nuptrs);
9620 }
9621 }
9622 kqhash_unlock(p);
9623
9624 return (int)nuptrs;
9625 }
9626
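/*
 * Ask the pthread shims (workq_threadreq) to re-drive the process's
 * workqueue thread request; 0 and ECANCELED are the only outcomes the
 * assert below treats as expected.
 */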
9627 static void
9628 kevent_redrive_proc_thread_request(proc_t p)
9629 {
9630 __assert_only int ret;
9631 ret = (*pthread_functions->workq_threadreq)(p, NULL, WORKQ_THREADREQ_REDRIVE, 0, 0);
9632 assert(ret == 0 || ret == ECANCELED);
9633 }
9634
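/*
 * Publish "return to kernel" flags into the thread's return-to-kernel TSD
 * slot (thread_rettokern_addr()) so userspace can observe that events are
 * pending on the workq or workloop kqueue the thread is bound to.  The
 * value is written with the word width of the target process (4 or 8
 * bytes).
 */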
9635 static void
9636 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
9637 {
9638 uint64_t ast_addr;
9639 bool proc_is_64bit = !!(p->p_flag & P_LP64);
9640 size_t user_addr_size = proc_is_64bit ? 8 : 4;
9641 uint32_t ast_flags32 = 0;
9642 uint64_t ast_flags64 = 0;
9643 struct uthread *ut = get_bsdthread_info(thread);
9644
9645 if (ut->uu_kqueue_bound != NULL) {
9646 if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKLOOP) {
9647 ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
9648 } else if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ) {
9649 ast_flags64 |= R2K_WORKQ_PENDING_EVENTS;
9650 }
9651 }
9652
9653 if (ast_flags64 == 0) {
9654 return;
9655 }
9656
9657 if (!(p->p_flag & P_LP64)) {
9658 ast_flags32 = (uint32_t)ast_flags64;
9659 assert(ast_flags64 < 0x100000000ull);
9660 }
9661
9662 ast_addr = thread_rettokern_addr(thread);
9663 if (ast_addr == 0) {
9664 return;
9665 }
9666
9667 if (copyout((proc_is_64bit ? (void *)&ast_flags64 : (void *)&ast_flags32),
9668 (user_addr_t)ast_addr,
9669 user_addr_size) != 0) {
9670 printf("pid %d (tid:%llu): copyout of return_to_kernel ast flags failed with "
9671 "ast_addr = %llu\n", p->p_pid, thread_tid(current_thread()), ast_addr);
9672 }
9673 }
9674
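/*
 * AST handler for kevent work deferred until the thread returns to user
 * space: redrive the workqueue thread request and/or publish the
 * pending-events flags, depending on which bits are set.
 */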
9675 void
9676 kevent_ast(thread_t thread, uint16_t bits)
9677 {
9678 proc_t p = current_proc();
9679
9680 if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
9681 kevent_redrive_proc_thread_request(p);
9682 }
9683 if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
9684 kevent_set_return_to_kernel_user_tsd(p, thread);
9685 }
9686 }
9687
9688 #if DEVELOPMENT || DEBUG
9689
9690 #define KEVENT_SYSCTL_BOUND_ID 1
9691
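/*
 * Debug-only sysctl handler: report the dynamic ID of the kqueue the
 * current thread is bound to, (uint64_t)-1 if it is bound to the workq
 * kqueue, or 0 if it is not bound at all.  Read-only; writes are rejected.
 */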
9692 static int
9693 kevent_sysctl SYSCTL_HANDLER_ARGS
9694 {
9695 #pragma unused(oidp, arg2)
9696 uintptr_t type = (uintptr_t)arg1;
9697 uint64_t bound_id = 0;
9698 struct uthread *ut;
9699 struct kqueue *kq;
9700
9701 if (type != KEVENT_SYSCTL_BOUND_ID) {
9702 return EINVAL;
9703 }
9704
9705 if (req->newptr) {
9706 return EINVAL;
9707 }
9708
9709 ut = get_bsdthread_info(current_thread());
9710 if (!ut) {
9711 return EFAULT;
9712 }
9713
9714 kq = ut->uu_kqueue_bound;
9715 if (kq) {
9716 if (kq->kq_state & KQ_WORKLOOP) {
9717 bound_id = ((struct kqworkloop *)kq)->kqwl_dynamicid;
9718 } else if (kq->kq_state & KQ_WORKQ) {
9719 bound_id = -1;
9720 }
9721 }
9722
9723 return sysctl_io_number(req, bound_id, sizeof(bound_id), NULL, NULL);
9724 }
9725
9726 SYSCTL_NODE(_kern, OID_AUTO, kevent, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
9727 "kevent information");
9728
9729 SYSCTL_PROC(_kern_kevent, OID_AUTO, bound_id,
9730 CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
9731 (void *)KEVENT_SYSCTL_BOUND_ID,
9732 sizeof(kqueue_id_t), kevent_sysctl, "Q",
9733 "get the ID of the bound kqueue");
9734
9735 #endif /* DEVELOPMENT || DEBUG */