apple/xnu (xnu-3789.1.32) - bsd/kern/kern_event.c
1 /*
2 * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/filedesc.h>
62 #include <sys/kernel.h>
63 #include <sys/proc_internal.h>
64 #include <sys/kauth.h>
65 #include <sys/malloc.h>
66 #include <sys/unistd.h>
67 #include <sys/file_internal.h>
68 #include <sys/fcntl.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/uio.h>
79 #include <sys/sysproto.h>
80 #include <sys/user.h>
81 #include <sys/vnode_internal.h>
82 #include <string.h>
83 #include <sys/proc_info.h>
84 #include <sys/codesign.h>
85 #include <sys/pthread_shims.h>
86
87 #include <kern/locks.h>
88 #include <kern/clock.h>
89 #include <kern/policy_internal.h>
90 #include <kern/thread_call.h>
91 #include <kern/sched_prim.h>
92 #include <kern/waitq.h>
93 #include <kern/zalloc.h>
94 #include <kern/kalloc.h>
95 #include <kern/assert.h>
96
97 #include <machine/spl.h>
98
99 #include <libkern/libkern.h>
100 #include "net/net_str_id.h"
101
102 #include <mach/task.h>
103
104 #if CONFIG_MEMORYSTATUS
105 #include <sys/kern_memorystatus.h>
106 #endif
107
108 /*
109 * JMM - this typedef needs to be unified with pthread_priority_t
110 * and mach_msg_priority_t. It also needs to be the same type
111 * everywhere.
112 */
113 typedef int32_t qos_t;
114
115 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
116
117 #define KQ_EVENT NO_EVENT64
118
119 static inline void kqlock(struct kqueue *kq);
120 static inline void kqunlock(struct kqueue *kq);
121
122 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
123 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
124 static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn);
125 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int defer_drop);
126
127 static int kqueue_read(struct fileproc *fp, struct uio *uio,
128 int flags, vfs_context_t ctx);
129 static int kqueue_write(struct fileproc *fp, struct uio *uio,
130 int flags, vfs_context_t ctx);
131 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
132 vfs_context_t ctx);
133 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
134 vfs_context_t ctx);
135 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
136 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
137 vfs_context_t ctx);
138 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
139
140 static const struct fileops kqueueops = {
141 .fo_type = DTYPE_KQUEUE,
142 .fo_read = kqueue_read,
143 .fo_write = kqueue_write,
144 .fo_ioctl = kqueue_ioctl,
145 .fo_select = kqueue_select,
146 .fo_close = kqueue_close,
147 .fo_kqfilter = kqueue_kqfilter,
148 .fo_drain = kqueue_drain,
149 };
150
151 static int kevent_internal(struct proc *p, int fd,
152 user_addr_t changelist, int nchanges,
153 user_addr_t eventlist, int nevents,
154 user_addr_t data_out, uint64_t data_available,
155 unsigned int flags, user_addr_t utimeout,
156 kqueue_continue_t continuation,
157 int32_t *retval);
158 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
159 struct proc *p, unsigned int flags);
160 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
161 struct proc *p, unsigned int flags);
162 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
163
164 static void kqueue_interrupt(struct kqueue *kq);
165 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
166 void *data);
167 static void kevent_continue(struct kqueue *kq, void *data, int error);
168 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
169 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
170 struct filt_process_s *process_data, kq_index_t servicer_qos_index,
171 int *countp, struct proc *p);
172 static int kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags);
173 static void kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags);
174 static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index);
175 static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index);
176 static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
177
178 static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index);
179
180 static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index);
181 static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index, uint32_t type);
182 static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index);
183 static void kqworkq_bind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
184 static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
185 static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
186
187
188 static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
189 struct filt_process_s *process_data, struct proc *p);
190 #if 0
191 static void knote_put(struct knote *kn);
192 #endif
193
194 static int knote_fdadd(struct knote *kn, struct proc *p);
195 static void knote_fdremove(struct knote *kn, struct proc *p);
196 static struct knote *knote_fdfind(struct kqueue *kq, struct kevent_internal_s *kev, struct proc *p);
197
198 static void knote_drop(struct knote *kn, struct proc *p);
199 static struct knote *knote_alloc(void);
200 static void knote_free(struct knote *kn);
201
202 static void knote_activate(struct knote *kn);
203 static void knote_deactivate(struct knote *kn);
204
205 static void knote_enable(struct knote *kn);
206 static void knote_disable(struct knote *kn);
207
208 static int knote_enqueue(struct knote *kn);
209 static void knote_dequeue(struct knote *kn);
210
211 static void knote_suppress(struct knote *kn);
212 static void knote_unsuppress(struct knote *kn);
213 static void knote_wakeup(struct knote *kn);
214
215 static kq_index_t knote_get_queue_index(struct knote *kn);
216 static struct kqtailq *knote_get_queue(struct knote *kn);
217 static struct kqtailq *knote_get_suppressed_queue(struct knote *kn);
218 static kq_index_t knote_get_req_index(struct knote *kn);
219 static kq_index_t knote_get_qos_index(struct knote *kn);
220 static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index);
221 static kq_index_t knote_get_qos_override_index(struct knote *kn);
222 static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index);
223
224 static int filt_fileattach(struct knote *kn);
225 static struct filterops file_filtops = {
226 .f_isfd = 1,
227 .f_attach = filt_fileattach,
228 };
229
230 static void filt_kqdetach(struct knote *kn);
231 static int filt_kqueue(struct knote *kn, long hint);
232 static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev);
233 static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
234 static struct filterops kqread_filtops = {
235 .f_isfd = 1,
236 .f_detach = filt_kqdetach,
237 .f_event = filt_kqueue,
238 .f_touch = filt_kqtouch,
239 .f_process = filt_kqprocess,
240 };
241
242 /* placeholder for not-yet-implemented filters */
243 static int filt_badattach(struct knote *kn);
244 static struct filterops bad_filtops = {
245 .f_attach = filt_badattach,
246 };
247
248 static int filt_procattach(struct knote *kn);
249 static void filt_procdetach(struct knote *kn);
250 static int filt_proc(struct knote *kn, long hint);
251 static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev);
252 static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
253 static struct filterops proc_filtops = {
254 .f_attach = filt_procattach,
255 .f_detach = filt_procdetach,
256 .f_event = filt_proc,
257 .f_touch = filt_proctouch,
258 .f_process = filt_procprocess,
259 };
260
261 #if CONFIG_MEMORYSTATUS
262 extern struct filterops memorystatus_filtops;
263 #endif /* CONFIG_MEMORYSTATUS */
264
265 extern struct filterops fs_filtops;
266
267 extern struct filterops sig_filtops;
268
269 /* Timer filter */
270 static int filt_timerattach(struct knote *kn);
271 static void filt_timerdetach(struct knote *kn);
272 static int filt_timer(struct knote *kn, long hint);
273 static int filt_timertouch(struct knote *kn, struct kevent_internal_s *kev);
274 static int filt_timerprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
275 static struct filterops timer_filtops = {
276 .f_attach = filt_timerattach,
277 .f_detach = filt_timerdetach,
278 .f_event = filt_timer,
279 .f_touch = filt_timertouch,
280 .f_process = filt_timerprocess,
281 };
282
283 /* Helpers */
284 static void filt_timerexpire(void *knx, void *param1);
285 static int filt_timervalidate(struct knote *kn);
286 static void filt_timerupdate(struct knote *kn, int num_fired);
287 static void filt_timercancel(struct knote *kn);
288
289 #define TIMER_RUNNING 0x1
290 #define TIMER_CANCELWAIT 0x2
291
292 static lck_mtx_t _filt_timerlock;
293 static void filt_timerlock(void);
294 static void filt_timerunlock(void);
295
296 static zone_t knote_zone;
297 static zone_t kqfile_zone;
298 static zone_t kqworkq_zone;
299
300 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
301
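/*
 * For example (arithmetic only): KN_HASH(0x1234, 0xff) folds the upper byte
 * into the lower one: (0x1234 ^ (0x1234 >> 8)) & 0xff = (0x1234 ^ 0x12) & 0xff = 0x26.
 */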
302 #if 0
303 extern struct filterops aio_filtops;
304 #endif
305
306 /* Mach portset filter */
307 extern struct filterops machport_filtops;
308
309 /* User filter */
310 static int filt_userattach(struct knote *kn);
311 static void filt_userdetach(struct knote *kn);
312 static int filt_user(struct knote *kn, long hint);
313 static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev);
314 static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
315 static struct filterops user_filtops = {
316 .f_attach = filt_userattach,
317 .f_detach = filt_userdetach,
318 .f_event = filt_user,
319 .f_touch = filt_usertouch,
320 .f_process = filt_userprocess,
321 };
322
323 static lck_spin_t _filt_userlock;
324 static void filt_userlock(void);
325 static void filt_userunlock(void);
326
327 extern struct filterops pipe_rfiltops;
328 extern struct filterops pipe_wfiltops;
329 extern struct filterops ptsd_kqops;
330 extern struct filterops soread_filtops;
331 extern struct filterops sowrite_filtops;
332 extern struct filterops sock_filtops;
333 extern struct filterops soexcept_filtops;
334 extern struct filterops spec_filtops;
335 extern struct filterops bpfread_filtops;
336 extern struct filterops necp_fd_rfiltops;
337 extern struct filterops skywalk_channel_rfiltops;
338 extern struct filterops skywalk_channel_wfiltops;
339 extern struct filterops fsevent_filtops;
340 extern struct filterops vnode_filtops;
341
342 /*
343 *
344 * Rules for adding new filters to the system:
345 * Public filters:
346 * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
347 * in the exported section of the header
348 * - Update the EVFILT_SYSCOUNT value to reflect the new addition
349 * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
350 * of the Public Filters section in the array.
351 * Private filters:
352 * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
353 * in the XNU_KERNEL_PRIVATE section of the header
354 * - Update the EVFILTID_MAX value to reflect the new addition
355 * - Add a filterops to the sysfilt_ops array. Private filters should be added at the end of
356 * the Private filters section of the array (an illustrative entry is sketched just after the array below).
357 */
358 static struct filterops *sysfilt_ops[EVFILTID_MAX] = {
359 /* Public Filters */
360 [~EVFILT_READ] = &file_filtops,
361 [~EVFILT_WRITE] = &file_filtops,
362 [~EVFILT_AIO] = &bad_filtops,
363 [~EVFILT_VNODE] = &file_filtops,
364 [~EVFILT_PROC] = &proc_filtops,
365 [~EVFILT_SIGNAL] = &sig_filtops,
366 [~EVFILT_TIMER] = &timer_filtops,
367 [~EVFILT_MACHPORT] = &machport_filtops,
368 [~EVFILT_FS] = &fs_filtops,
369 [~EVFILT_USER] = &user_filtops,
370 &bad_filtops,
371 &bad_filtops,
372 [~EVFILT_SOCK] = &file_filtops,
373 #if CONFIG_MEMORYSTATUS
374 [~EVFILT_MEMORYSTATUS] = &memorystatus_filtops,
375 #else
376 [~EVFILT_MEMORYSTATUS] = &bad_filtops,
377 #endif
378 [~EVFILT_EXCEPT] = &file_filtops,
379
380 /* Private filters */
381 [EVFILTID_KQREAD] = &kqread_filtops,
382 [EVFILTID_PIPE_R] = &pipe_rfiltops,
383 [EVFILTID_PIPE_W] = &pipe_wfiltops,
384 [EVFILTID_PTSD] = &ptsd_kqops,
385 [EVFILTID_SOREAD] = &soread_filtops,
386 [EVFILTID_SOWRITE] = &sowrite_filtops,
387 [EVFILTID_SCK] = &sock_filtops,
388 [EVFILTID_SOEXCEPT] = &soexcept_filtops,
389 [EVFILTID_SPEC] = &spec_filtops,
390 [EVFILTID_BPFREAD] = &bpfread_filtops,
391 [EVFILTID_NECP_FD] = &necp_fd_rfiltops,
392 [EVFILTID_FSEVENT] = &fsevent_filtops,
393 [EVFILTID_VN] = &vnode_filtops
394 };
395
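/*
 * Illustrative sketch of the private-filter rules above; the filter id and
 * filterops names below are hypothetical, not part of xnu:
 *
 *	// In bsd/sys/event.h (XNU_KERNEL_PRIVATE section): define the new id
 *	// after the existing EVFILTID_* values and bump EVFILTID_MAX.
 *	#define EVFILTID_EXAMPLE	(EVFILTID_VN + 1)
 *
 *	// Then append an entry at the end of the Private filters section
 *	// of sysfilt_ops above:
 *	[EVFILTID_EXAMPLE] = &example_filtops,
 */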
396 /* waitq prepost callback */
397 void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
398
399 #ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
400 #define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
401 #endif
402 #ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
403 #define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG 0x80000000 /* request overcommit threads */
404 #endif
405 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
406 #define _PTHREAD_PRIORITY_QOS_CLASS_MASK 0x003fff00 /* QoS class mask */
407 #endif
408 #ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32
409 #define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8
410 #endif
411
412 static inline
413 qos_t canonicalize_kevent_qos(qos_t qos)
414 {
415 unsigned long canonical;
416
417 /* preserve manager and overcommit flags in this case */
418 canonical = pthread_priority_canonicalize(qos, FALSE);
419 return (qos_t)canonical;
420 }
421
422 static inline
423 kq_index_t qos_index_from_qos(qos_t qos, boolean_t propagation)
424 {
425 kq_index_t qos_index;
426 unsigned long flags = 0;
427
428 qos_index = (kq_index_t)thread_qos_from_pthread_priority(
429 (unsigned long)qos, &flags);
430
431 if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG))
432 return KQWQ_QOS_MANAGER;
433
434 return qos_index;
435 }
436
437 static inline
438 qos_t qos_from_qos_index(kq_index_t qos_index)
439 {
440 if (qos_index == KQWQ_QOS_MANAGER)
441 return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
442
443 if (qos_index == 0)
444 return 0; /* Unspecified */
445
446 /* Should have support from the pthread kext */
447 return (1 << (qos_index - 1 +
448 _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32));
449 }
450
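/*
 * For example (arithmetic only): qos_index 3 encodes to
 * 1 << (3 - 1 + _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32) = 1 << 10 = 0x0400,
 * a single bit within _PTHREAD_PRIORITY_QOS_CLASS_MASK (0x003fff00).
 */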
451 static inline
452 kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
453 {
454 kq_index_t qos_index;
455
456 if (flags & KEVENT_FLAG_WORKQ_MANAGER)
457 return KQWQ_QOS_MANAGER;
458
459 /*
460 * If the caller didn't pass in a class (legacy pthread kext)
461 * then we use the thread policy QoS of the current thread.
462 */
463 assert(qos_class != -1);
464 if (qos_class == -1)
465 qos_index = proc_get_thread_policy(thread,
466 TASK_POLICY_ATTRIBUTE,
467 TASK_POLICY_QOS);
468 else
469 qos_index = (kq_index_t)qos_class;
470
471 assert(qos_index > 0 && qos_index < KQWQ_NQOS);
472
473 return qos_index;
474 }
475
476 /*
477 * kqueue/note lock implementations
478 *
479 * The kqueue lock guards the kq state, the state of its queues,
480 * and the kqueue-aware status and use counts of individual knotes.
481 *
482 * The kqueue workq lock is used to protect state guarding the
483 * interaction of the kqueue with the workq. This state cannot
484 * be guarded by the kq lock - as it needs to be taken when we
485 * already have the waitq set lock held (during the waitq hook
486 * callback). It might be better to use the waitq lock itself
487 * for this, but the IRQ requirements make that difficult.
488 *
489 * Knote flags, filter flags, and associated data are protected
490 * by the underlying object lock - and are only ever looked at
491 * by calling the filter to get a [consistent] snapshot of that
492 * data.
493 */
494 lck_grp_attr_t * kq_lck_grp_attr;
495 lck_grp_t * kq_lck_grp;
496 lck_attr_t * kq_lck_attr;
497
498 static inline void
499 kqlock(struct kqueue *kq)
500 {
501 lck_spin_lock(&kq->kq_lock);
502 }
503
504 static inline void
505 kqunlock(struct kqueue *kq)
506 {
507 lck_spin_unlock(&kq->kq_lock);
508 }
509
510
511 /*
512 * Convert a kq lock to a knote use reference.
513 *
514 * If the knote is being dropped, or has
515 * vanished, we can't get a use reference.
516 * Just return with it still locked.
517 *
518 * - kq locked at entry
519 * - unlock on exit if we get the use reference
520 */
521 static int
522 kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
523 {
524 if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
525 return (0);
526
527 assert(kn->kn_status & KN_ATTACHED);
528 kn->kn_inuse++;
529 kqunlock(kq);
530 return (1);
531 }
532
533
534 /*
535 * Convert from a knote use reference back to kq lock.
536 *
537 * Drop a use reference and wake any waiters if
538 * this is the last one.
539 *
540 * If someone is trying to drop the knote, but the
541 * caller has events they must deliver, take
542 * responsibility for the drop later - and wake the
543 * other attempted dropper in a manner that informs
544 * it of the transfer of responsibility.
545 *
546 * The exit return indicates if the knote is still alive
547 * (or if not, the other dropper has been given the green
548 * light to drop it).
549 *
550 * The kqueue lock is re-taken unconditionally.
551 */
552 static int
553 knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int steal_drop)
554 {
555 int dropped = 0;
556
557 kqlock(kq);
558 if (--kn->kn_inuse == 0) {
559
560 if ((kn->kn_status & KN_ATTACHING) != 0) {
561 kn->kn_status &= ~KN_ATTACHING;
562 }
563
564 if ((kn->kn_status & KN_USEWAIT) != 0) {
565 wait_result_t result;
566
567 /* If we need to, try and steal the drop */
568 if (kn->kn_status & KN_DROPPING) {
569 if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) {
570 kn->kn_status |= KN_STOLENDROP;
571 } else {
572 dropped = 1;
573 }
574 }
575
576 /* wakeup indicating if ANY USE stole the drop */
577 result = (kn->kn_status & KN_STOLENDROP) ?
578 THREAD_RESTART : THREAD_AWAKENED;
579
580 kn->kn_status &= ~KN_USEWAIT;
581 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
582 CAST_EVENT64_T(&kn->kn_status),
583 result,
584 WAITQ_ALL_PRIORITIES);
585 } else {
586 /* should have seen use-wait if dropping with use refs */
587 assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0);
588 }
589
590 } else if (kn->kn_status & KN_DROPPING) {
591 /* not the last ref but want to steal a drop if present */
592 if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) {
593 kn->kn_status |= KN_STOLENDROP;
594
595 /* but we now have to wait to be the last ref */
596 kn->kn_status |= KN_USEWAIT;
597 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
598 CAST_EVENT64_T(&kn->kn_status),
599 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
600 kqunlock(kq);
601 thread_block(THREAD_CONTINUE_NULL);
602 kqlock(kq);
603 } else {
604 dropped = 1;
605 }
606 }
607
608 return (!dropped);
609 }
610
611 /*
612 * Convert a kq lock to a knote use reference
613 * (for the purpose of detaching AND vanishing it).
614 *
615 * If the knote is being dropped, we can't get
616 * a detach reference, so wait for the knote to
617 * finish dropping before returning.
618 *
619 * If the knote is being used for other purposes,
620 * we cannot detach it until those uses are done
621 * as well. Again, just wait for them to finish
622 * (caller will start over at lookup).
623 *
624 * - kq locked at entry
625 * - unlocked on exit
626 */
627 static int
628 kqlock2knotedetach(struct kqueue *kq, struct knote *kn)
629 {
630 if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) {
631 /* have to wait for dropper or current uses to go away */
632 kn->kn_status |= KN_USEWAIT;
633 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
634 CAST_EVENT64_T(&kn->kn_status),
635 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
636 kqunlock(kq);
637 thread_block(THREAD_CONTINUE_NULL);
638 return (0);
639 }
640 assert((kn->kn_status & KN_VANISHED) == 0);
641 assert(kn->kn_status & KN_ATTACHED);
642 kn->kn_status &= ~KN_ATTACHED;
643 kn->kn_status |= KN_VANISHED;
644 kn->kn_inuse++;
645 kqunlock(kq);
646 return (1);
647 }
648
649 /*
650 * Convert a kq lock to a knote drop reference.
651 *
652 * If the knote is in use, wait for the use count
653 * to subside. We first mark our intention to drop
654 * it - keeping other users from "piling on."
655 * If we are too late, we have to wait for the
656 * other drop to complete.
657 *
658 * - kq locked at entry
659 * - always unlocked on exit.
660 * - caller can't hold any locks that would prevent
661 * the other dropper from completing.
662 */
663 static int
664 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
665 {
666 int oktodrop;
667 wait_result_t result;
668
669 oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
670 /* if another thread is attaching, they will become the dropping thread */
671 kn->kn_status |= KN_DROPPING;
672 knote_unsuppress(kn);
673 knote_dequeue(kn);
674 if (oktodrop) {
675 if (kn->kn_inuse == 0) {
676 kqunlock(kq);
677 return (oktodrop);
678 }
679 }
680 kn->kn_status |= KN_USEWAIT;
681 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
682 CAST_EVENT64_T(&kn->kn_status),
683 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
684 kqunlock(kq);
685 result = thread_block(THREAD_CONTINUE_NULL);
686 /* THREAD_RESTART == another thread stole the knote drop */
687 return (result == THREAD_AWAKENED);
688 }
689
690 #if 0
691 /*
692 * Release a knote use count reference.
693 */
694 static void
695 knote_put(struct knote *kn)
696 {
697 struct kqueue *kq = knote_get_kq(kn);
698
699 kqlock(kq);
700 if (--kn->kn_inuse == 0) {
701 if ((kn->kn_status & KN_USEWAIT) != 0) {
702 kn->kn_status &= ~KN_USEWAIT;
703 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
704 CAST_EVENT64_T(&kn->kn_status),
705 THREAD_AWAKENED,
706 WAITQ_ALL_PRIORITIES);
707 }
708 }
709 kqunlock(kq);
710 }
711 #endif
712
713 static int
714 filt_fileattach(struct knote *kn)
715 {
716 return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
717 }
718
719 #define f_flag f_fglob->fg_flag
720 #define f_msgcount f_fglob->fg_msgcount
721 #define f_cred f_fglob->fg_cred
722 #define f_ops f_fglob->fg_ops
723 #define f_offset f_fglob->fg_offset
724 #define f_data f_fglob->fg_data
725
726 static void
727 filt_kqdetach(struct knote *kn)
728 {
729 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
730 struct kqueue *kq = &kqf->kqf_kqueue;
731
732 kqlock(kq);
733 KNOTE_DETACH(&kqf->kqf_sel.si_note, kn);
734 kqunlock(kq);
735 }
736
737 /*ARGSUSED*/
738 static int
739 filt_kqueue(struct knote *kn, __unused long hint)
740 {
741 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
742 int count;
743
744 count = kq->kq_count;
745 return (count > 0);
746 }
747
748 static int
749 filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
750 {
751 #pragma unused(kev)
752 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
753 int res;
754
755 kqlock(kq);
756 kn->kn_data = kq->kq_count;
757 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
758 kn->kn_udata = kev->udata;
759 res = (kn->kn_data > 0);
760
761 kqunlock(kq);
762
763 return res;
764 }
765
766 static int
767 filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
768 {
769 #pragma unused(data)
770 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
771 int res;
772
773 kqlock(kq);
774 kn->kn_data = kq->kq_count;
775 res = (kn->kn_data > 0);
776 if (res) {
777 *kev = kn->kn_kevent;
778 if (kn->kn_flags & EV_CLEAR)
779 kn->kn_data = 0;
780 }
781 kqunlock(kq);
782
783 return res;
784 }
785
786 static int
787 filt_procattach(struct knote *kn)
788 {
789 struct proc *p;
790
791 assert(PID_MAX < NOTE_PDATAMASK);
792
793 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
794 kn->kn_flags = EV_ERROR;
795 kn->kn_data = ENOTSUP;
796 return 0;
797 }
798
799 p = proc_find(kn->kn_id);
800 if (p == NULL) {
801 kn->kn_flags = EV_ERROR;
802 kn->kn_data = ESRCH;
803 return 0;
804 }
805
806 const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
807
808 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
809 do {
810 pid_t selfpid = proc_selfpid();
811
812 if (p->p_ppid == selfpid)
813 break; /* parent => ok */
814
815 if ((p->p_lflag & P_LTRACED) != 0 &&
816 (p->p_oppid == selfpid))
817 break; /* parent-in-waiting => ok */
818
819 proc_rele(p);
820 kn->kn_flags = EV_ERROR;
821 kn->kn_data = EACCES;
822 return 0;
823 } while (0);
824
825 proc_klist_lock();
826
827 kn->kn_ptr.p_proc = p; /* store the proc handle */
828
829 KNOTE_ATTACH(&p->p_klist, kn);
830
831 proc_klist_unlock();
832
833 proc_rele(p);
834
835 /*
836 * The knote only captures edge-triggered events after this point,
837 * so it can't already have fired.
838 */
839 return (0);
840 }
841
842
843 /*
844 * The knote may be attached to a different process, which may exit,
845 * leaving nothing for the knote to be attached to. In that case,
846 * the pointer to the process will have already been nulled out.
847 */
848 static void
849 filt_procdetach(struct knote *kn)
850 {
851 struct proc *p;
852
853 proc_klist_lock();
854
855 p = kn->kn_ptr.p_proc;
856 if (p != PROC_NULL) {
857 kn->kn_ptr.p_proc = PROC_NULL;
858 KNOTE_DETACH(&p->p_klist, kn);
859 }
860
861 proc_klist_unlock();
862 }
863
864 static int
865 filt_proc(struct knote *kn, long hint)
866 {
867 u_int event;
868
869 /* ALWAYS CALLED WITH proc_klist_lock */
870
871 /*
872 * Note: a lot of bits in hint may be obtained from the knote.
873 * To free some of those bits, see <rdar://problem/12592988> Freeing up
874 * bits in hint for filt_proc
875 *
876 * mask off extra data
877 */
878 event = (u_int)hint & NOTE_PCTRLMASK;
879
880 /*
881 * termination lifecycle events can happen while a debugger
882 * has reparented a process, in which case notifications
883 * should be quashed except to the tracing parent. When
884 * the debugger reaps the child (either via wait4(2) or
885 * process exit), the child will be reparented to the original
886 * parent and these knotes re-fired.
887 */
888 if (event & NOTE_EXIT) {
889 if ((kn->kn_ptr.p_proc->p_oppid != 0)
890 && (knote_get_kq(kn)->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
891 /*
892 * This knote is not for the current ptrace(2) parent, ignore.
893 */
894 return 0;
895 }
896 }
897
898 /*
899 * if the user is interested in this event, record it.
900 */
901 if (kn->kn_sfflags & event)
902 kn->kn_fflags |= event;
903
904 #pragma clang diagnostic push
905 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
906 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
907 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
908 }
909 #pragma clang diagnostic pop
910
911
912 /*
913 * The kernel has a wrapper in place that returns the same data
914 * as is collected here, in kn_data. Any changes to how
915 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
916 * should also be reflected in the proc_pidnoteexit() wrapper.
917 */
918 if (event == NOTE_EXIT) {
919 kn->kn_data = 0;
920 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
921 kn->kn_fflags |= NOTE_EXITSTATUS;
922 kn->kn_data |= (hint & NOTE_PDATAMASK);
923 }
924 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
925 kn->kn_fflags |= NOTE_EXIT_DETAIL;
926 if ((kn->kn_ptr.p_proc->p_lflag &
927 P_LTERM_DECRYPTFAIL) != 0) {
928 kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
929 }
930 if ((kn->kn_ptr.p_proc->p_lflag &
931 P_LTERM_JETSAM) != 0) {
932 kn->kn_data |= NOTE_EXIT_MEMORY;
933 switch (kn->kn_ptr.p_proc->p_lflag & P_JETSAM_MASK) {
934 case P_JETSAM_VMPAGESHORTAGE:
935 kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
936 break;
937 case P_JETSAM_VMTHRASHING:
938 kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
939 break;
940 case P_JETSAM_FCTHRASHING:
941 kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
942 break;
943 case P_JETSAM_VNODE:
944 kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
945 break;
946 case P_JETSAM_HIWAT:
947 kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
948 break;
949 case P_JETSAM_PID:
950 kn->kn_data |= NOTE_EXIT_MEMORY_PID;
951 break;
952 case P_JETSAM_IDLEEXIT:
953 kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
954 break;
955 }
956 }
957 if ((kn->kn_ptr.p_proc->p_csflags &
958 CS_KILLED) != 0) {
959 kn->kn_data |= NOTE_EXIT_CSERROR;
960 }
961 }
962 }
963
964 /* if we have any matching state, activate the knote */
965 return (kn->kn_fflags != 0);
966 }
967
968 static int
969 filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
970 {
971 int res;
972
973 proc_klist_lock();
974
975 /* accept new filter flags and mask off output events no longer interesting */
976 kn->kn_sfflags = kev->fflags;
977 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
978 kn->kn_udata = kev->udata;
979
980 /* restrict the current results to the (smaller?) set of new interest */
981 /*
982 * For compatibility with previous implementations, we leave kn_fflags
983 * as they were before.
984 */
985 //kn->kn_fflags &= kn->kn_sfflags;
986
987 res = (kn->kn_fflags != 0);
988
989 proc_klist_unlock();
990
991 return res;
992 }
993
994 static int
995 filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
996 {
997 #pragma unused(data)
998 int res;
999
1000 proc_klist_lock();
1001 res = (kn->kn_fflags != 0);
1002 if (res) {
1003 *kev = kn->kn_kevent;
1004 kn->kn_flags |= EV_CLEAR; /* automatically set */
1005 kn->kn_fflags = 0;
1006 kn->kn_data = 0;
1007 }
1008 proc_klist_unlock();
1009 return res;
1010 }
1011
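/*
 * Userspace sketch (illustrative only; the helper name and error handling are
 * not part of any API): registering an EVFILT_PROC knote for a child process.
 * With NOTE_EXITSTATUS requested, the wait(2)-style status is delivered in
 * kev.data, as collected by filt_proc() above; the permission check in
 * filt_procattach() expects the caller to be the (possibly tracing) parent.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>

static int
watch_child_exit(pid_t child)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return -1;

	/* one-shot registration: fire when the child exits */
	EV_SET(&kev, child, EVFILT_PROC, EV_ADD | EV_ONESHOT,
	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return -1;

	/* block until the exit event is delivered */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("pid %d exited, status 0x%llx\n", (int)child,
		    (unsigned long long)kev.data);
	return 0;
}
#endif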
1012 /*
1013 * filt_timervalidate - process data from user
1014 *
1015 * Converts to either interval or deadline format.
1016 *
1017 * The saved-data field in the knote contains the
1018 * time value. The saved filter-flags indicates
1019 * the unit of measurement.
1020 *
1021 * After validation, either the saved-data field
1022 * contains the interval in absolute time, or ext[0]
1023 * contains the expected deadline. If that deadline
1024 * is in the past, ext[0] is 0.
1025 *
1026 * Returns EINVAL for unrecognized units of time.
1027 *
1028 * Timer filter lock is held.
1029 *
1030 */
1031 static int
1032 filt_timervalidate(struct knote *kn)
1033 {
1034 uint64_t multiplier;
1035 uint64_t raw = 0;
1036
1037 switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
1038 case NOTE_SECONDS:
1039 multiplier = NSEC_PER_SEC;
1040 break;
1041 case NOTE_USECONDS:
1042 multiplier = NSEC_PER_USEC;
1043 break;
1044 case NOTE_NSECONDS:
1045 multiplier = 1;
1046 break;
1047 case 0: /* milliseconds (default) */
1048 multiplier = NSEC_PER_SEC / 1000;
1049 break;
1050 default:
1051 return (EINVAL);
1052 }
1053
1054 /* if a slop delta (leeway) was passed in kn_ext[1], transform it to the same time scale */
1055 if (kn->kn_sfflags & NOTE_LEEWAY) {
1056 nanoseconds_to_absolutetime((uint64_t)kn->kn_ext[1] * multiplier, &raw);
1057 kn->kn_ext[1] = raw;
1058 }
1059
1060 nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
1061
1062 kn->kn_ext[0] = 0;
1063 kn->kn_sdata = 0;
1064
1065 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
1066 clock_sec_t seconds;
1067 clock_nsec_t nanoseconds;
1068 uint64_t now;
1069
1070 clock_get_calendar_nanotime(&seconds, &nanoseconds);
1071 nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
1072 nanoseconds, &now);
1073
1074 /* if time is in the future */
1075 if (now < raw) {
1076 raw -= now;
1077
1078 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1079 clock_continuoustime_interval_to_deadline(raw,
1080 &kn->kn_ext[0]);
1081 } else {
1082 clock_absolutetime_interval_to_deadline(raw,
1083 &kn->kn_ext[0]);
1084 }
1085 }
1086 } else {
1087 kn->kn_sdata = raw;
1088 }
1089
1090 return (0);
1091 }
1092
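/*
 * Userspace sketch (illustrative only; the helper name is hypothetical):
 * arming a repeating EVFILT_TIMER knote. The fflags select one of the units
 * validated above; with no unit flag, data is interpreted as milliseconds.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
arm_repeating_timer(int kq)
{
	struct kevent kev;

	/* fire every 5 seconds (NOTE_SECONDS selects the unit for data) */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS, 5, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif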
1093 /*
1094 * filt_timerupdate - compute the next deadline
1095 *
1096 * Repeating timers store their interval in kn_sdata. Absolute
1097 * timers have already calculated the deadline, stored in ext[0].
1098 *
1099 * On return, the next deadline (or zero if no deadline is needed)
1100 * is stored in kn_ext[0].
1101 *
1102 * Timer filter lock is held.
1103 */
1104 static void
1105 filt_timerupdate(struct knote *kn, int num_fired)
1106 {
1107 assert(num_fired > 0);
1108
1109 /* if there's no interval, deadline is just in kn_ext[0] */
1110 if (kn->kn_sdata == 0)
1111 return;
1112
1113 /* if timer hasn't fired before, fire in interval nsecs */
1114 if (kn->kn_ext[0] == 0) {
1115 assert(num_fired == 1);
1116 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
1117 clock_continuoustime_interval_to_deadline(kn->kn_sdata,
1118 &kn->kn_ext[0]);
1119 } else {
1120 clock_absolutetime_interval_to_deadline(kn->kn_sdata,
1121 &kn->kn_ext[0]);
1122 }
1123 } else {
1124 /*
1125 * If timer has fired before, schedule the next pop
1126 * relative to the last intended deadline.
1127 *
1128 * We could check for whether the deadline has expired,
1129 * but the thread call layer can handle that.
1130 *
1131 * Go forward an additional number of periods, in the case the
1132 * timer fired multiple times while the system was asleep.
1133 */
1134 kn->kn_ext[0] += (kn->kn_sdata * num_fired);
1135 }
1136 }
1137
1138 /*
1139 * filt_timerexpire - the timer callout routine
1140 *
1141 * Just propagate the timer event into the knote
1142 * filter routine (by going through the knote
1143 * synchronization point). Pass a hint to
1144 * indicate this is a real event, not just a
1145 * query from above.
1146 */
1147 static void
1148 filt_timerexpire(void *knx, __unused void *spare)
1149 {
1150 struct klist timer_list;
1151 struct knote *kn = knx;
1152
1153 filt_timerlock();
1154
1155 kn->kn_hookid &= ~TIMER_RUNNING;
1156
1157 /* no "object" for timers, so fake a list */
1158 SLIST_INIT(&timer_list);
1159 SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
1160 KNOTE(&timer_list, 1);
1161
1162 /* if someone is waiting for timer to pop */
1163 if (kn->kn_hookid & TIMER_CANCELWAIT) {
1164 struct kqueue *kq = knote_get_kq(kn);
1165 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
1166 CAST_EVENT64_T(&kn->kn_hook),
1167 THREAD_AWAKENED,
1168 WAITQ_ALL_PRIORITIES);
1169 }
1170
1171 filt_timerunlock();
1172 }
1173
1174 /*
1175 * Cancel a running timer (or wait for the pop).
1176 * Timer filter lock is held.
1177 */
1178 static void
1179 filt_timercancel(struct knote *kn)
1180 {
1181 struct kqueue *kq = knote_get_kq(kn);
1182 thread_call_t callout = kn->kn_hook;
1183 boolean_t cancelled;
1184
1185 if (kn->kn_hookid & TIMER_RUNNING) {
1186 /* cancel the callout if we can */
1187 cancelled = thread_call_cancel(callout);
1188 if (cancelled) {
1189 kn->kn_hookid &= ~TIMER_RUNNING;
1190 } else {
1191 /* we have to wait for the expire routine. */
1192 kn->kn_hookid |= TIMER_CANCELWAIT;
1193 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
1194 CAST_EVENT64_T(&kn->kn_hook),
1195 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1196 filt_timerunlock();
1197 thread_block(THREAD_CONTINUE_NULL);
1198 filt_timerlock();
1199 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
1200 }
1201 }
1202 }
1203
1204 /*
1205 * Allocate a thread call for the knote's lifetime, and kick off the timer.
1206 */
1207 static int
1208 filt_timerattach(struct knote *kn)
1209 {
1210 thread_call_t callout;
1211 int error;
1212 int res;
1213
1214 callout = thread_call_allocate(filt_timerexpire, kn);
1215 if (NULL == callout) {
1216 kn->kn_flags = EV_ERROR;
1217 kn->kn_data = ENOMEM;
1218 return 0;
1219 }
1220
1221 filt_timerlock();
1222 error = filt_timervalidate(kn);
1223 if (error != 0) {
1224 filt_timerunlock();
1225 thread_call_free(callout);
1226 kn->kn_flags = EV_ERROR;
1227 kn->kn_data = error;
1228 return 0;
1229 }
1230
1231 kn->kn_hook = (void*)callout;
1232 kn->kn_hookid = 0;
1233
1234 /* absolute=EV_ONESHOT */
1235 if (kn->kn_sfflags & NOTE_ABSOLUTE)
1236 kn->kn_flags |= EV_ONESHOT;
1237
1238 filt_timerupdate(kn, 1);
1239 if (kn->kn_ext[0]) {
1240 kn->kn_flags |= EV_CLEAR;
1241 unsigned int timer_flags = 0;
1242 if (kn->kn_sfflags & NOTE_CRITICAL)
1243 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1244 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1245 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1246 else
1247 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1248
1249 if (kn->kn_sfflags & NOTE_LEEWAY)
1250 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1251 if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
1252 timer_flags |= THREAD_CALL_CONTINUOUS;
1253
1254 thread_call_enter_delayed_with_leeway(callout, NULL,
1255 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1256
1257 kn->kn_hookid |= TIMER_RUNNING;
1258 } else {
1259 /* fake immediate */
1260 kn->kn_data = 1;
1261 }
1262
1263 res = (kn->kn_data > 0);
1264
1265 filt_timerunlock();
1266
1267 return res;
1268 }
1269
1270 /*
1271 * Shut down the timer if it's running, and free the callout.
1272 */
1273 static void
1274 filt_timerdetach(struct knote *kn)
1275 {
1276 thread_call_t callout;
1277
1278 filt_timerlock();
1279
1280 callout = (thread_call_t)kn->kn_hook;
1281 filt_timercancel(kn);
1282
1283 filt_timerunlock();
1284
1285 thread_call_free(callout);
1286 }
1287
1288
1289 static int filt_timer_num_fired(struct knote *kn)
1290 {
1291 /* by default we fire a timer once */
1292 int num_fired = 1;
1293
1294 /*
1295 * When the time base is mach_continuous_time, we have to calculate
1296 * the number of times the timer fired while we were asleep.
1297 */
1298 if ((kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) &&
1299 (kn->kn_sdata != 0) &&
1300 (kn->kn_ext[0] != 0))
1301 {
1302 const uint64_t now = mach_continuous_time();
1303 // time for timer to fire (right now) is kn_ext[0]
1304 // kn_sdata is period for timer to fire
1305 assert(now >= kn->kn_ext[0]);
1306 assert(kn->kn_sdata > 0);
1307
1308 const uint64_t overrun_ticks = now - kn->kn_ext[0];
1309 const uint64_t kn_sdata = kn->kn_sdata;
1310
1311 if (overrun_ticks < kn_sdata) {
1312 num_fired = 1;
1313 } else if (overrun_ticks < (kn_sdata << 1)) {
1314 num_fired = 2;
1315 } else {
1316 num_fired = (overrun_ticks / kn_sdata) + 1;
1317 }
1318 }
1319
1320 return num_fired;
1321 }
1322
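/*
 * For example (arithmetic only): with a period (kn_sdata) of 10 ticks and a
 * previous deadline (kn_ext[0]) that is now 35 ticks in the past,
 * overrun_ticks = 35, so num_fired = 35 / 10 + 1 = 4.
 */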
1323 /*
1324 * filt_timer - post events to a timer knote
1325 *
1326 * Count the timer fire and re-arm as requested.
1327 * This always crosses the threshold of interest,
1328 * so always return an indication that the knote
1329 * should be activated (if not already).
1330 */
1331 static int
1332 filt_timer(
1333 struct knote *kn,
1334 long hint)
1335 {
1336 #pragma unused(hint)
1337
1338 /* real timer pop -- timer lock held by filt_timerexpire */
1339 int num_fired = filt_timer_num_fired(kn);
1340 kn->kn_data += num_fired;
1341
1342 if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
1343 ((kn->kn_flags & EV_ONESHOT) == 0)) {
1344 /* evaluate next time to fire */
1345 filt_timerupdate(kn, num_fired);
1346
1347 if (kn->kn_ext[0]) {
1348 unsigned int timer_flags = 0;
1349
1350 /* keep the callout and re-arm */
1351 if (kn->kn_sfflags & NOTE_CRITICAL)
1352 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1353 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1354 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1355 else
1356 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1357
1358 if (kn->kn_sfflags & NOTE_LEEWAY)
1359 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1360
1361 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1362 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1363
1364 kn->kn_hookid |= TIMER_RUNNING;
1365 }
1366 }
1367 return (1);
1368 }
1369
1370
1371
1372 /*
1373 * filt_timertouch - update timer knote with new user input
1374 *
1375 * Cancel and restart the timer based on new user data. When
1376 * the user picks up a knote, clear the count of how many timer
1377 * pops have gone off (in kn_data).
1378 */
1379 static int
1380 filt_timertouch(
1381 struct knote *kn,
1382 struct kevent_internal_s *kev)
1383 {
1384 int error;
1385 int res;
1386
1387 filt_timerlock();
1388
1389 /* cancel current call */
1390 filt_timercancel(kn);
1391
1392 /* capture the new values used to compute deadline */
1393 kn->kn_sdata = kev->data;
1394 kn->kn_sfflags = kev->fflags;
1395 kn->kn_ext[0] = kev->ext[0];
1396 kn->kn_ext[1] = kev->ext[1];
1397
1398 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1399 kn->kn_udata = kev->udata;
1400
1401 /* recalculate deadline */
1402 error = filt_timervalidate(kn);
1403 if (error) {
1404 /* no way to report error, so mark it in the knote */
1405 filt_timerunlock();
1406 kn->kn_flags |= EV_ERROR;
1407 kn->kn_data = error;
1408 return 1;
1409 }
1410
1411 /* start timer if necessary */
1412 filt_timerupdate(kn, 1);
1413
1414 if (kn->kn_ext[0]) {
1415 unsigned int timer_flags = 0;
1416 if (kn->kn_sfflags & NOTE_CRITICAL)
1417 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1418 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1419 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1420 else
1421 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1422
1423 if (kn->kn_sfflags & NOTE_LEEWAY)
1424 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1425
1426 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1427 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1428
1429 kn->kn_hookid |= TIMER_RUNNING;
1430 } else {
1431 /* pretend the timer has fired */
1432 kn->kn_data = 1;
1433 }
1434
1435 /* capture if already fired */
1436 res = (kn->kn_data > 0);
1437
1438 filt_timerunlock();
1439
1440 return res;
1441 }
1442
1443 /*
1444 * filt_timerprocess - query state of knote and snapshot event data
1445 *
1446 * Determine if the timer has fired in the past, snapshot the state
1447 * of the kevent for returning to user-space, and clear pending event
1448 * counters for the next time.
1449 */
1450 static int
1451 filt_timerprocess(
1452 struct knote *kn,
1453 __unused struct filt_process_s *data,
1454 struct kevent_internal_s *kev)
1455 {
1456 filt_timerlock();
1457
1458 /* user-query */
1459 if (kn->kn_data == 0) {
1460 filt_timerunlock();
1461 return 0;
1462 }
1463
1464 /*
1465 * Copy out the interesting kevent state,
1466 * but don't leak out the raw time calculations.
1467 */
1468 *kev = kn->kn_kevent;
1469 kev->ext[0] = 0;
1470 /* kev->ext[1] = 0; JMM - shouldn't we hide this too? */
1471
1472 /*
1473 * reset the timer pop count in kn_data
1474 * and (optionally) clear the fflags.
1475 */
1476 kn->kn_data = 0;
1477 if (kn->kn_flags & EV_CLEAR)
1478 kn->kn_fflags = 0;
1479
1480 filt_timerunlock();
1481 return 1;
1482 }
1483
1484 static void
1485 filt_timerlock(void)
1486 {
1487 lck_mtx_lock(&_filt_timerlock);
1488 }
1489
1490 static void
1491 filt_timerunlock(void)
1492 {
1493 lck_mtx_unlock(&_filt_timerlock);
1494 }
1495
1496 static void
1497 filt_userlock(void)
1498 {
1499 lck_spin_lock(&_filt_userlock);
1500 }
1501
1502 static void
1503 filt_userunlock(void)
1504 {
1505 lck_spin_unlock(&_filt_userlock);
1506 }
1507
1508 static int
1509 filt_userattach(struct knote *kn)
1510 {
1511 /* EVFILT_USER knotes are not attached to anything in the kernel */
1512 /* Can't discover this knote until after attach - so no lock needed */
1513 kn->kn_hook = NULL;
1514 if (kn->kn_fflags & NOTE_TRIGGER) {
1515 kn->kn_hookid = 1;
1516 } else {
1517 kn->kn_hookid = 0;
1518 }
1519 return (kn->kn_hookid);
1520 }
1521
1522 static void
1523 filt_userdetach(__unused struct knote *kn)
1524 {
1525 /* EVFILT_USER knotes are not attached to anything in the kernel */
1526 }
1527
1528 static int
1529 filt_user(
1530 __unused struct knote *kn,
1531 __unused long hint)
1532 {
1533 panic("filt_user");
1534 return 0;
1535 }
1536
1537 static int
1538 filt_usertouch(
1539 struct knote *kn,
1540 struct kevent_internal_s *kev)
1541 {
1542 uint32_t ffctrl;
1543 int fflags;
1544 int active;
1545
1546 filt_userlock();
1547
1548 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1549 fflags = kev->fflags & NOTE_FFLAGSMASK;
1550 switch (ffctrl) {
1551 case NOTE_FFNOP:
1552 break;
1553 case NOTE_FFAND:
1554 kn->kn_sfflags &= fflags;
1555 break;
1556 case NOTE_FFOR:
1557 kn->kn_sfflags |= fflags;
1558 break;
1559 case NOTE_FFCOPY:
1560 kn->kn_sfflags = fflags;
1561 break;
1562 }
1563 kn->kn_sdata = kev->data;
1564
1565 if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
1566 kn->kn_udata = kev->udata;
1567
1568 if (kev->fflags & NOTE_TRIGGER) {
1569 kn->kn_hookid = 1;
1570 }
1571 active = kn->kn_hookid;
1572
1573 filt_userunlock();
1574
1575 return (active);
1576 }
1577
1578 static int
1579 filt_userprocess(
1580 struct knote *kn,
1581 __unused struct filt_process_s *data,
1582 struct kevent_internal_s *kev)
1583 {
1584 filt_userlock();
1585
1586 if (kn->kn_hookid == 0) {
1587 filt_userunlock();
1588 return 0;
1589 }
1590
1591 *kev = kn->kn_kevent;
1592 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1593 kev->data = kn->kn_sdata;
1594 if (kn->kn_flags & EV_CLEAR) {
1595 kn->kn_hookid = 0;
1596 kn->kn_data = 0;
1597 kn->kn_fflags = 0;
1598 }
1599 filt_userunlock();
1600
1601 return 1;
1602 }
1603
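/*
 * Userspace sketch (illustrative only; WAKEUP_IDENT and the helper names are
 * hypothetical): an EVFILT_USER knote is registered once and later fired from
 * any thread with NOTE_TRIGGER, which filt_usertouch() above turns into an
 * activation. EV_CLEAR lets filt_userprocess() reset the trigger state after
 * each delivery.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

#define WAKEUP_IDENT	1

static int
setup_user_event(int kq)
{
	struct kevent kev;

	/* register the user event; no fflags or data needed yet */
	EV_SET(&kev, WAKEUP_IDENT, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}

static int
fire_user_event(int kq)
{
	struct kevent kev;

	/* trigger it: a thread blocked in kevent() on kq will wake up */
	EV_SET(&kev, WAKEUP_IDENT, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif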
1604 /*
1605 * JMM - placeholder for not-yet-implemented filters
1606 */
1607 static int
1608 filt_badattach(__unused struct knote *kn)
1609 {
1610 kn->kn_flags |= EV_ERROR;
1611 kn->kn_data = ENOTSUP;
1612 return 0;
1613 }
1614
1615 struct kqueue *
1616 kqueue_alloc(struct proc *p, unsigned int flags)
1617 {
1618 struct filedesc *fdp = p->p_fd;
1619 struct kqueue *kq = NULL;
1620 int policy;
1621 void *hook;
1622 uint64_t kq_addr_offset;
1623
1624 if (flags & KEVENT_FLAG_WORKQ) {
1625 struct kqworkq *kqwq;
1626 int i;
1627
1628 kqwq = (struct kqworkq *)zalloc(kqworkq_zone);
1629 if (kqwq == NULL)
1630 return NULL;
1631
1632 kq = &kqwq->kqwq_kqueue;
1633 bzero(kqwq, sizeof (struct kqworkq));
1634
1635 kqwq->kqwq_state = KQ_WORKQ;
1636
1637 for (i = 0; i < KQWQ_NBUCKETS; i++) {
1638 TAILQ_INIT(&kq->kq_queue[i]);
1639 }
1640 for (i = 0; i < KQWQ_NQOS; i++) {
1641 TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed);
1642 }
1643
1644 lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr);
1645 policy = SYNC_POLICY_FIFO;
1646 hook = (void *)kqwq;
1647
1648 } else {
1649 struct kqfile *kqf;
1650
1651 kqf = (struct kqfile *)zalloc(kqfile_zone);
1652 if (kqf == NULL)
1653 return NULL;
1654
1655 kq = &kqf->kqf_kqueue;
1656 bzero(kqf, sizeof (struct kqfile));
1657 TAILQ_INIT(&kq->kq_queue[0]);
1658 TAILQ_INIT(&kqf->kqf_suppressed);
1659
1660 policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
1661 hook = NULL;
1662
1663 }
1664
1665 waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
1666 lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1667 kq->kq_p = p;
1668
1669 if (fdp->fd_knlistsize < 0) {
1670 proc_fdlock(p);
1671 if (fdp->fd_knlistsize < 0)
1672 fdp->fd_knlistsize = 0; /* this process has had a kq */
1673 proc_fdunlock(p);
1674 }
1675
1676 kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
1677 /* Assert that the address can be pointer compacted for use with knote */
1678 assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE));
1679 return (kq);
1680 }
1681
1682 /*
1683 * kqueue_dealloc - detach all knotes from a kqueue and free it
1684 *
1685 * We walk each list looking for knotes referencing this
1686 * kqueue. If we find one, we try to drop it. But
1687 * if we fail to get a drop reference, that will wait
1688 * until it is dropped. So, we can just restart again
1689 * safe in the assumption that the list will eventually
1690 * not contain any more references to this kqueue (either
1691 * we dropped them all, or someone else did).
1692 *
1693 * Assumes no new events are being added to the kqueue.
1694 * Nothing locked on entry or exit.
1695 */
1696 void
1697 kqueue_dealloc(struct kqueue *kq)
1698 {
1699 struct proc *p;
1700 struct filedesc *fdp;
1701 struct knote *kn;
1702 int i;
1703
1704 if (kq == NULL)
1705 return;
1706
1707 p = kq->kq_p;
1708 fdp = p->p_fd;
1709
1710 proc_fdlock(p);
1711 for (i = 0; i < fdp->fd_knlistsize; i++) {
1712 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1713 while (kn != NULL) {
1714 if (kq == knote_get_kq(kn)) {
1715 kqlock(kq);
1716 proc_fdunlock(p);
1717 /* drop it ourselves or wait */
1718 if (kqlock2knotedrop(kq, kn)) {
1719 knote_drop(kn, p);
1720 }
1721 proc_fdlock(p);
1722 /* start over at beginning of list */
1723 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1724 continue;
1725 }
1726 kn = SLIST_NEXT(kn, kn_link);
1727 }
1728 }
1729 if (fdp->fd_knhashmask != 0) {
1730 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1731 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1732 while (kn != NULL) {
1733 if (kq == knote_get_kq(kn)) {
1734 kqlock(kq);
1735 proc_fdunlock(p);
1736 /* drop it ourselves or wait */
1737 if (kqlock2knotedrop(kq, kn)) {
1738 knote_drop(kn, p);
1739 }
1740 proc_fdlock(p);
1741 /* start over at beginning of list */
1742 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1743 continue;
1744 }
1745 kn = SLIST_NEXT(kn, kn_link);
1746 }
1747 }
1748 }
1749 proc_fdunlock(p);
1750
1751 /*
1752 * waitq_set_deinit() remove the KQ's waitq set from
1753 * any select sets to which it may belong.
1754 */
1755 waitq_set_deinit(&kq->kq_wqs);
1756 lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1757
1758 if (kq->kq_state & KQ_WORKQ) {
1759 struct kqworkq *kqwq = (struct kqworkq *)kq;
1760
1761 lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp);
1762 zfree(kqworkq_zone, kqwq);
1763 } else {
1764 struct kqfile *kqf = (struct kqfile *)kq;
1765
1766 zfree(kqfile_zone, kqf);
1767 }
1768 }
1769
1770 int
1771 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
1772 {
1773 struct kqueue *kq;
1774 struct fileproc *fp;
1775 int fd, error;
1776
1777 error = falloc_withalloc(p,
1778 &fp, &fd, vfs_context_current(), fp_zalloc, cra);
1779 if (error) {
1780 return (error);
1781 }
1782
1783 kq = kqueue_alloc(p, 0);
1784 if (kq == NULL) {
1785 fp_free(p, fd, fp);
1786 return (ENOMEM);
1787 }
1788
1789 fp->f_flag = FREAD | FWRITE;
1790 fp->f_ops = &kqueueops;
1791 fp->f_data = kq;
1792
1793 proc_fdlock(p);
1794 *fdflags(p, fd) |= UF_EXCLOSE;
1795 procfdtbl_releasefd(p, fd, NULL);
1796 fp_drop(p, fd, fp, 1);
1797 proc_fdunlock(p);
1798
1799 *retval = fd;
1800 return (error);
1801 }
1802
1803 int
1804 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1805 {
1806 return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
1807 }
1808
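/*
 * Userspace sketch (illustrative only; the helper name is hypothetical): the
 * minimal round trip through the kqueue file descriptor created by the
 * kqueue() path above - register one EVFILT_READ knote, then block for it.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
wait_for_stdin(void)
{
	struct kevent kev;
	int kq = kqueue();

	if (kq == -1)
		return -1;

	/* watch stdin for readability */
	EV_SET(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return -1;

	/* block until at least one event is returned */
	return kevent(kq, NULL, 0, &kev, 1, NULL);
}
#endif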
1809 static int
1810 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
1811 unsigned int flags)
1812 {
1813 int advance;
1814 int error;
1815
1816 if (flags & KEVENT_FLAG_LEGACY32) {
1817 bzero(kevp, sizeof (*kevp));
1818
1819 if (IS_64BIT_PROCESS(p)) {
1820 struct user64_kevent kev64;
1821
1822 advance = sizeof (kev64);
1823 error = copyin(*addrp, (caddr_t)&kev64, advance);
1824 if (error)
1825 return (error);
1826 kevp->ident = kev64.ident;
1827 kevp->filter = kev64.filter;
1828 kevp->flags = kev64.flags;
1829 kevp->udata = kev64.udata;
1830 kevp->fflags = kev64.fflags;
1831 kevp->data = kev64.data;
1832 } else {
1833 struct user32_kevent kev32;
1834
1835 advance = sizeof (kev32);
1836 error = copyin(*addrp, (caddr_t)&kev32, advance);
1837 if (error)
1838 return (error);
1839 kevp->ident = (uintptr_t)kev32.ident;
1840 kevp->filter = kev32.filter;
1841 kevp->flags = kev32.flags;
1842 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1843 kevp->fflags = kev32.fflags;
1844 kevp->data = (intptr_t)kev32.data;
1845 }
1846 } else if (flags & KEVENT_FLAG_LEGACY64) {
1847 struct kevent64_s kev64;
1848
1849 bzero(kevp, sizeof (*kevp));
1850
1851 advance = sizeof (struct kevent64_s);
1852 error = copyin(*addrp, (caddr_t)&kev64, advance);
1853 if (error)
1854 return(error);
1855 kevp->ident = kev64.ident;
1856 kevp->filter = kev64.filter;
1857 kevp->flags = kev64.flags;
1858 kevp->udata = kev64.udata;
1859 kevp->fflags = kev64.fflags;
1860 kevp->data = kev64.data;
1861 kevp->ext[0] = kev64.ext[0];
1862 kevp->ext[1] = kev64.ext[1];
1863
1864 } else {
1865 struct kevent_qos_s kevqos;
1866
1867 bzero(kevp, sizeof (*kevp));
1868
1869 advance = sizeof (struct kevent_qos_s);
1870 error = copyin(*addrp, (caddr_t)&kevqos, advance);
1871 if (error)
1872 return error;
1873 kevp->ident = kevqos.ident;
1874 kevp->filter = kevqos.filter;
1875 kevp->flags = kevqos.flags;
1876 kevp->qos = kevqos.qos;
1877 // kevp->xflags = kevqos.xflags;
1878 kevp->udata = kevqos.udata;
1879 kevp->fflags = kevqos.fflags;
1880 kevp->data = kevqos.data;
1881 kevp->ext[0] = kevqos.ext[0];
1882 kevp->ext[1] = kevqos.ext[1];
1883 kevp->ext[2] = kevqos.ext[2];
1884 kevp->ext[3] = kevqos.ext[3];
1885 }
1886 if (!error)
1887 *addrp += advance;
1888 return (error);
1889 }
1890
1891 static int
1892 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
1893 unsigned int flags)
1894 {
1895 user_addr_t addr = *addrp;
1896 int advance;
1897 int error;
1898
1899 /*
1900 * fully initialize the different output event structure
1901 * types from the internal kevent (and some universal
1902 * defaults for fields not represented in the internal
1903 * form).
1904 */
1905 if (flags & KEVENT_FLAG_LEGACY32) {
1906 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
1907
1908 if (IS_64BIT_PROCESS(p)) {
1909 struct user64_kevent kev64;
1910
1911 advance = sizeof (kev64);
1912 bzero(&kev64, advance);
1913
1914 /*
1915 * deal with the special case of a user-supplied
1916 * value of (uintptr_t)-1.
1917 */
1918 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1919 (uint64_t)-1LL : (uint64_t)kevp->ident;
1920
1921 kev64.filter = kevp->filter;
1922 kev64.flags = kevp->flags;
1923 kev64.fflags = kevp->fflags;
1924 kev64.data = (int64_t) kevp->data;
1925 kev64.udata = kevp->udata;
1926 error = copyout((caddr_t)&kev64, addr, advance);
1927 } else {
1928 struct user32_kevent kev32;
1929
1930 advance = sizeof (kev32);
1931 bzero(&kev32, advance);
1932 kev32.ident = (uint32_t)kevp->ident;
1933 kev32.filter = kevp->filter;
1934 kev32.flags = kevp->flags;
1935 kev32.fflags = kevp->fflags;
1936 kev32.data = (int32_t)kevp->data;
1937 kev32.udata = kevp->udata;
1938 error = copyout((caddr_t)&kev32, addr, advance);
1939 }
1940 } else if (flags & KEVENT_FLAG_LEGACY64) {
1941 struct kevent64_s kev64;
1942
1943 advance = sizeof (struct kevent64_s);
1944 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1945 addr -= advance;
1946 }
1947 bzero(&kev64, advance);
1948 kev64.ident = kevp->ident;
1949 kev64.filter = kevp->filter;
1950 kev64.flags = kevp->flags;
1951 kev64.fflags = kevp->fflags;
1952 kev64.data = (int64_t) kevp->data;
1953 kev64.udata = kevp->udata;
1954 kev64.ext[0] = kevp->ext[0];
1955 kev64.ext[1] = kevp->ext[1];
1956 error = copyout((caddr_t)&kev64, addr, advance);
1957 } else {
1958 struct kevent_qos_s kevqos;
1959
1960 advance = sizeof (struct kevent_qos_s);
1961 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1962 addr -= advance;
1963 }
1964 bzero(&kevqos, advance);
1965 kevqos.ident = kevp->ident;
1966 kevqos.filter = kevp->filter;
1967 kevqos.flags = kevp->flags;
1968 kevqos.qos = kevp->qos;
1969 kevqos.udata = kevp->udata;
1970 kevqos.fflags = kevp->fflags;
1971 kevqos.xflags = 0;
1972 kevqos.data = (int64_t) kevp->data;
1973 kevqos.ext[0] = kevp->ext[0];
1974 kevqos.ext[1] = kevp->ext[1];
1975 kevqos.ext[2] = kevp->ext[2];
1976 kevqos.ext[3] = kevp->ext[3];
1977 error = copyout((caddr_t)&kevqos, addr, advance);
1978 }
1979 if (!error) {
1980 if (flags & KEVENT_FLAG_STACK_EVENTS)
1981 *addrp = addr;
1982 else
1983 *addrp = addr + advance;
1984 }
1985 return (error);
1986 }
1987
1988 static int
1989 kevent_get_data_size(struct proc *p,
1990 uint64_t data_available,
1991 unsigned int flags,
1992 user_size_t *residp)
1993 {
1994 user_size_t resid;
1995 int error = 0;
1996
1997 if (data_available != USER_ADDR_NULL) {
1998 if (flags & KEVENT_FLAG_KERNEL) {
1999 resid = *(user_size_t *)(uintptr_t)data_available;
2000 } else if (IS_64BIT_PROCESS(p)) {
2001 user64_size_t usize;
2002 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
2003 resid = (user_size_t)usize;
2004 } else {
2005 user32_size_t usize;
2006 error = copyin((user_addr_t)data_available, &usize, sizeof(usize));
2007 resid = (user_size_t)usize;
2008 }
2009 if (error)
2010 return(error);
2011 } else {
2012 resid = 0;
2013 }
2014 *residp = resid;
2015 return 0;
2016 }
2017
2018 static int
2019 kevent_put_data_size(struct proc *p,
2020 uint64_t data_available,
2021 unsigned int flags,
2022 user_size_t resid)
2023 {
2024 int error = 0;
2025
2026 if (data_available) {
2027 if (flags & KEVENT_FLAG_KERNEL) {
2028 *(user_size_t *)(uintptr_t)data_available = resid;
2029 } else if (IS_64BIT_PROCESS(p)) {
2030 user64_size_t usize = (user64_size_t)resid;
2031 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
2032 } else {
2033 user32_size_t usize = (user32_size_t)resid;
2034 error = copyout(&usize, (user_addr_t)data_available, sizeof(usize));
2035 }
2036 }
2037 return error;
2038 }
2039
2040 /*
2041 * kevent_continue - continue a kevent syscall after blocking
2042 *
2043 * assume we inherit a use count on the kq fileglob.
2044 */
2045
2046 __attribute__((noreturn))
2047 static void
2048 kevent_continue(__unused struct kqueue *kq, void *data, int error)
2049 {
2050 struct _kevent *cont_args;
2051 struct fileproc *fp;
2052 uint64_t data_available;
2053 user_size_t data_size;
2054 user_size_t data_resid;
2055 unsigned int flags;
2056 int32_t *retval;
2057 int noutputs;
2058 int fd;
2059 struct proc *p = current_proc();
2060
2061 cont_args = (struct _kevent *)data;
2062 data_available = cont_args->data_available;
2063 flags = cont_args->process_data.fp_flags;
2064 data_size = cont_args->process_data.fp_data_size;
2065 data_resid = cont_args->process_data.fp_data_resid;
2066 noutputs = cont_args->eventout;
2067 retval = cont_args->retval;
2068 fd = cont_args->fd;
2069 fp = cont_args->fp;
2070
2071 if (fp != NULL)
2072 fp_drop(p, fd, fp, 0);
2073
2074 /* don't abandon other output just because of residual copyout failures */
2075 if (error == 0 && data_available && data_resid != data_size) {
2076 (void)kevent_put_data_size(p, data_available, flags, data_resid);
2077 }
2078
2079 /* don't restart after signals... */
2080 if (error == ERESTART)
2081 error = EINTR;
2082 else if (error == EWOULDBLOCK)
2083 error = 0;
2084 if (error == 0)
2085 *retval = noutputs;
2086 unix_syscall_return(error);
2087 }
2088
2089 /*
2090 * kevent - [syscall] register and wait for kernel events
2091 *
2092 */
2093 int
2094 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
2095 {
2096 unsigned int flags = KEVENT_FLAG_LEGACY32;
2097
2098 return kevent_internal(p,
2099 uap->fd,
2100 uap->changelist, uap->nchanges,
2101 uap->eventlist, uap->nevents,
2102 0ULL, 0ULL,
2103 flags,
2104 uap->timeout,
2105 kevent_continue,
2106 retval);
2107 }
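/*
 * Illustrative sketch (comment only, not compiled here) of the classic
 * user-space shape of the legacy kevent(2) path above. Names such as
 * sock_fd and handle_readable are hypothetical; the declarations come
 * from <sys/event.h>.
 *
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *
 *	int kq = kqueue();
 *	struct kevent chg, ev;
 *	struct timespec ts = { 5, 0 };		// wait up to 5 seconds
 *
 *	// register interest and wait for one event in a single call
 *	EV_SET(&chg, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	int n = kevent(kq, &chg, 1, &ev, 1, &ts);
 *	if (n > 0)
 *		handle_readable((int)ev.ident, ev.data);	// hypothetical handler
 */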
2108
2109 int
2110 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
2111 {
2112 unsigned int flags;
2113
2114 /* restrict to user flags and set legacy64 */
2115 flags = uap->flags & KEVENT_FLAG_USER;
2116 flags |= KEVENT_FLAG_LEGACY64;
2117
2118 return kevent_internal(p,
2119 uap->fd,
2120 uap->changelist, uap->nchanges,
2121 uap->eventlist, uap->nevents,
2122 0ULL, 0ULL,
2123 flags,
2124 uap->timeout,
2125 kevent_continue,
2126 retval);
2127 }
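/*
 * Illustrative sketch (comment only): the same registration expressed
 * through the LEGACY64 path above, using struct kevent64_s and EV_SET64
 * from <sys/event.h>. EV_SET64's trailing two arguments populate
 * ext[0]/ext[1]; kq and fd are hypothetical.
 *
 *	struct kevent64_s chg, ev;
 *
 *	// register fd, then block until it becomes readable
 *	EV_SET64(&chg, fd, EVFILT_READ, EV_ADD, 0, 0, 0, 0, 0);
 *	int n = kevent64(kq, &chg, 1, &ev, 1, 0, NULL);	// flags 0, no timeout
 */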
2128
2129 int
2130 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
2131 {
2132 /* restrict to user flags */
2133 uap->flags &= KEVENT_FLAG_USER;
2134
2135 return kevent_internal(p,
2136 uap->fd,
2137 uap->changelist, uap->nchanges,
2138 uap->eventlist, uap->nevents,
2139 uap->data_out, (uint64_t)uap->data_available,
2140 uap->flags,
2141 0ULL,
2142 kevent_continue,
2143 retval);
2144 }
2145
2146 int
2147 kevent_qos_internal(struct proc *p, int fd,
2148 user_addr_t changelist, int nchanges,
2149 user_addr_t eventlist, int nevents,
2150 user_addr_t data_out, user_size_t *data_available,
2151 unsigned int flags,
2152 int32_t *retval)
2153 {
2154 return kevent_internal(p,
2155 fd,
2156 changelist, nchanges,
2157 eventlist, nevents,
2158 data_out, (uint64_t)data_available,
2159 (flags | KEVENT_FLAG_KERNEL),
2160 0ULL,
2161 NULL,
2162 retval);
2163 }
2164
2165 static int
2166 kevent_get_timeout(struct proc *p,
2167 user_addr_t utimeout,
2168 unsigned int flags,
2169 struct timeval *atvp)
2170 {
2171 struct timeval atv;
2172 int error = 0;
2173
2174 if (flags & KEVENT_FLAG_IMMEDIATE) {
2175 getmicrouptime(&atv);
2176 } else if (utimeout != USER_ADDR_NULL) {
2177 struct timeval rtv;
2178 if (flags & KEVENT_FLAG_KERNEL) {
2179 struct timespec *tsp = (struct timespec *)utimeout;
2180 TIMESPEC_TO_TIMEVAL(&rtv, tsp);
2181 } else if (IS_64BIT_PROCESS(p)) {
2182 struct user64_timespec ts;
2183 error = copyin(utimeout, &ts, sizeof(ts));
2184 if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
2185 error = EINVAL;
2186 else
2187 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
2188 } else {
2189 struct user32_timespec ts;
2190 error = copyin(utimeout, &ts, sizeof(ts));
2191 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
2192 }
2193 if (error)
2194 return (error);
2195 if (itimerfix(&rtv))
2196 return (EINVAL);
2197 getmicrouptime(&atv);
2198 timevaladd(&atv, &rtv);
2199 } else {
2200 /* wait forever value */
2201 atv.tv_sec = 0;
2202 atv.tv_usec = 0;
2203 }
2204 *atvp = atv;
2205 return 0;
2206 }
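/*
 * Illustrative sketch (comment only) of the timeout cases handled above,
 * as seen from user space: a NULL timeout blocks indefinitely, while a
 * zeroed timespec polls (the computed deadline is "now", matching
 * KEVENT_FLAG_IMMEDIATE). On the 64-bit copyin path a tv_sec that does
 * not fit in 32 bits is rejected with EINVAL. evs/nevs name a
 * hypothetical output buffer.
 *
 *	struct timespec poll_now = { 0, 0 };
 *
 *	// poll: returns immediately with whatever is already pending
 *	int n = kevent(kq, NULL, 0, evs, nevs, &poll_now);
 *
 *	// block: sleeps until an event fires or a signal interrupts
 *	n = kevent(kq, NULL, 0, evs, nevs, NULL);
 */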
2207
2208 static int
2209 kevent_set_kq_mode(struct kqueue *kq, unsigned int flags)
2210 {
2211 /* each kq should only be used for events of one type */
2212 kqlock(kq);
2213 if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
2214 if (flags & KEVENT_FLAG_LEGACY32) {
2215 if ((kq->kq_state & KQ_KEV32) == 0) {
2216 kqunlock(kq);
2217 return EINVAL;
2218 }
2219 } else if (kq->kq_state & KQ_KEV32) {
2220 kqunlock(kq);
2221 return EINVAL;
2222 }
2223 } else if (flags & KEVENT_FLAG_LEGACY32) {
2224 kq->kq_state |= KQ_KEV32;
2225 } else {
2226 /* JMM - set KQ_KEVQOS when we are ready for exclusive */
2227 kq->kq_state |= KQ_KEV64;
2228 }
2229 kqunlock(kq);
2230 return 0;
2231 }
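/*
 * Illustrative sketch (comment only) of the single-mode rule enforced
 * above: once a kqueue has been driven through the legacy 32-bit path it
 * refuses the wider forms, and vice versa. The failure happens before
 * any change is registered. kq and fd are hypothetical.
 *
 *	struct kevent k32;
 *	struct kevent64_s k64;
 *
 *	EV_SET(&k32, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &k32, 1, NULL, 0, NULL);	// marks the kq KQ_KEV32
 *
 *	EV_SET64(&k64, fd, EVFILT_WRITE, EV_ADD, 0, 0, 0, 0, 0);
 *	int r = kevent64(kq, &k64, 1, NULL, 0, 0, NULL);
 *	// r == -1 and errno == EINVAL: mixing event widths on one kq
 */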
2232
2233 static int
2234 kevent_get_kq(struct proc *p, int fd, unsigned int flags, struct fileproc **fpp, struct kqueue **kqp)
2235 {
2236 struct fileproc *fp = NULL;
2237 struct kqueue *kq;
2238 int error;
2239
2240 if (flags & KEVENT_FLAG_WORKQ) {
2241 /*
2242 * use the private kq associated with the proc workq.
2243 * Just being a thread within the process (and not
2244 * being the exit/exec thread) is enough to hold a
2245 * reference on this special kq.
2246 */
2247 kq = p->p_wqkqueue;
2248 if (kq == NULL) {
2249 struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
2250 if (alloc_kq == NULL)
2251 return ENOMEM;
2252
2253 proc_fdlock(p);
2254 if (p->p_wqkqueue == NULL) {
2255 kq = p->p_wqkqueue = alloc_kq;
2256 proc_fdunlock(p);
2257 } else {
2258 proc_fdunlock(p);
2259 kq = p->p_wqkqueue;
2260 kqueue_dealloc(alloc_kq);
2261 }
2262 }
2263 } else {
2264 /* get a usecount for the kq itself */
2265 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
2266 return (error);
2267 }
2268 if ((error = kevent_set_kq_mode(kq, flags)) != 0) {
2269 /* drop the usecount */
2270 if (fp != NULL)
2271 fp_drop(p, fd, fp, 0);
2272 return error;
2273 }
2274
2275 *fpp = fp;
2276 *kqp = kq;
2277 return 0;
2278 }
2279
2280
2281 static int
2282 kevent_internal(struct proc *p,
2283 int fd,
2284 user_addr_t changelist, int nchanges,
2285 user_addr_t ueventlist, int nevents,
2286 user_addr_t data_out, uint64_t data_available,
2287 unsigned int flags,
2288 user_addr_t utimeout,
2289 kqueue_continue_t continuation,
2290 int32_t *retval)
2291 {
2292 struct _kevent *cont_args;
2293 uthread_t ut;
2294 struct kqueue *kq;
2295 struct fileproc *fp = NULL;
2296 struct kevent_internal_s kev;
2297 int error, noutputs;
2298 struct timeval atv;
2299 user_size_t data_size;
2300 user_size_t data_resid;
2301
2302 /* Don't allow user-space threads to process output events from the workq kq */
2303 if ((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ &&
2304 !(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0)
2305 return EINVAL;
2306
2307 /* prepare to deal with stack-wise allocation of out events */
2308 if (flags & KEVENT_FLAG_STACK_EVENTS) {
2309 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
2310 (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
2311 sizeof(struct user32_kevent)) :
2312 ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
2313 sizeof(struct kevent_qos_s)));
2314 ueventlist += nevents * scale;
2315 }
2316
2317 /* convert timeout to absolute - if we have one (and not immediate) */
2318 error = kevent_get_timeout(p, utimeout, flags, &atv);
2319 if (error)
2320 return error;
2321
2322 /* copyin initial value of data residual from data_available */
2323 error = kevent_get_data_size(p, data_available, flags, &data_size);
2324 if (error)
2325 return error;
2326
2327 /* get the kq we are going to be working on */
2328 error = kevent_get_kq(p, fd, flags, &fp, &kq);
2329 if (error)
2330 return error;
2331
2332 /* register all the change requests the user provided... */
2333 noutputs = 0;
2334 while (nchanges > 0 && error == 0) {
2335 error = kevent_copyin(&changelist, &kev, p, flags);
2336 if (error)
2337 break;
2338
2339 /* Make sure user doesn't pass in any system flags */
2340 kev.flags &= ~EV_SYSFLAGS;
2341
2342 kevent_register(kq, &kev, p);
2343
2344 if (nevents > 0 &&
2345 ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) {
2346 if (kev.flags & EV_RECEIPT) {
2347 kev.flags |= EV_ERROR;
2348 kev.data = 0;
2349 }
2350 error = kevent_copyout(&kev, &ueventlist, p, flags);
2351 if (error == 0) {
2352 nevents--;
2353 noutputs++;
2354 }
2355 } else if (kev.flags & EV_ERROR) {
2356 error = kev.data;
2357 }
2358 nchanges--;
2359 }
2360
2361 /* short-circuit the scan if we only want error events */
2362 if (flags & KEVENT_FLAG_ERROR_EVENTS)
2363 nevents = 0;
2364
2365 /* process pending events */
2366 if (nevents > 0 && noutputs == 0 && error == 0) {
2367
2368 /* store the continuation/completion data in the uthread */
2369 ut = (uthread_t)get_bsdthread_info(current_thread());
2370 cont_args = &ut->uu_kevent.ss_kevent;
2371 cont_args->fp = fp;
2372 cont_args->fd = fd;
2373 cont_args->retval = retval;
2374 cont_args->eventlist = ueventlist;
2375 cont_args->eventcount = nevents;
2376 cont_args->eventout = noutputs;
2377 cont_args->data_available = data_available;
2378 cont_args->process_data.fp_fd = fd;
2379 cont_args->process_data.fp_flags = flags;
2380 cont_args->process_data.fp_data_out = data_out;
2381 cont_args->process_data.fp_data_size = data_size;
2382 cont_args->process_data.fp_data_resid = data_size;
2383
2384 error = kqueue_scan(kq, kevent_callback,
2385 continuation, cont_args,
2386 &cont_args->process_data,
2387 &atv, p);
2388
2389 /* process remaining outputs */
2390 noutputs = cont_args->eventout;
2391 data_resid = cont_args->process_data.fp_data_resid;
2392
2393 /* copyout residual data size value (if it needs to be copied out) */
2394 /* don't abandon other output just because of residual copyout failures */
2395 if (error == 0 && data_available && data_resid != data_size) {
2396 (void)kevent_put_data_size(p, data_available, flags, data_resid);
2397 }
2398 }
2399
2400 /* don't restart after signals... */
2401 if (error == ERESTART)
2402 error = EINTR;
2403 else if (error == EWOULDBLOCK)
2404 error = 0;
2405 if (error == 0)
2406 *retval = noutputs;
2407 if (fp != NULL)
2408 fp_drop(p, fd, fp, 0);
2409 return (error);
2410 }
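/*
 * Illustrative sketch (comment only) of the EV_RECEIPT handling in the
 * registration loop above: each change is reflected back as an EV_ERROR
 * record whose data field is 0 on success or an errno otherwise, and the
 * call returns with the receipts rather than scanning for normal events.
 * fd0, fd1 and report_failure are hypothetical.
 *
 *	struct kevent chgs[2], res[2];
 *
 *	EV_SET(&chgs[0], fd0, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	EV_SET(&chgs[1], fd1, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	int n = kevent(kq, chgs, 2, res, 2, NULL);	// n == 2 receipts
 *	for (int i = 0; i < n; i++)
 *		if ((res[i].flags & EV_ERROR) && res[i].data != 0)
 *			report_failure(i, (int)res[i].data);	// hypothetical
 */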
2411
2412
2413 /*
2414 * kevent_callback - callback for each individual event
2415 *
2416 * called with nothing locked
2417 * caller holds a reference on the kqueue
2418 */
2419 static int
2420 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
2421 void *data)
2422 {
2423 struct _kevent *cont_args;
2424 int error;
2425
2426 cont_args = (struct _kevent *)data;
2427 assert(cont_args->eventout < cont_args->eventcount);
2428
2429 /*
2430 * Copy out the appropriate amount of event data for this user.
2431 */
2432 error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
2433 cont_args->process_data.fp_flags);
2434
2435 /*
2436 * If there isn't space for additional events, return
2437 * a harmless error to stop the processing here
2438 */
2439 if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
2440 error = EWOULDBLOCK;
2441 return (error);
2442 }
2443
2444 /*
2445 * kevent_description - format a description of a kevent for diagnostic output
2446 *
2447 * called with a 256-byte string buffer
2448 */
2449
2450 char *
2451 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
2452 {
2453 snprintf(s, n,
2454 "kevent="
2455 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
2456 kevp->ident,
2457 kevp->filter,
2458 kevp->flags,
2459 kevp->udata,
2460 kevp->fflags,
2461 kevp->data,
2462 kevp->ext[0],
2463 kevp->ext[1] );
2464
2465 return (s);
2466 }
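/*
 * Minimal in-kernel usage sketch (comment only), assuming the caller
 * supplies the 256-byte buffer mentioned above:
 *
 *	char buf[256];
 *	printf("bad knote: %s\n", kevent_description(kevp, buf, sizeof(buf)));
 */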
2467
2468 /*
2469 * kevent_register - add a new event to a kqueue
2470 *
2471 * Creates a mapping between the event source and
2472 * the kqueue via a knote data structure.
2473 *
2474 * Because many/most of the event sources are file
2475 * descriptor related, the knote is linked off
2476 * the file descriptor table for quick access.
2477 *
2478 * called with nothing locked
2479 * caller holds a reference on the kqueue
2480 */
2481
2482 void
2483 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
2484 __unused struct proc *ctxp)
2485 {
2486 struct proc *p = kq->kq_p;
2487 struct filterops *fops;
2488 struct knote *kn = NULL;
2489 int result = 0;
2490 int error = 0;
2491
2492 if (kev->filter < 0) {
2493 if (kev->filter + EVFILT_SYSCOUNT < 0) {
2494 error = EINVAL;
2495 goto out;
2496 }
2497 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
2498 } else {
2499 error = EINVAL;
2500 goto out;
2501 }
2502
2503 /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
2504 if ((kev->flags & EV_VANISHED) &&
2505 (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
2506 error = EINVAL;
2507 goto out;
2508 }
2509
2510 /* Simplify the flags - delete and disable overrule */
2511 if (kev->flags & EV_DELETE)
2512 kev->flags &= ~EV_ADD;
2513 if (kev->flags & EV_DISABLE)
2514 kev->flags &= ~EV_ENABLE;
2515
2516 restart:
2517
2518 proc_fdlock(p);
2519
2520 /* find the matching knote from the fd tables/hashes */
2521 kn = knote_fdfind(kq, kev, p);
2522
2523 if (kn == NULL) {
2524 if (kev->flags & EV_ADD) {
2525 struct fileproc *fp = NULL;
2526
2527 /* grab a file reference for the new knote */
2528 if (fops->f_isfd) {
2529 if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
2530 proc_fdunlock(p);
2531 goto out;
2532 }
2533 }
2534
2535 kn = knote_alloc();
2536 if (kn == NULL) {
2537 proc_fdunlock(p);
2538 error = ENOMEM;
2539 if (fp != NULL)
2540 fp_drop(p, kev->ident, fp, 0);
2541 goto out;
2542 }
2543
2544 kn->kn_fp = fp;
2545 knote_set_kq(kn,kq);
2546 kn->kn_filtid = ~kev->filter;
2547 kn->kn_inuse = 1; /* for f_attach() */
2548 kn->kn_status = KN_ATTACHING | KN_ATTACHED;
2549
2550 /* was vanish support requested */
2551 if (kev->flags & EV_VANISHED) {
2552 kev->flags &= ~EV_VANISHED;
2553 kn->kn_status |= KN_REQVANISH;
2554 }
2555
2556 /* snapshot matching/dispatching protocol flags into knote */
2557 if (kev->flags & EV_DISPATCH)
2558 kn->kn_status |= KN_DISPATCH;
2559 if (kev->flags & EV_UDATA_SPECIFIC)
2560 kn->kn_status |= KN_UDATA_SPECIFIC;
2561
2562 /*
2563 * copy the kevent state into the knote.
2564 * The protocol is that fflags and data
2565 * are saved off, and cleared before
2566 * calling the attach routine.
2567 */
2568 kn->kn_kevent = *kev;
2569 kn->kn_sfflags = kev->fflags;
2570 kn->kn_sdata = kev->data;
2571 kn->kn_fflags = 0;
2572 kn->kn_data = 0;
2573
2574 /* invoke pthread kext to convert kevent qos to thread qos */
2575 if (kq->kq_state & KQ_WORKQ) {
2576 kn->kn_qos = canonicalize_kevent_qos(kn->kn_qos);
2577 knote_set_qos_index(kn, qos_index_from_qos(kn->kn_qos, FALSE));
2578 knote_set_qos_override_index(kn, QOS_INDEX_KQFILE);
2579 assert(knote_get_qos_index(kn) < KQWQ_NQOS);
2580 } else {
2581 knote_set_qos_index(kn, QOS_INDEX_KQFILE);
2582 knote_set_qos_override_index(kn, QOS_INDEX_KQFILE);
2583 }
2584
2585 /* before anyone can find it */
2586 if (kev->flags & EV_DISABLE)
2587 knote_disable(kn);
2588
2589 /* Add the knote for lookup thru the fd table */
2590 error = knote_fdadd(kn, p);
2591 proc_fdunlock(p);
2592
2593 if (error) {
2594 knote_free(kn);
2595 if (fp != NULL)
2596 fp_drop(p, kev->ident, fp, 0);
2597 goto out;
2598 }
2599
2600 /* fp reference count now applies to knote */
2601
2602 /* call filter attach routine */
2603 result = fops->f_attach(kn);
2604
2605 /*
2606 * Trade knote use count for kq lock.
2607 * Cannot be dropped because we held
2608 * KN_ATTACHING throughout.
2609 */
2610 knoteuse2kqlock(kq, kn, 1);
2611
2612 if (kn->kn_flags & EV_ERROR) {
2613 /*
2614 * Failed to attach correctly, so drop.
2615 * All other possible users/droppers
2616 * have deferred to us. Save the error
2617 * to return to our caller.
2618 */
2619 kn->kn_status &= ~KN_ATTACHED;
2620 kn->kn_status |= KN_DROPPING;
2621 error = kn->kn_data;
2622 kqunlock(kq);
2623 knote_drop(kn, p);
2624 goto out;
2625 }
2626
2627 /* end "attaching" phase - now just attached */
2628 kn->kn_status &= ~KN_ATTACHING;
2629
2630 if (kn->kn_status & KN_DROPPING) {
2631 /*
2632 * Attach succeeded, but someone else
2633 * deferred their drop - now we have
2634 * to do it for them.
2635 */
2636 kqunlock(kq);
2637 knote_drop(kn, p);
2638 goto out;
2639 }
2640
2641 /*
2642 * If the attach routine indicated that an
2643 * event is already fired, activate the knote.
2644 */
2645 if (result)
2646 knote_activate(kn);
2647
2648 } else {
2649 proc_fdunlock(p);
2650 error = ENOENT;
2651 goto out;
2652 }
2653
2654 } else {
2655 /* existing knote - get kqueue lock */
2656 kqlock(kq);
2657 proc_fdunlock(p);
2658
2659 if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
2660 /*
2661 * The knote is not in a stable state, wait for that
2662 * transition to complete and then redrive the lookup.
2663 */
2664 kn->kn_status |= KN_USEWAIT;
2665 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
2666 CAST_EVENT64_T(&kn->kn_status),
2667 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
2668 kqunlock(kq);
2669 thread_block(THREAD_CONTINUE_NULL);
2670 goto restart;
2671 }
2672
2673 if (kev->flags & EV_DELETE) {
2674
2675 /*
2676 * If attempting to delete a disabled dispatch2 knote,
2677 * we must wait for the knote to be re-enabled (unless
2678 * it is being re-enabled atomically here).
2679 */
2680 if ((kev->flags & EV_ENABLE) == 0 &&
2681 (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
2682 (KN_DISPATCH2 | KN_DISABLED)) {
2683 kn->kn_status |= KN_DEFERDELETE;
2684 kqunlock(kq);
2685 error = EINPROGRESS;
2686 } else if (kqlock2knotedrop(kq, kn)) {
2687 knote_drop(kn, p);
2688 } else {
2689 /*
2690 * The kqueue is unlocked, it's not being
2691 * dropped, and kqlock2knotedrop returned 0:
2692 * this means that someone stole the drop of
2693 * the knote from us.
2694 */
2695 error = EINPROGRESS;
2696 }
2697 goto out;
2698 }
2699
2700 /*
2701 * If we are re-enabling a deferred-delete knote,
2702 * just enable it now and avoid calling the
2703 * filter touch routine (it has delivered its
2704 * last event already).
2705 */
2706 if ((kev->flags & EV_ENABLE) &&
2707 (kn->kn_status & KN_DEFERDELETE)) {
2708 assert(kn->kn_status & KN_DISABLED);
2709 knote_activate(kn);
2710 knote_enable(kn);
2711 kqunlock(kq);
2712 goto out;
2713 }
2714
2715 /*
2716 * If we are disabling, do it before unlocking and
2717 * calling the touch routine (so no processing can
2718 * see the new kevent state before the disable is
2719 * applied).
2720 */
2721 if (kev->flags & EV_DISABLE)
2722 knote_disable(kn);
2723
2724 /*
2725 * Convert the kqlock to a use reference on the
2726 * knote so we can call the filter touch routine.
2727 */
2728 if (kqlock2knoteuse(kq, kn)) {
2729
2730 /*
2731 * Call touch routine to notify filter of changes
2732 * in filter values (and to re-determine if any
2733 * events are fired).
2734 */
2735 result = knote_fops(kn)->f_touch(kn, kev);
2736
2737 /* Get the kq lock back (don't defer droppers). */
2738 if (!knoteuse2kqlock(kq, kn, 0)) {
2739 kqunlock(kq);
2740 goto out;
2741 }
2742
2743 /* Activate it if the touch routine said to */
2744 if (result)
2745 knote_activate(kn);
2746 }
2747
2748 /* Enable the knote if called for */
2749 if (kev->flags & EV_ENABLE)
2750 knote_enable(kn);
2751
2752 }
2753
2754 /* still have kqlock held and knote is valid */
2755 kqunlock(kq);
2756
2757 out:
2758 /* output local errors through the kevent */
2759 if (error) {
2760 kev->flags |= EV_ERROR;
2761 kev->data = error;
2762 }
2763 }
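/*
 * Illustrative sketch (comment only) of the flag simplification done in
 * kevent_register: EV_DELETE overrules EV_ADD and EV_DISABLE overrules
 * EV_ENABLE, and deleting a knote that was never registered surfaces
 * ENOENT (as an EV_ERROR record when event slots were supplied, or as the
 * syscall error otherwise). kq, fd and zero_ts (a zeroed timespec) are
 * hypothetical.
 *
 *	struct kevent chg, res;
 *
 *	// behaves as a pure delete; the EV_ADD bit is stripped first
 *	EV_SET(&chg, fd, EVFILT_READ, EV_ADD | EV_DELETE, 0, 0, NULL);
 *	int n = kevent(kq, &chg, 1, &res, 1, &zero_ts);
 *	if (n == 1 && (res.flags & EV_ERROR) && res.data == ENOENT)
 *		;	// nothing was registered for fd/EVFILT_READ
 */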
2764
2765
2766 /*
2767 * knote_process - process a triggered event
2768 *
2769 * Validate that it is really still a triggered event
2770 * by calling the filter routines (if necessary). Hold
2771 * a use reference on the knote to avoid it being detached.
2772 *
2773 * If it is still considered triggered, we will have taken
2774 * a copy of the state under the filter lock. We use that
2775 * snapshot to dispatch the knote for future processing (or
2776 * not, if this was a lost event).
2777 *
2778 * Our caller assures us that nobody else can be processing
2779 * events from this knote during the whole operation. But
2780 * others can be touching or posting events to the knote
2781 * interspersed with our processing it.
2782 *
2783 * caller holds a reference on the kqueue.
2784 * kqueue locked on entry and exit - but may be dropped
2785 */
2786 static int
2787 knote_process(struct knote *kn,
2788 kevent_callback_t callback,
2789 void *callback_data,
2790 struct filt_process_s *process_data,
2791 struct proc *p)
2792 {
2793 struct kevent_internal_s kev;
2794 struct kqueue *kq = knote_get_kq(kn);
2795 int result = 0;
2796 int error = 0;
2797
2798 bzero(&kev, sizeof(kev));
2799
2800 /*
2801 * Must be active or stayactive
2802 * Must be queued and not disabled/suppressed
2803 */
2804 assert(kn->kn_status & KN_QUEUED);
2805 assert(kn->kn_status & (KN_ACTIVE|KN_STAYACTIVE));
2806 assert(!(kn->kn_status & (KN_DISABLED|KN_SUPPRESSED|KN_DROPPING)));
2807
2808 /*
2809 * For deferred-drop or vanished events, we just create a fake
2810 * event to acknowledge end-of-life. Otherwise, we call the
2811 * filter's process routine to snapshot the kevent state under
2812 * the filter's locking protocol.
2813 */
2814 if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
2815 /* create fake event */
2816 kev.filter = kn->kn_filter;
2817 kev.ident = kn->kn_id;
2818 kev.qos = kn->kn_qos;
2819 kev.flags = (kn->kn_status & KN_DEFERDELETE) ?
2820 EV_DELETE : EV_VANISHED;
2821 kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
2822 kev.udata = kn->kn_udata;
2823 result = 1;
2824
2825 knote_suppress(kn);
2826 } else {
2827
2828 /* deactivate - so new activations indicate a wakeup */
2829 knote_deactivate(kn);
2830
2831 /* suppress knotes to avoid returning the same event multiple times in a single call. */
2832 knote_suppress(kn);
2833
2834 /* convert lock to a knote use reference */
2835 if (!kqlock2knoteuse(kq, kn))
2836 panic("dropping knote found on queue\n");
2837
2838 /* call out to the filter to process with just a ref */
2839 result = knote_fops(kn)->f_process(kn, process_data, &kev);
2840
2841 /*
2842 * convert our reference back to a lock. accept drop
2843 * responsibility from others if we've committed to
2844 * delivering event data.
2845 */
2846 if (!knoteuse2kqlock(kq, kn, result)) {
2847 /* knote dropped */
2848 kn = NULL;
2849 }
2850 }
2851
2852 if (kn != NULL) {
2853 /*
2854 * Determine how to dispatch the knote for future event handling.
2855 * not-fired: just return (do not call out, leave deactivated).
2856 * One-shot: If dispatch2, enter deferred-delete mode (unless this
2857 * is the deferred delete event delivery itself). Otherwise,
2858 * drop it.
2859 * stolendrop: We took responsibility for someone else's drop attempt.
2860 * treat this just like one-shot and prepare to turn it back
2861 * into a deferred delete if required.
2862 * Dispatch: don't clear state, just mark it disabled.
2863 * Cleared: just leave it deactivated.
2864 * Others: re-activate as there may be more events to handle.
2865 * This will not wake up more handlers right now, but
2866 * at the completion of handling events it may trigger
2867 * more handler threads (TODO: optimize based on more than
2868 * just this one event being detected by the filter).
2869 */
2870
2871 if (result == 0)
2872 return (EJUSTRETURN);
2873
2874 if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) {
2875 if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
2876 /* defer dropping non-delete oneshot dispatch2 events */
2877 kn->kn_status |= KN_DEFERDELETE;
2878 knote_disable(kn);
2879
2880 /* if we took over another's drop clear those flags here */
2881 if (kn->kn_status & KN_STOLENDROP) {
2882 assert(kn->kn_status & KN_DROPPING);
2883 /*
2884 * the knote will be dropped when the
2885 * deferred deletion occurs
2886 */
2887 kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP);
2888 }
2889 } else if (kn->kn_status & KN_STOLENDROP) {
2890 /* We now own the drop of the knote. */
2891 assert(kn->kn_status & KN_DROPPING);
2892 knote_unsuppress(kn);
2893 kqunlock(kq);
2894 knote_drop(kn, p);
2895 kqlock(kq);
2896 } else if (kqlock2knotedrop(kq, kn)) {
2897 /* just EV_ONESHOT, _not_ DISPATCH2 */
2898 knote_drop(kn, p);
2899 kqlock(kq);
2900 }
2901 } else if (kn->kn_status & KN_DISPATCH) {
2902 /* disable all dispatch knotes */
2903 knote_disable(kn);
2904 } else if ((kev.flags & EV_CLEAR) == 0) {
2905 /* re-activate in case there are more events */
2906 knote_activate(kn);
2907 }
2908 }
2909
2910 /*
2911 * callback to handle each event as we find it.
2912 * If we have to detach and drop the knote, do
2913 * it while we have the kq unlocked.
2914 */
2915 if (result) {
2916 kqunlock(kq);
2917 error = (callback)(kq, &kev, callback_data);
2918 kqlock(kq);
2919 }
2920 return (error);
2921 }
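/*
 * Illustrative sketch (comment only) of the dispatch case described
 * above: an EV_DISPATCH knote is disabled after delivery and must be
 * explicitly re-enabled by the handler before it can fire again. kq and
 * fd are hypothetical.
 *
 *	struct kevent kev;
 *
 *	// register: delivered once per wakeup, then auto-disabled
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_CLEAR | EV_DISPATCH, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	// ... after the delivered event has been fully handled ...
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ENABLE | EV_DISPATCH, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	// re-arm
 */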
2922
2923
2924 /*
2925 * Return 0 to indicate that processing should proceed,
2926 * -1 if there is nothing to process.
2927 *
2928 * Called with kqueue locked and returns the same way,
2929 * but may drop lock temporarily.
2930 */
2931 static int
2932 kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
2933 {
2934 struct kqrequest *kqr;
2935 thread_t self = current_thread();
2936 __assert_only struct uthread *ut = get_bsdthread_info(self);
2937 thread_t thread;
2938
2939 assert(kqwq->kqwq_state & KQ_WORKQ);
2940 assert(qos_index < KQWQ_NQOS);
2941
2942 kqwq_req_lock(kqwq);
2943 kqr = kqworkq_get_request(kqwq, qos_index);
2944
2945 thread = kqr->kqr_thread;
2946
2947 /* manager skips buckets that haven't asked for its help */
2948 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
2949
2950 /* If nothing for manager to do, just return */
2951 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
2952 assert(kqr->kqr_thread != self);
2953 kqwq_req_unlock(kqwq);
2954 return -1;
2955 }
2956
2957 /* bind manager thread from this time on */
2958 kqworkq_bind_thread(kqwq, qos_index, self, flags);
2959
2960 } else {
2961 /* must have been bound by now */
2962 assert(thread == self);
2963 assert(ut->uu_kqueue_bound == qos_index);
2964 assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags);
2965 }
2966
2967 /* nobody else should still be processing */
2968 assert(kqr->kqr_state & KQWQ_THREQUESTED);
2969 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
2970
2971 /* anything left to process? */
2972 if (kqueue_queue_empty(&kqwq->kqwq_kqueue, qos_index)) {
2973 kqwq_req_unlock(kqwq);
2974 return -1;
2975 }
2976
2977 /* convert to processing mode */
2978 /* reset workq triggers and thread requests - maybe processing */
2979 kqr->kqr_state &= ~(KQWQ_HOOKCALLED | KQWQ_WAKEUP);
2980 kqr->kqr_state |= KQWQ_PROCESSING;
2981 kqwq_req_unlock(kqwq);
2982 return 0;
2983 }
2984
2985 /*
2986 * Return 0 to indicate that processing should proceed,
2987 * -1 if there is nothing to process.
2988 *
2989 * Called with kqueue locked and returns the same way,
2990 * but may drop lock temporarily.
2991 * May block.
2992 */
2993 static int
2994 kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
2995 {
2996 struct kqtailq *suppressq;
2997
2998 if (kq->kq_state & KQ_WORKQ)
2999 return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags);
3000
3001 assert(qos_index == QOS_INDEX_KQFILE);
3002
3003 /* wait to become the exclusive processing thread */
3004 for (;;) {
3005 if (kq->kq_state & KQ_DRAIN)
3006 return -1;
3007
3008 if ((kq->kq_state & KQ_PROCESSING) == 0)
3009 break;
3010
3011 /* if someone else is processing the queue, wait */
3012 kq->kq_state |= KQ_PROCWAIT;
3013 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3014 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
3015 CAST_EVENT64_T(suppressq),
3016 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
3017
3018 kqunlock(kq);
3019 thread_block(THREAD_CONTINUE_NULL);
3020 kqlock(kq);
3021 }
3022
3023 /* Nobody else processing */
3024
3025 /* clear pre-posts and KQ_WAKEUP now, in case we bail early */
3026 waitq_set_clear_preposts(&kq->kq_wqs);
3027 kq->kq_state &= ~KQ_WAKEUP;
3028
3029 /* anything left to process? */
3030 if (kqueue_queue_empty(kq, qos_index))
3031 return -1;
3032
3033 /* convert to processing mode */
3034 kq->kq_state |= KQ_PROCESSING;
3035
3036 return 0;
3037 }
3038
3039 /*
3040 * kqworkq_end_processing - Complete the processing of a workq kqueue
3041 *
3042 * We may have to request new threads.
3043 * This can happen when there are no waiting processing threads and:
3044 * - there were active events we never got to (count > 0)
3045 * - we pended waitq hook callouts during processing
3046 * - we pended wakeups while processing (or unsuppressing)
3047 *
3048 * Called with kqueue lock held.
3049 */
3050 static void
3051 kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
3052 {
3053 #pragma unused(flags)
3054
3055 struct kqueue *kq = &kqwq->kqwq_kqueue;
3056 struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3057
3058 thread_t self = current_thread();
3059 __assert_only struct uthread *ut = get_bsdthread_info(self);
3060 struct knote *kn;
3061 struct kqrequest *kqr;
3062 int queued_events;
3063 uint16_t pended;
3064 thread_t thread;
3065
3066 assert(kqwq->kqwq_state & KQ_WORKQ);
3067 assert(qos_index < KQWQ_NQOS);
3068
3069 /* leave early if we are not even processing */
3070 kqwq_req_lock(kqwq);
3071 kqr = kqworkq_get_request(kqwq, qos_index);
3072 thread = kqr->kqr_thread;
3073
3074 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
3075 assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER);
3076 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
3077
3078 /* if this bucket didn't need manager help, bail */
3079 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
3080 assert(thread != self);
3081 kqwq_req_unlock(kqwq);
3082 return;
3083 }
3084
3085 assert(kqr->kqr_state & KQWQ_THREQUESTED);
3086
3087 /* unbound bucket - see if still needs servicing */
3088 if (thread == THREAD_NULL) {
3089 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
3090 assert(TAILQ_EMPTY(suppressq));
3091 } else {
3092 assert(thread == self);
3093 }
3094
3095 } else {
3096 assert(thread == self);
3097 assert(ut->uu_kqueue_bound == qos_index);
3098 assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0);
3099 }
3100
3101 kqwq_req_unlock(kqwq);
3102
3103 /* Any events queued before we put suppressed ones back? */
3104 queued_events = !kqueue_queue_empty(kq, qos_index);
3105
3106 /*
3107 * Return suppressed knotes to their original state.
3108 * For workq kqueues, suppressed ones that are still
3109 * truly active (not just forced into the queue) will
3110 * set flags we check below to see if anything got
3111 * woken up.
3112 */
3113 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
3114 assert(kn->kn_status & KN_SUPPRESSED);
3115 knote_unsuppress(kn);
3116 }
3117
3118 kqwq_req_lock(kqwq);
3119
3120 /* Determine if wakeup-type events were pended during servicing */
3121 pended = (kqr->kqr_state & (KQWQ_HOOKCALLED | KQWQ_WAKEUP));
3122
3123 /* unbind the thread */
3124 kqworkq_unbind_thread(kqwq, qos_index, self, flags);
3125
3126 /* Indicate that we are done processing */
3127 kqr->kqr_state &= ~(KQWQ_PROCESSING | \
3128 KQWQ_THREQUESTED | KQWQ_THMANAGER);
3129
3130 /*
3131 * request a new thread if events have happened
3132 * (not just putting stay-active events back).
3133 */
3134 if ((queued_events || pended) &&
3135 !kqueue_queue_empty(kq, qos_index)) {
3136 kqworkq_request_thread(kqwq, qos_index);
3137 }
3138
3139 kqwq_req_unlock(kqwq);
3140 }
3141
3142 /*
3143 * Called with kqueue lock held.
3144 */
3145 static void
3146 kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
3147 {
3148 struct knote *kn;
3149 struct kqtailq *suppressq;
3150 int procwait;
3151
3152 if (kq->kq_state & KQ_WORKQ) {
3153 kqworkq_end_processing((struct kqworkq *)kq, qos_index, flags);
3154 return;
3155 }
3156
3157 assert(qos_index == QOS_INDEX_KQFILE);
3158
3159 /*
3160 * Return suppressed knotes to their original state.
3161 * For workq kqueues, suppressed ones that are still
3162 * truly active (not just forced into the queue) will
3163 * set flags we check below to see if anything got
3164 * woken up.
3165 */
3166 suppressq = kqueue_get_suppressed_queue(kq, qos_index);
3167 while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
3168 assert(kn->kn_status & KN_SUPPRESSED);
3169 knote_unsuppress(kn);
3170 }
3171
3172 procwait = (kq->kq_state & KQ_PROCWAIT);
3173 kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
3174
3175 if (procwait) {
3176 /* first wake up any thread already waiting to process */
3177 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
3178 CAST_EVENT64_T(suppressq),
3179 THREAD_AWAKENED,
3180 WAITQ_ALL_PRIORITIES);
3181 }
3182 }
3183
3184 /*
3185 * kevent_qos_internal_bind - bind thread to processing kqueue
3186 *
3187 * Indicates that the provided thread will be responsible for
3188 * servicing the particular QoS class index specified in the
3189 * parameters. Once the binding is done, any overrides that may
3190 * be associated with the corresponding events can be applied.
3191 *
3192 * This should be called as soon as the thread identity is known,
3193 * preferably while still at high priority during creation.
3194 *
3195 * - caller holds a reference on the kqueue.
3196 * - the thread MUST call kevent_qos_internal after being bound
3197 * or the bucket of events may never be delivered.
3198 * - Nothing locked (may take mutex or block).
3199 */
3200
3201 int
3202 kevent_qos_internal_bind(
3203 struct proc *p,
3204 int qos_class,
3205 thread_t thread,
3206 unsigned int flags)
3207 {
3208 struct fileproc *fp = NULL;
3209 struct kqueue *kq = NULL;
3210 struct kqworkq *kqwq;
3211 struct kqrequest *kqr;
3212 struct uthread *ut;
3213 kq_index_t qos_index;
3214 int res = 0;
3215
3216 assert(thread != THREAD_NULL);
3217 assert(flags & KEVENT_FLAG_WORKQ);
3218
3219 if (thread == THREAD_NULL ||
3220 (flags & KEVENT_FLAG_WORKQ) == 0) {
3221 return EINVAL;
3222 }
3223
3224 ut = get_bsdthread_info(thread);
3225
3226 /* find the kqueue */
3227 res = kevent_get_kq(p, -1, flags, &fp, &kq);
3228 assert(fp == NULL);
3229 if (res)
3230 return res;
3231
3232 /* get the qos index we're going to service */
3233 qos_index = qos_index_for_servicer(qos_class, thread, flags);
3234
3235 /* No need to bind the manager thread to any bucket */
3236 if (qos_index == KQWQ_QOS_MANAGER) {
3237 assert(ut->uu_kqueue_bound == 0);
3238 ut->uu_kqueue_bound = qos_index;
3239 ut->uu_kqueue_flags = flags;
3240 return 0;
3241 }
3242
3243 kqlock(kq);
3244 assert(kq->kq_state & KQ_WORKQ);
3245
3246 kqwq = (struct kqworkq *)kq;
3247 kqr = kqworkq_get_request(kqwq, qos_index);
3248
3249 kqwq_req_lock(kqwq);
3250
3251 /*
3252 * A (non-emergency) request should have been made
3253 * and nobody should already be servicing this bucket.
3254 */
3255 assert(kqr->kqr_state & KQWQ_THREQUESTED);
3256 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
3257 assert((kqr->kqr_state & KQWQ_PROCESSING) == 0);
3258
3259 /* Is this an extraneous bind? */
3260 if (thread == kqr->kqr_thread) {
3261 assert(ut->uu_kqueue_bound == qos_index);
3262 goto out;
3263 }
3264
3265 /* nobody else bound and we're not bound elsewhere */
3266 assert(ut->uu_kqueue_bound == 0);
3267 assert(ut->uu_kqueue_flags == 0);
3268 assert(kqr->kqr_thread == THREAD_NULL);
3269
3270 /* Don't bind if there is a conflict */
3271 if (kqr->kqr_thread != THREAD_NULL ||
3272 (kqr->kqr_state & KQWQ_THMANAGER)) {
3273 res = EINPROGRESS;
3274 goto out;
3275 }
3276
3277 /* finally bind the thread */
3278 kqr->kqr_thread = thread;
3279 ut->uu_kqueue_bound = qos_index;
3280 ut->uu_kqueue_flags = flags;
3281
3282 /* add any pending overrides to the thread */
3283 if (kqr->kqr_override_delta) {
3284 thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta);
3285 }
3286
3287 out:
3288 kqwq_req_unlock(kqwq);
3289 kqunlock(kq);
3290
3291 return res;
3292 }
3293
3294 /*
3295 * kevent_qos_internal_unbind - unbind thread from processing kqueue
3296 *
3297 * End processing the per-QoS bucket of events and allow other threads
3298 * to be requested for future servicing.
3299 *
3300 * caller holds a reference on the kqueue.
3301 * thread is the current thread.
3302 */
3303
3304 int
3305 kevent_qos_internal_unbind(
3306 struct proc *p,
3307 int qos_class,
3308 thread_t thread,
3309 unsigned int flags)
3310 {
3311 struct kqueue *kq;
3312 struct uthread *ut;
3313 struct fileproc *fp = NULL;
3314 kq_index_t qos_index;
3315 kq_index_t end_index;
3316 int res;
3317
3318 assert(flags & KEVENT_FLAG_WORKQ);
3319 assert(thread == current_thread());
3320
3321 if (thread == THREAD_NULL ||
3322 (flags & KEVENT_FLAG_WORKQ) == 0)
3323 return EINVAL;
3324
3325 /* get the kq */
3326 res = kevent_get_kq(p, -1, flags, &fp, &kq);
3327 assert(fp == NULL);
3328 if (res)
3329 return res;
3330
3331 assert(kq->kq_state & KQ_WORKQ);
3332
3333 /* get the index we have been servicing */
3334 qos_index = qos_index_for_servicer(qos_class, thread, flags);
3335
3336 ut = get_bsdthread_info(thread);
3337
3338 /* early out if we were already unbound - or never bound */
3339 if (ut->uu_kqueue_bound != qos_index) {
3340 __assert_only struct kqworkq *kqwq = (struct kqworkq *)kq;
3341 __assert_only struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
3342
3343 assert(ut->uu_kqueue_bound == 0);
3344 assert(ut->uu_kqueue_flags == 0);
3345 assert(kqr->kqr_thread != thread);
3346 return EALREADY;
3347 }
3348
3349 /* unbind from all the buckets we might own */
3350 end_index = (qos_index == KQWQ_QOS_MANAGER) ?
3351 0 : qos_index;
3352 kqlock(kq);
3353 do {
3354 kqueue_end_processing(kq, qos_index, flags);
3355 } while (qos_index-- > end_index);
3356 kqunlock(kq);
3357
3358 /* indicate that we are done processing in the uthread */
3359 ut->uu_kqueue_bound = 0;
3360 ut->uu_kqueue_flags = 0;
3361
3362 return 0;
3363 }
3364
3365 /*
3366 * kqueue_process - process the triggered events in a kqueue
3367 *
3368 * Walk the queued knotes and validate that they are
3369 * really still triggered events by calling the filter
3370 * routines (if necessary). Hold a use reference on
3371 * the knote to avoid it being detached. For each event
3372 * that is still considered triggered, invoke the
3373 * callback routine provided.
3374 *
3375 * caller holds a reference on the kqueue.
3376 * kqueue locked on entry and exit - but may be dropped
3377 * kqueue list locked (held for duration of call)
3378 */
3379
3380 static int
3381 kqueue_process(struct kqueue *kq,
3382 kevent_callback_t callback,
3383 void *callback_data,
3384 struct filt_process_s *process_data,
3385 kq_index_t servicer_qos_index,
3386 int *countp,
3387 struct proc *p)
3388 {
3389 unsigned int flags = process_data ? process_data->fp_flags : 0;
3390 kq_index_t start_index, end_index, i;
3391 struct knote *kn;
3392 int nevents = 0;
3393 int error = 0;
3394
3395 /*
3396 * Based on the native QoS of the servicer,
3397 * determine the range of QoSes that need checking
3398 */
3399 start_index = servicer_qos_index;
3400 end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index;
3401
3402 i = start_index;
3403
3404 do {
3405 if (kqueue_begin_processing(kq, i, flags) == -1) {
3406 *countp = 0;
3407 /* Nothing to process */
3408 continue;
3409 }
3410
3411 /*
3412 * loop through the enqueued knotes, processing each one and
3413 * revalidating those that need it. As they are processed,
3414 * they get moved to the inprocess queue (so the loop can end).
3415 */
3416 error = 0;
3417
3418 struct kqtailq *base_queue = kqueue_get_base_queue(kq, i);
3419 struct kqtailq *queue = kqueue_get_high_queue(kq, i);
3420 do {
3421 while (error == 0 &&
3422 (kn = TAILQ_FIRST(queue)) != NULL) {
3423 /* Process the knote */
3424 error = knote_process(kn, callback, callback_data, process_data, p);
3425 if (error == EJUSTRETURN)
3426 error = 0;
3427 else
3428 nevents++;
3429
3430 /* break out if no more space for additional events */
3431 if (error == EWOULDBLOCK) {
3432 if ((kq->kq_state & KQ_WORKQ) == 0)
3433 kqueue_end_processing(kq, i, flags);
3434 error = 0;
3435 goto out;
3436 }
3437 }
3438 } while (error == 0 && queue-- > base_queue);
3439
3440 /* let somebody else process events if we're not in workq mode */
3441 if ((kq->kq_state & KQ_WORKQ) == 0)
3442 kqueue_end_processing(kq, i, flags);
3443
3444 } while (i-- > end_index);
3445
3446 out:
3447 *countp = nevents;
3448 return (error);
3449 }
3450
3451 static void
3452 kqueue_scan_continue(void *data, wait_result_t wait_result)
3453 {
3454 thread_t self = current_thread();
3455 uthread_t ut = (uthread_t)get_bsdthread_info(self);
3456 struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
3457 struct kqueue *kq = (struct kqueue *)data;
3458 struct filt_process_s *process_data = cont_args->process_data;
3459 int error;
3460 int count;
3461
3462 /* convert the (previous) wait_result to a proper error */
3463 switch (wait_result) {
3464 case THREAD_AWAKENED: {
3465 kqlock(kq);
3466 retry:
3467 error = kqueue_process(kq, cont_args->call, cont_args->data,
3468 process_data, cont_args->servicer_qos_index,
3469 &count, current_proc());
3470 if (error == 0 && count == 0) {
3471 if (kq->kq_state & KQ_WAKEUP)
3472 goto retry;
3473 waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
3474 KQ_EVENT, THREAD_ABORTSAFE,
3475 cont_args->deadline);
3476 kq->kq_state |= KQ_SLEEP;
3477 kqunlock(kq);
3478 thread_block_parameter(kqueue_scan_continue, kq);
3479 /* NOTREACHED */
3480 }
3481 kqunlock(kq);
3482 } break;
3483 case THREAD_TIMED_OUT:
3484 error = EWOULDBLOCK;
3485 break;
3486 case THREAD_INTERRUPTED:
3487 error = EINTR;
3488 break;
3489 case THREAD_RESTART:
3490 error = EBADF;
3491 break;
3492 default:
3493 panic("%s: - invalid wait_result (%d)", __func__,
3494 wait_result);
3495 error = 0;
3496 }
3497
3498 /* call the continuation with the results */
3499 assert(cont_args->cont != NULL);
3500 (cont_args->cont)(kq, cont_args->data, error);
3501 }
3502
3503
3504 /*
3505 * kqueue_scan - scan and wait for events in a kqueue
3506 *
3507 * Process the triggered events in a kqueue.
3508 *
3509 * If there are no events triggered, arrange to
3510 * wait for them. If the caller provided a
3511 * continuation routine, then kqueue_scan does not
3512 * return after blocking; the continuation is invoked instead.
3513 *
3514 * The callback routine must be valid.
3515 * The caller must hold a use-count reference on the kq.
3516 */
3517
3518 int
3519 kqueue_scan(struct kqueue *kq,
3520 kevent_callback_t callback,
3521 kqueue_continue_t continuation,
3522 void *callback_data,
3523 struct filt_process_s *process_data,
3524 struct timeval *atvp,
3525 struct proc *p)
3526 {
3527 thread_continue_t cont = THREAD_CONTINUE_NULL;
3528 kq_index_t servicer_qos_index;
3529 unsigned int flags;
3530 uint64_t deadline;
3531 int error;
3532 int first;
3533 int fd;
3534
3535 assert(callback != NULL);
3536
3537 /*
3538 * Determine which QoS index we are servicing
3539 */
3540 flags = (process_data) ? process_data->fp_flags : 0;
3541 fd = (process_data) ? process_data->fp_fd : -1;
3542 servicer_qos_index = (kq->kq_state & KQ_WORKQ) ?
3543 qos_index_for_servicer(fd, current_thread(), flags) :
3544 QOS_INDEX_KQFILE;
3545
3546 first = 1;
3547 for (;;) {
3548 wait_result_t wait_result;
3549 int count;
3550
3551 /*
3552 * Make a pass through the kq to find events already
3553 * triggered.
3554 */
3555 kqlock(kq);
3556 error = kqueue_process(kq, callback, callback_data,
3557 process_data, servicer_qos_index,
3558 &count, p);
3559 if (error || count)
3560 break; /* lock still held */
3561
3562 /* looks like we have to consider blocking */
3563 if (first) {
3564 first = 0;
3565 /* convert the timeout to a deadline once */
3566 if (atvp->tv_sec || atvp->tv_usec) {
3567 uint64_t now;
3568
3569 clock_get_uptime(&now);
3570 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
3571 atvp->tv_usec * (long)NSEC_PER_USEC,
3572 &deadline);
3573 if (now >= deadline) {
3574 /* non-blocking call */
3575 error = EWOULDBLOCK;
3576 break; /* lock still held */
3577 }
3578 deadline -= now;
3579 clock_absolutetime_interval_to_deadline(deadline, &deadline);
3580 } else {
3581 deadline = 0; /* block forever */
3582 }
3583
3584 if (continuation) {
3585 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
3586 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
3587
3588 cont_args->call = callback;
3589 cont_args->cont = continuation;
3590 cont_args->deadline = deadline;
3591 cont_args->data = callback_data;
3592 cont_args->process_data = process_data;
3593 cont_args->servicer_qos_index = servicer_qos_index;
3594 cont = kqueue_scan_continue;
3595 }
3596 }
3597
3598 /* If awakened during processing, try again */
3599 if (kq->kq_state & KQ_WAKEUP) {
3600 kqunlock(kq);
3601 continue;
3602 }
3603
3604 /* go ahead and wait */
3605 waitq_assert_wait64_leeway((struct waitq *)&kq->kq_wqs,
3606 KQ_EVENT, THREAD_ABORTSAFE,
3607 TIMEOUT_URGENCY_USER_NORMAL,
3608 deadline, TIMEOUT_NO_LEEWAY);
3609 kq->kq_state |= KQ_SLEEP;
3610 kqunlock(kq);
3611 wait_result = thread_block_parameter(cont, kq);
3612 /* NOTREACHED if (continuation != NULL) */
3613
3614 switch (wait_result) {
3615 case THREAD_AWAKENED:
3616 continue;
3617 case THREAD_TIMED_OUT:
3618 return EWOULDBLOCK;
3619 case THREAD_INTERRUPTED:
3620 return EINTR;
3621 case THREAD_RESTART:
3622 return EBADF;
3623 default:
3624 panic("%s: - bad wait_result (%d)", __func__,
3625 wait_result);
3626 error = 0;
3627 }
3628 }
3629 kqunlock(kq);
3630 return (error);
3631 }
3632
3633
3634 /*
3635 * XXX
3636 * This could be expanded to call kqueue_scan, if desired.
3637 */
3638 /*ARGSUSED*/
3639 static int
3640 kqueue_read(__unused struct fileproc *fp,
3641 __unused struct uio *uio,
3642 __unused int flags,
3643 __unused vfs_context_t ctx)
3644 {
3645 return (ENXIO);
3646 }
3647
3648 /*ARGSUSED*/
3649 static int
3650 kqueue_write(__unused struct fileproc *fp,
3651 __unused struct uio *uio,
3652 __unused int flags,
3653 __unused vfs_context_t ctx)
3654 {
3655 return (ENXIO);
3656 }
3657
3658 /*ARGSUSED*/
3659 static int
3660 kqueue_ioctl(__unused struct fileproc *fp,
3661 __unused u_long com,
3662 __unused caddr_t data,
3663 __unused vfs_context_t ctx)
3664 {
3665 return (ENOTTY);
3666 }
3667
3668 /*ARGSUSED*/
3669 static int
3670 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
3671 __unused vfs_context_t ctx)
3672 {
3673 struct kqueue *kq = (struct kqueue *)fp->f_data;
3674 struct kqtailq *queue;
3675 struct kqtailq *suppressq;
3676 struct knote *kn;
3677 int retnum = 0;
3678
3679 if (which != FREAD)
3680 return (0);
3681
3682 kqlock(kq);
3683
3684 assert((kq->kq_state & KQ_WORKQ) == 0);
3685
3686 /*
3687 * If this is the first pass, link the wait queue associated with
3688 * the kqueue onto the wait queue set for the select(). Normally we
3689 * use selrecord() for this, but it uses the wait queue within the
3690 * selinfo structure and we need to use the main one for the kqueue to
3691 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
3692 * (The select() call will unlink them when it ends).
3693 */
3694 if (wq_link_id != NULL) {
3695 thread_t cur_act = current_thread();
3696 struct uthread * ut = get_bsdthread_info(cur_act);
3697
3698 kq->kq_state |= KQ_SEL;
3699 waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
3700 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
3701
3702 /* always consume the reserved link object */
3703 waitq_link_release(*(uint64_t *)wq_link_id);
3704 *(uint64_t *)wq_link_id = 0;
3705
3706 /*
3707 * selprocess() is expecting that we send it back the waitq
3708 * that was just added to the thread's waitq set. In order
3709 * to not change the selrecord() API (which is exported to
3710 * kexts), we pass this value back through the
3711 * void *wq_link_id pointer we were passed. We need to use
3712 * memcpy here because the pointer may not be properly aligned
3713 * on 32-bit systems.
3714 */
3715 void *wqptr = &kq->kq_wqs;
3716 memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
3717 }
3718
3719 if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) {
3720 kqunlock(kq);
3721 return (0);
3722 }
3723
3724 queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE);
3725 if (!TAILQ_EMPTY(queue)) {
3726 /*
3727 * there is something queued - but it might be a
3728 * KN_STAYACTIVE knote, which may or may not have
3729 * any events pending. Otherwise, we have to walk
3730 * the list of knotes to see, and peek at the
3731 * (non-vanished) stay-active ones to be really sure.
3732 */
3733 while ((kn = (struct knote *)TAILQ_FIRST(queue)) != NULL) {
3734 if (kn->kn_status & KN_ACTIVE) {
3735 retnum = 1;
3736 goto out;
3737 }
3738 assert(kn->kn_status & KN_STAYACTIVE);
3739 knote_suppress(kn);
3740 }
3741
3742 /*
3743 * There were no regular events on the queue, so take
3744 * a deeper look at the stay-queued ones we suppressed.
3745 */
3746 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
3747 while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
3748 unsigned peek = 1;
3749
3750 /* If it didn't vanish while suppressed - peek at it */
3751 if (kqlock2knoteuse(kq, kn)) {
3752
3753 peek = knote_fops(kn)->f_peek(kn);
3754
3755 /* if it dropped while getting lock - move on */
3756 if (!knoteuse2kqlock(kq, kn, 0))
3757 continue;
3758 }
3759
3760 /* unsuppress it */
3761 knote_unsuppress(kn);
3762
3763 /* has data or it has to report a vanish */
3764 if (peek > 0) {
3765 retnum = 1;
3766 goto out;
3767 }
3768 }
3769 }
3770
3771 out:
3772 kqueue_end_processing(kq, QOS_INDEX_KQFILE, 0);
3773 kqunlock(kq);
3774 return (retnum);
3775 }
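/*
 * Illustrative sketch (comment only): because kqueue_select() reports
 * FREAD readiness when events are pending, a kqueue descriptor can be
 * multiplexed with select(2) alongside ordinary descriptors. kq and tv
 * are hypothetical.
 *
 *	fd_set rfds;
 *
 *	FD_ZERO(&rfds);
 *	FD_SET(kq, &rfds);
 *	if (select(kq + 1, &rfds, NULL, NULL, &tv) > 0 && FD_ISSET(kq, &rfds)) {
 *		// drain with a zero-timeout kevent() call
 *	}
 */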
3776
3777 /*
3778 * kqueue_close -
3779 */
3780 /*ARGSUSED*/
3781 static int
3782 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
3783 {
3784 struct kqfile *kqf = (struct kqfile *)fg->fg_data;
3785
3786 assert((kqf->kqf_state & KQ_WORKQ) == 0);
3787 kqueue_dealloc(&kqf->kqf_kqueue);
3788 fg->fg_data = NULL;
3789 return (0);
3790 }
3791
3792 /*ARGSUSED*/
3793 /*
3794 * The caller has taken a use-count reference on this kqueue and will donate it
3795 * to the kqueue we are being added to. This keeps the kqueue from closing until
3796 * that relationship is torn down.
3797 */
3798 static int
3799 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
3800 {
3801 struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
3802 struct kqueue *kq = &kqf->kqf_kqueue;
3803 struct kqueue *parentkq = knote_get_kq(kn);
3804
3805 assert((kqf->kqf_state & KQ_WORKQ) == 0);
3806
3807 if (parentkq == kq ||
3808 kn->kn_filter != EVFILT_READ) {
3809 kn->kn_flags = EV_ERROR;
3810 kn->kn_data = EINVAL;
3811 return 0;
3812 }
3813
3814 /*
3815 * We have to avoid creating a cycle when nesting kqueues
3816 * inside another. Rather than trying to walk the whole
3817 * potential DAG of nested kqueues, we just use a simple
3818 * ceiling protocol. When a kqueue is inserted into another,
3819 * we check that the (future) parent is not already nested
3820 * into another kqueue at a lower level than the potential
3821 * child (because it could indicate a cycle). If that test
3822 * passes, we just mark the nesting levels accordingly.
3823 */
3824
3825 kqlock(parentkq);
3826 if (parentkq->kq_level > 0 &&
3827 parentkq->kq_level < kq->kq_level)
3828 {
3829 kqunlock(parentkq);
3830 kn->kn_flags = EV_ERROR;
3831 kn->kn_data = EINVAL;
3832 return 0;
3833 } else {
3834 /* set parent level appropriately */
3835 if (parentkq->kq_level == 0)
3836 parentkq->kq_level = 2;
3837 if (parentkq->kq_level < kq->kq_level + 1)
3838 parentkq->kq_level = kq->kq_level + 1;
3839 kqunlock(parentkq);
3840
3841 kn->kn_filtid = EVFILTID_KQREAD;
3842 kqlock(kq);
3843 KNOTE_ATTACH(&kqf->kqf_sel.si_note, kn);
3844 /* indicate nesting in child, if needed */
3845 if (kq->kq_level == 0)
3846 kq->kq_level = 1;
3847
3848 int count = kq->kq_count;
3849 kqunlock(kq);
3850 return (count > 0);
3851 }
3852 }
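/*
 * Illustrative sketch (comment only) of the nesting this filter enables:
 * registering a child kqueue with EVFILT_READ on a parent kqueue makes
 * the parent report readability whenever the child has pending events.
 *
 *	int parent = kqueue();
 *	int child  = kqueue();
 *	struct kevent kev;
 *
 *	EV_SET(&kev, child, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(parent, &kev, 1, NULL, 0, NULL);
 *	// a later kevent(parent, ...) wait fires when the child has events queued
 */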
3853
3854 /*
3855 * kqueue_drain - called when kq is closed
3856 */
3857 /*ARGSUSED*/
3858 static int
3859 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
3860 {
3861 struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
3862
3863 assert((kq->kq_state & KQ_WORKQ) == 0);
3864
3865 kqlock(kq);
3866 kq->kq_state |= KQ_DRAIN;
3867 kqueue_interrupt(kq);
3868 kqunlock(kq);
3869 return (0);
3870 }
3871
3872 /*ARGSUSED*/
3873 int
3874 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
3875 {
3876 assert((kq->kq_state & KQ_WORKQ) == 0);
3877
3878 kqlock(kq);
3879 if (isstat64 != 0) {
3880 struct stat64 *sb64 = (struct stat64 *)ub;
3881
3882 bzero((void *)sb64, sizeof(*sb64));
3883 sb64->st_size = kq->kq_count;
3884 if (kq->kq_state & KQ_KEV_QOS)
3885 sb64->st_blksize = sizeof(struct kevent_qos_s);
3886 else if (kq->kq_state & KQ_KEV64)
3887 sb64->st_blksize = sizeof(struct kevent64_s);
3888 else if (IS_64BIT_PROCESS(p))
3889 sb64->st_blksize = sizeof(struct user64_kevent);
3890 else
3891 sb64->st_blksize = sizeof(struct user32_kevent);
3892 sb64->st_mode = S_IFIFO;
3893 } else {
3894 struct stat *sb = (struct stat *)ub;
3895
3896 bzero((void *)sb, sizeof(*sb));
3897 sb->st_size = kq->kq_count;
3898 if (kq->kq_state & KQ_KEV_QOS)
3899 sb->st_blksize = sizeof(struct kevent_qos_s);
3900 else if (kq->kq_state & KQ_KEV64)
3901 sb->st_blksize = sizeof(struct kevent64_s);
3902 else if (IS_64BIT_PROCESS(p))
3903 sb->st_blksize = sizeof(struct user64_kevent);
3904 else
3905 sb->st_blksize = sizeof(struct user32_kevent);
3906 sb->st_mode = S_IFIFO;
3907 }
3908 kqunlock(kq);
3909 return (0);
3910 }
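/*
 * Minimal userspace sketch (illustrative only): because kqueue_stat() backs
 * fstat() on a kqueue descriptor, the count of pending events and the size
 * of the kevent structure currently in use come back as st_size and
 * st_blksize respectively.
 *
 *	struct stat sb;
 *	int kq = kqueue();
 *	if (fstat(kq, &sb) == 0)
 *		printf("pending=%lld blksize=%d\n",
 *		    (long long)sb.st_size, (int)sb.st_blksize);
 */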
3911
3912
3913 /*
3914 * Interact with the pthread kext to request a servicing there.
3915 * Eventually, this will request threads at specific QoS levels.
3916 * For now, it only requests a dispatch-manager-QoS thread, and
3917 * only one-at-a-time.
3918 *
3919 * - Caller holds the workq request lock
3920 *
3921 * - May be called with the kqueue's wait queue set locked,
3922 * so cannot do anything that could recurse on that.
3923 */
3924 static void
3925 kqworkq_request_thread(
3926 struct kqworkq *kqwq,
3927 kq_index_t qos_index)
3928 {
3929 struct kqrequest *kqr;
3930
3931 assert(kqwq->kqwq_state & KQ_WORKQ);
3932 assert(qos_index < KQWQ_NQOS);
3933
3934 kqr = kqworkq_get_request(kqwq, qos_index);
3935
3936 /*
3937 * If we have already requested a thread, and it hasn't
3938 * started processing yet, there's no use hammering away
3939 * on the pthread kext.
3940 */
3941 if (kqr->kqr_state & KQWQ_THREQUESTED)
3942 return;
3943
3944 assert(kqr->kqr_thread == THREAD_NULL);
3945
3946 /* request additional workq threads if appropriate */
3947 if (pthread_functions != NULL &&
3948 pthread_functions->workq_reqthreads != NULL) {
3949 unsigned int flags = KEVENT_FLAG_WORKQ;
3950
3951 /* Compute a priority based on qos_index. */
3952 struct workq_reqthreads_req_s request = {
3953 .priority = qos_from_qos_index(qos_index),
3954 .count = 1
3955 };
3956
3957 thread_t wqthread;
3958 wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request);
3959 kqr->kqr_state |= KQWQ_THREQUESTED;
3960
3961 /* Have we been switched to the emergency/manager thread? */
3962 if (wqthread == (thread_t)-1) {
3963 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3964 wqthread = THREAD_NULL;
3965 } else if (qos_index == KQWQ_QOS_MANAGER)
3966 flags |= KEVENT_FLAG_WORKQ_MANAGER;
3967
3968 /* bind the thread */
3969 kqworkq_bind_thread(kqwq, qos_index, wqthread, flags);
3970 }
3971 }
3972
3973 /*
3974 * If we aren't already busy processing events [for this QoS],
3975 * request workq thread support as appropriate.
3976 *
3977 * TBD - for now, we don't segregate out processing by QoS.
3978 *
3979 * - May be called with the kqueue's wait queue set locked,
3980 * so cannot do anything that could recurse on that.
3981 */
3982 static void
3983 kqworkq_request_help(
3984 struct kqworkq *kqwq,
3985 kq_index_t qos_index,
3986 uint32_t type)
3987 {
3988 struct kqrequest *kqr;
3989
3990 /* convert to thread qos value */
3991 assert(qos_index < KQWQ_NQOS);
3992
3993 kqwq_req_lock(kqwq);
3994 kqr = kqworkq_get_request(kqwq, qos_index);
3995
3996 /*
3997 * If someone is processing the queue, just mark what type
3998 * of attempt this was (from a kq wakeup or from a waitq hook).
3999 * They'll be noticed at the end of servicing and a new thread
4000 * will be requested at that point.
4001 */
4002 if (kqr->kqr_state & KQWQ_PROCESSING) {
4003 kqr->kqr_state |= type;
4004 kqwq_req_unlock(kqwq);
4005 return;
4006 }
4007
4008 kqworkq_request_thread(kqwq, qos_index);
4009 kqwq_req_unlock(kqwq);
4010 }
4011
4012 /*
4013 * These arrays describe the low and high qindexes for a given qos_index.
4014 * The values come from the chart in <sys/eventvar.h> (must stay in sync).
4015 */
4016 static kq_index_t _kq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21};
4017 static kq_index_t _kq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21};
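/*
 * Worked example from the tables above: for qos_index 3 the base queue is
 * kq_queue[11] and the highest allowed slot is kq_queue[14].  A knote at
 * qos_index 3 whose QoS override index is 5 is therefore queued at
 * 11 + (5 - 3) = 13, which still falls within the [11, 14] range that
 * knote_get_queue_index() asserts below.
 */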
4018
4019 static struct kqtailq *
4020 kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index)
4021 {
4022 assert(qos_index < KQWQ_NQOS);
4023 return &kq->kq_queue[_kq_base_index[qos_index]];
4024 }
4025
4026 static struct kqtailq *
4027 kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index)
4028 {
4029 assert(qos_index < KQWQ_NQOS);
4030 return &kq->kq_queue[_kq_high_index[qos_index]];
4031 }
4032
4033 static int
4034 kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
4035 {
4036 struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index);
4037 struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index);
4038
4039 do {
4040 if (!TAILQ_EMPTY(queue))
4041 return 0;
4042 } while (queue-- > base_queue);
4043 return 1;
4044 }
4045
4046 static struct kqtailq *
4047 kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index)
4048 {
4049 if (kq->kq_state & KQ_WORKQ) {
4050 struct kqworkq *kqwq = (struct kqworkq *)kq;
4051 struct kqrequest *kqr;
4052
4053 kqr = kqworkq_get_request(kqwq, qos_index);
4054 return &kqr->kqr_suppressed;
4055 } else {
4056 struct kqfile *kqf = (struct kqfile *)kq;
4057 return &kqf->kqf_suppressed;
4058 }
4059 }
4060
4061 static kq_index_t
4062 knote_get_queue_index(struct knote *kn)
4063 {
4064 kq_index_t override_index = knote_get_qos_override_index(kn);
4065 kq_index_t qos_index = knote_get_qos_index(kn);
4066 struct kqueue *kq = knote_get_kq(kn);
4067 kq_index_t res;
4068
4069 if ((kq->kq_state & KQ_WORKQ) == 0) {
4070 assert(qos_index == 0);
4071 assert(override_index == 0);
4072 }
4073 res = _kq_base_index[qos_index];
4074 if (override_index > qos_index)
4075 res += override_index - qos_index;
4076
4077 assert(res <= _kq_high_index[qos_index]);
4078 return res;
4079 }
4080
4081 static struct kqtailq *
4082 knote_get_queue(struct knote *kn)
4083 {
4084 kq_index_t qindex = knote_get_queue_index(kn);
4085
4086 return &(knote_get_kq(kn))->kq_queue[qindex];
4087 }
4088
4089 static struct kqtailq *
4090 knote_get_suppressed_queue(struct knote *kn)
4091 {
4092 kq_index_t qos_index = knote_get_qos_index(kn);
4093 struct kqueue *kq = knote_get_kq(kn);
4094
4095 return kqueue_get_suppressed_queue(kq, qos_index);
4096 }
4097
4098 static kq_index_t
4099 knote_get_req_index(struct knote *kn)
4100 {
4101 return kn->kn_req_index;
4102 }
4103
4104 static kq_index_t
4105 knote_get_qos_index(struct knote *kn)
4106 {
4107 return kn->kn_qos_index;
4108 }
4109
4110 static void
4111 knote_set_qos_index(struct knote *kn, kq_index_t qos_index)
4112 {
4113 struct kqueue *kq = knote_get_kq(kn);
4114
4115 assert(qos_index < KQWQ_NQOS);
4116 assert((kn->kn_status & KN_QUEUED) == 0);
4117
4118 if (kq->kq_state & KQ_WORKQ)
4119 assert(qos_index > QOS_INDEX_KQFILE);
4120 else
4121 assert(qos_index == QOS_INDEX_KQFILE);
4122
4123 /* always set requested */
4124 kn->kn_req_index = qos_index;
4125
4126 /* only adjust in-use qos index when not suppressed */
4127 if ((kn->kn_status & KN_SUPPRESSED) == 0)
4128 kn->kn_qos_index = qos_index;
4129 }
4130
4131 static kq_index_t
4132 knote_get_qos_override_index(struct knote *kn)
4133 {
4134 return kn->kn_qos_override;
4135 }
4136
4137 static void
4138 knote_set_qos_override_index(struct knote *kn, kq_index_t override_index)
4139 {
4140 struct kqueue *kq = knote_get_kq(kn);
4141 kq_index_t qos_index = knote_get_qos_index(kn);
4142
4143 assert((kn->kn_status & KN_QUEUED) == 0);
4144
4145 if (override_index == KQWQ_QOS_MANAGER)
4146 assert(qos_index == KQWQ_QOS_MANAGER);
4147 else
4148 assert(override_index < KQWQ_QOS_MANAGER);
4149
4150 kn->kn_qos_override = override_index;
4151
4152 /*
4153 * If this is a workq kqueue, apply the override to the
4154 * workq servicing thread.
4155 */
4156 if (kq->kq_state & KQ_WORKQ) {
4157 struct kqworkq *kqwq = (struct kqworkq *)kq;
4158
4159 assert(qos_index > QOS_INDEX_KQFILE);
4160 kqworkq_update_override(kqwq, qos_index, override_index);
4161 }
4162 }
4163
4164 static void
4165 kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index)
4166 {
4167 struct kqrequest *kqr;
4168 kq_index_t new_delta;
4169 kq_index_t old_delta;
4170
4171 new_delta = (override_index > qos_index) ?
4172 override_index - qos_index : 0;
4173
4174 kqr = kqworkq_get_request(kqwq, qos_index);
4175
4176 kqwq_req_lock(kqwq);
4177 old_delta = kqr->kqr_override_delta;
4178
4179 if (new_delta > old_delta) {
4180 thread_t wqthread = kqr->kqr_thread;
4181
4182 /* store the new override delta */
4183 kqr->kqr_override_delta = new_delta;
4184
4185 /* apply the override to the servicing thread, if one is bound */
4186 if (wqthread) {
4187 /* only apply if non-manager */
4188 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4189 if (old_delta)
4190 thread_update_ipc_override(wqthread, override_index);
4191 else
4192 thread_add_ipc_override(wqthread, override_index);
4193 }
4194 }
4195 }
4196 kqwq_req_unlock(kqwq);
4197 }
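/*
 * Worked example of the delta bookkeeping above: a knote requested at
 * qos_index 3 whose override index rises to 5 yields new_delta = 2.  If that
 * exceeds the stored kqr_override_delta, the new delta is recorded and, when
 * a non-manager servicing thread is bound, an IPC override at index 5 is
 * added (or updated, if a smaller delta had already installed one).
 */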
4198
4199 /* called with the kqworkq lock held */
4200 static void
4201 kqworkq_bind_thread(
4202 struct kqworkq *kqwq,
4203 kq_index_t qos_index,
4204 thread_t thread,
4205 unsigned int flags)
4206 {
4207 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
4208 thread_t old_thread = kqr->kqr_thread;
4209 struct uthread *ut;
4210
4211 assert(kqr->kqr_state & KQWQ_THREQUESTED);
4212
4213 /* If no identity yet, just set flags as needed */
4214 if (thread == THREAD_NULL) {
4215 assert(old_thread == THREAD_NULL);
4216
4217 /* emergency or unidentified */
4218 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
4219 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
4220 kqr->kqr_state |= KQWQ_THMANAGER;
4221 }
4222 return;
4223 }
4224
4225 /* Known thread identity */
4226 ut = get_bsdthread_info(thread);
4227
4228 /*
4229 * If this is a manager, and the manager request bit is
4230 * not set, ensure no other thread is bound. If the bit
4231 * is set, make sure the old thread is us (or not set).
4232 */
4233 if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
4234 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4235 assert(old_thread == THREAD_NULL);
4236 kqr->kqr_state |= KQWQ_THMANAGER;
4237 } else if (old_thread == THREAD_NULL) {
4238 kqr->kqr_thread = thread;
4239 ut->uu_kqueue_bound = KQWQ_QOS_MANAGER;
4240 ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ |
4241 KEVENT_FLAG_WORKQ_MANAGER);
4242 } else {
4243 assert(thread == old_thread);
4244 assert(ut->uu_kqueue_bound == KQWQ_QOS_MANAGER);
4245 assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
4246 }
4247 return;
4248 }
4249
4250 /* Just a normal one-queue servicing thread */
4251 assert(old_thread == THREAD_NULL);
4252 assert((kqr->kqr_state & KQWQ_THMANAGER) == 0);
4253
4254 kqr->kqr_thread = thread;
4255
4256 /* apply an ipc QoS override if one is needed */
4257 if (kqr->kqr_override_delta)
4258 thread_add_ipc_override(thread, qos_index + kqr->kqr_override_delta);
4259
4260 /* indicate that we are processing in the uthread */
4261 ut->uu_kqueue_bound = qos_index;
4262 ut->uu_kqueue_flags = flags;
4263 }
4264
4265 /* called with the kqworkq lock held */
4266 static void
4267 kqworkq_unbind_thread(
4268 struct kqworkq *kqwq,
4269 kq_index_t qos_index,
4270 thread_t thread,
4271 __unused unsigned int flags)
4272 {
4273 struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
4274 kq_index_t override = 0;
4275
4276 assert(thread == current_thread());
4277
4278 /*
4279 * If there is an override, drop it from the current thread
4280 * and then we are free to recompute (a potentially lower)
4281 * minimum override to apply to the next thread request.
4282 */
4283 if (kqr->kqr_override_delta) {
4284 struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index);
4285 struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index);
4286
4287 /* if not bound to a manager thread, drop the current ipc override */
4288 if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
4289 assert(thread == kqr->kqr_thread);
4290 thread_drop_ipc_override(thread);
4291 }
4292
4293 /* recompute the new override */
4294 do {
4295 if (!TAILQ_EMPTY(queue)) {
4296 override = queue - base_queue;
4297 break;
4298 }
4299 } while (queue-- > base_queue);
4300 }
4301
4302 /* unbind the thread and apply the new override */
4303 kqr->kqr_thread = THREAD_NULL;
4304 kqr->kqr_override_delta = override;
4305 }
4306
4307 struct kqrequest *
4308 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
4309 {
4310 assert(qos_index < KQWQ_NQOS);
4311 return &kqwq->kqwq_request[qos_index];
4312 }
4313
4314 void
4315 knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override)
4316 {
4317 if (knote_get_kq(kn)->kq_state & KQ_WORKQ) {
4318 kq_index_t new_qos_index;
4319 kq_index_t new_override_index;
4320 kq_index_t servicer_qos_index;
4321
4322 new_qos_index = qos_index_from_qos(new_qos, FALSE);
4323 new_override_index = qos_index_from_qos(new_override, TRUE);
4324
4325 /* make sure the servicer qos acts as a floor */
4326 servicer_qos_index = qos_index_from_qos(kn->kn_qos, FALSE);
4327 if (servicer_qos_index > new_qos_index)
4328 new_qos_index = servicer_qos_index;
4329 if (servicer_qos_index > new_override_index)
4330 new_override_index = servicer_qos_index;
4331
4332 kqlock(knote_get_kq(kn));
4333 if (new_qos_index != knote_get_req_index(kn) ||
4334 new_override_index != knote_get_qos_override_index(kn)) {
4335 if (kn->kn_status & KN_QUEUED) {
4336 knote_dequeue(kn);
4337 knote_set_qos_index(kn, new_qos_index);
4338 knote_set_qos_override_index(kn, new_override_index);
4339 knote_enqueue(kn);
4340 knote_wakeup(kn);
4341 } else {
4342 knote_set_qos_index(kn, new_qos_index);
4343 knote_set_qos_override_index(kn, new_override_index);
4344 }
4345 }
4346 kqunlock(knote_get_kq(kn));
4347 }
4348 }
4349
4350 static void
4351 knote_wakeup(struct knote *kn)
4352 {
4353 struct kqueue *kq = knote_get_kq(kn);
4354
4355 if (kq->kq_state & KQ_WORKQ) {
4356 /* request a servicing thread */
4357 struct kqworkq *kqwq = (struct kqworkq *)kq;
4358 kq_index_t qos_index = knote_get_qos_index(kn);
4359
4360 kqworkq_request_help(kqwq, qos_index, KQWQ_WAKEUP);
4361
4362 } else {
4363 struct kqfile *kqf = (struct kqfile *)kq;
4364
4365 /* flag wakeups during processing */
4366 if (kq->kq_state & KQ_PROCESSING)
4367 kq->kq_state |= KQ_WAKEUP;
4368
4369 /* wakeup a thread waiting on this queue */
4370 if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
4371 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
4372 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4373 KQ_EVENT,
4374 THREAD_AWAKENED,
4375 WAITQ_ALL_PRIORITIES);
4376 }
4377
4378 /* wakeup other kqueues/select sets we're inside */
4379 KNOTE(&kqf->kqf_sel.si_note, 0);
4380 }
4381 }
4382
4383 /*
4384 * Called with the kqueue locked
4385 */
4386 static void
4387 kqueue_interrupt(struct kqueue *kq)
4388 {
4389 assert((kq->kq_state & KQ_WORKQ) == 0);
4390
4391 /* wakeup sleeping threads */
4392 if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0) {
4393 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
4394 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4395 KQ_EVENT,
4396 THREAD_RESTART,
4397 WAITQ_ALL_PRIORITIES);
4398 }
4399
4400 /* wakeup threads waiting their turn to process */
4401 if (kq->kq_state & KQ_PROCWAIT) {
4402 struct kqtailq *suppressq;
4403
4404 assert(kq->kq_state & KQ_PROCESSING);
4405
4406 kq->kq_state &= ~KQ_PROCWAIT;
4407 suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
4408 (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4409 CAST_EVENT64_T(suppressq),
4410 THREAD_RESTART,
4411 WAITQ_ALL_PRIORITIES);
4412 }
4413 }
4414
4415 /*
4416 * Called back from waitq code when no threads are waiting and the hook was set.
4417 *
4418 * Interrupts are likely disabled and spin locks are held - minimal work
4419 * can be done in this context!!!
4420 *
4421 * JMM - in the future, this will try to determine which knotes match the
4422 * wait queue wakeup and apply these wakeups against those knotes themselves.
4423 * For now, all the events dispatched this way are dispatch-manager handled,
4424 * so hard-code that here.
4425 */
4426 void
4427 waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
4428 {
4429 #pragma unused(knote_hook, qos)
4430
4431 struct kqworkq *kqwq = (struct kqworkq *)kq_hook;
4432
4433 assert(kqwq->kqwq_state & KQ_WORKQ);
4434 kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER, KQWQ_HOOKCALLED);
4435 }
4436
4437 void
4438 klist_init(struct klist *list)
4439 {
4440 SLIST_INIT(list);
4441 }
4442
4443
4444 /*
4445 * Query/Post each knote in the object's list
4446 *
4447 * The object lock protects the list. It is assumed
4448 * that the filter/event routine for the object can
4449 * determine that the object is already locked (via
4450 * the hint) and not deadlock itself.
4451 *
4452 * The object lock should also hold off pending
4453 * detach/drop operations. But we'll prevent them here
4454 * too (by taking a use reference) - just in case.
4455 */
4456 void
4457 knote(struct klist *list, long hint)
4458 {
4459 struct knote *kn;
4460
4461 SLIST_FOREACH(kn, list, kn_selnext) {
4462 struct kqueue *kq = knote_get_kq(kn);
4463
4464 kqlock(kq);
4465
4466 /* If we can get a use reference - deliver event */
4467 if (kqlock2knoteuse(kq, kn)) {
4468 int result;
4469
4470 /* call the event with only a use count */
4471 result = knote_fops(kn)->f_event(kn, hint);
4472
4473 /* if it's not going away and triggered */
4474 if (knoteuse2kqlock(kq, kn, 0) && result)
4475 knote_activate(kn);
4476 /* kq lock held */
4477 }
4478 kqunlock(kq);
4479 }
4480 }
4481
4482 /*
4483 * attach a knote to the specified list. Return true if this is the first entry.
4484 * The list is protected by whatever lock the object it is associated with uses.
4485 */
4486 int
4487 knote_attach(struct klist *list, struct knote *kn)
4488 {
4489 int ret = SLIST_EMPTY(list);
4490 SLIST_INSERT_HEAD(list, kn, kn_selnext);
4491 return (ret);
4492 }
4493
4494 /*
4495 * detach a knote from the specified list. Return true if that was the last entry.
4496 * The list is protected by whatever lock the object it is associated with uses.
4497 */
4498 int
4499 knote_detach(struct klist *list, struct knote *kn)
4500 {
4501 SLIST_REMOVE(list, kn, knote, kn_selnext);
4502 return (SLIST_EMPTY(list));
4503 }
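/*
 * Illustrative sketch of the attach/detach/post pattern (my_notes, kn and
 * hint are hypothetical, and the object-lock discipline described above is
 * assumed): an event source embeds a struct klist, attaches and detaches
 * knotes from its filter's f_attach/f_detach routines, and posts events
 * with KNOTE() whenever its state changes.
 *
 *	struct klist my_notes;				// embedded in the source object
 *
 *	klist_init(&my_notes);				// at source init time
 *	int first = knote_attach(&my_notes, kn);	// in f_attach, object locked
 *	...
 *	KNOTE(&my_notes, hint);				// on state change, object locked
 *	...
 *	int last = knote_detach(&my_notes, kn);		// in f_detach, object locked
 */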
4504
4505 /*
4506 * knote_vanish - Indicate that the source has vanished
4507 *
4508 * If the knote has requested EV_VANISHED delivery,
4509 * arrange for that. Otherwise, deliver a NOTE_REVOKE
4510 * event for backward compatibility.
4511 *
4512 * The knote is marked as having vanished, but is not
4513 * actually detached from the source in this instance.
4514 * The actual detach is deferred until the knote drop.
4515 *
4516 * Our caller already has the object lock held. Calling
4517 * the detach routine would try to take that lock
4518 * recursively - which likely is not supported.
4519 */
4520 void
4521 knote_vanish(struct klist *list)
4522 {
4523 struct knote *kn;
4524 struct knote *kn_next;
4525
4526 SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
4527 struct kqueue *kq = knote_get_kq(kn);
4528 int result;
4529
4530 kqlock(kq);
4531 if ((kn->kn_status & KN_DROPPING) == 0) {
4532
4533 /* If EV_VANISH supported - prepare to deliver one */
4534 if (kn->kn_status & KN_REQVANISH) {
4535 kn->kn_status |= KN_VANISHED;
4536 knote_activate(kn);
4537
4538 } else if (kqlock2knoteuse(kq, kn)) {
4539 /* call the event with only a use count */
4540 result = knote_fops(kn)->f_event(kn, NOTE_REVOKE);
4541
4542 /* if it's not going away and triggered */
4543 if (knoteuse2kqlock(kq, kn, 0) && result)
4544 knote_activate(kn);
4545 /* lock held again */
4546 }
4547 }
4548 kqunlock(kq);
4549 }
4550 }
4551
4552 /*
4553 * For a given knote, link a provided wait queue directly with the kqueue.
4554 * Wakeups will happen via recursive wait queue support. But nothing will move
4555 * the knote to the active list at wakeup (nothing calls knote()). Instead,
4556 * we permanently enqueue the knote here (by marking it stay-active).
4557 *
4558 * kqueue and knote references are held by caller.
4559 * waitq locked by caller.
4560 *
4561 * caller provides the wait queue link structure.
4562 */
4563 int
4564 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
4565 {
4566 struct kqueue *kq = knote_get_kq(kn);
4567 kern_return_t kr;
4568
4569 kr = waitq_link(wq, &kq->kq_wqs, WAITQ_ALREADY_LOCKED, reserved_link);
4570 if (kr == KERN_SUCCESS) {
4571 knote_markstayactive(kn);
4572 return (0);
4573 } else {
4574 return (EINVAL);
4575 }
4576 }
4577
4578 /*
4579 * Unlink the provided wait queue from the kqueue associated with a knote.
4580 * Also remove it from the magic list of directly attached knotes.
4581 *
4582 * Note that the unlink may have already happened from the other side, so
4583 * ignore any failures to unlink and just remove it from the kqueue list.
4584 *
4585 * On success, caller is responsible for the link structure
4586 */
4587 int
4588 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
4589 {
4590 struct kqueue *kq = knote_get_kq(kn);
4591 kern_return_t kr;
4592
4593 kr = waitq_unlink(wq, &kq->kq_wqs);
4594 knote_clearstayactive(kn);
4595 return ((kr != KERN_SUCCESS) ? EINVAL : 0);
4596 }
4597
4598 /*
4599 * remove all knotes referencing a specified fd
4600 *
4601 * Essentially an inlined knote_remove & knote_drop
4602 * when we know for sure that the underlying object is a file descriptor
4603 *
4604 * Entered with the proc_fd lock already held.
4605 * It returns the same way, but may drop it temporarily.
4606 */
4607 void
4608 knote_fdclose(struct proc *p, int fd, int force)
4609 {
4610 struct klist *list;
4611 struct knote *kn;
4612
4613 restart:
4614 list = &p->p_fd->fd_knlist[fd];
4615 SLIST_FOREACH(kn, list, kn_link) {
4616 struct kqueue *kq = knote_get_kq(kn);
4617
4618 kqlock(kq);
4619
4620 if (kq->kq_p != p)
4621 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
4622 __func__, kq->kq_p, p);
4623
4624 /*
4625 * If the knote supports EV_VANISHED delivery,
4626 * transition it to vanished mode (or skip over
4627 * it if already vanished).
4628 */
4629 if (!force && (kn->kn_status & KN_REQVANISH)) {
4630
4631 if ((kn->kn_status & KN_VANISHED) == 0) {
4632 proc_fdunlock(p);
4633
4634 /* get detach reference (also marks vanished) */
4635 if (kqlock2knotedetach(kq, kn)) {
4636
4637 /* detach knote and drop fp use reference */
4638 knote_fops(kn)->f_detach(kn);
4639 if (knote_fops(kn)->f_isfd)
4640 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
4641
4642 /* activate it if it's still in existence */
4643 if (knoteuse2kqlock(kq, kn, 0)) {
4644 knote_activate(kn);
4645 }
4646 kqunlock(kq);
4647 }
4648 proc_fdlock(p);
4649 goto restart;
4650 } else {
4651 kqunlock(kq);
4652 continue;
4653 }
4654 }
4655
4656 proc_fdunlock(p);
4657
4658 /*
4659 * Convert the kq lock to a drop ref.
4660 * If we get it, go ahead and drop it.
4661 * Otherwise, we waited for the blocking
4662 * condition to complete. Either way,
4663 * we dropped the fdlock so start over.
4664 */
4665 if (kqlock2knotedrop(kq, kn)) {
4666 knote_drop(kn, p);
4667 }
4668
4669 proc_fdlock(p);
4670 goto restart;
4671 }
4672 }
4673
4674 /*
4675 * knote_fdadd - Add knote to the fd table for process
4676 *
4677 * All file-based filters associate a list of knotes by file
4678 * descriptor index. All other filters hash the knote by ident.
4679 *
4680 * May have to grow the table of knote lists to cover the
4681 * file descriptor index presented.
4682 *
4683 * proc_fdlock held on entry (and exit)
4684 */
4685 static int
4686 knote_fdadd(struct knote *kn, struct proc *p)
4687 {
4688 struct filedesc *fdp = p->p_fd;
4689 struct klist *list = NULL;
4690
4691 if (! knote_fops(kn)->f_isfd) {
4692 if (fdp->fd_knhashmask == 0)
4693 fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
4694 &fdp->fd_knhashmask);
4695 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
4696 } else {
4697 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
4698 u_int size = 0;
4699
4700 if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
4701 || kn->kn_id >= (uint64_t)maxfiles)
4702 return (EINVAL);
4703
4704 /* have to grow the fd_knlist */
4705 size = fdp->fd_knlistsize;
4706 while (size <= kn->kn_id)
4707 size += KQEXTENT;
4708
4709 if (size >= (UINT_MAX/sizeof(struct klist *)))
4710 return (EINVAL);
4711
4712 MALLOC(list, struct klist *,
4713 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
4714 if (list == NULL)
4715 return (ENOMEM);
4716
4717 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
4718 fdp->fd_knlistsize * sizeof(struct klist *));
4719 bzero((caddr_t)list +
4720 fdp->fd_knlistsize * sizeof(struct klist *),
4721 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
4722 FREE(fdp->fd_knlist, M_KQUEUE);
4723 fdp->fd_knlist = list;
4724 fdp->fd_knlistsize = size;
4725 }
4726 list = &fdp->fd_knlist[kn->kn_id];
4727 }
4728 SLIST_INSERT_HEAD(list, kn, kn_link);
4729 return (0);
4730 }
4731
4732 /*
4733 * knote_fdremove - remove a knote from the fd table for process
4734 *
4735 * If the filter is file-based, remove based on fd index.
4736 * Otherwise remove from the hash based on the ident.
4737 *
4738 * proc_fdlock held on entry (and exit)
4739 */
4740 static void
4741 knote_fdremove(struct knote *kn, struct proc *p)
4742 {
4743 struct filedesc *fdp = p->p_fd;
4744 struct klist *list = NULL;
4745
4746 if (knote_fops(kn)->f_isfd) {
4747 assert ((u_int)fdp->fd_knlistsize > kn->kn_id);
4748 list = &fdp->fd_knlist[kn->kn_id];
4749 } else {
4750 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
4751 }
4752 SLIST_REMOVE(list, kn, knote, kn_link);
4753 }
4754
4755 /*
4756 * knote_fdfind - lookup a knote in the fd table for process
4757 *
4758 * If the filter is file-based, lookup based on fd index.
4759 * Otherwise use a hash based on the ident.
4760 *
4761 * Matching is based on kq, filter, and ident. Optionally,
4762 * it may also be based on the udata field in the kevent -
4763 * allowing multiple event registrations for the file object
4764 * per kqueue.
4765 *
4766 * proc_fdlock held on entry (and exit)
4767 */
4768 static struct knote *
4769 knote_fdfind(struct kqueue *kq,
4770 struct kevent_internal_s *kev,
4771 struct proc *p)
4772 {
4773 struct filedesc *fdp = p->p_fd;
4774 struct klist *list = NULL;
4775 struct knote *kn = NULL;
4776 struct filterops *fops;
4777
4778 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
4779
4780 /*
4781 * determine where to look for the knote
4782 */
4783 if (fops->f_isfd) {
4784 /* fd-based knotes are linked off the fd table */
4785 if (kev->ident < (u_int)fdp->fd_knlistsize) {
4786 list = &fdp->fd_knlist[kev->ident];
4787 }
4788 } else if (fdp->fd_knhashmask != 0) {
4789 /* hash non-fd knotes here too */
4790 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
4791 }
4792
4793 /*
4794 * scan the selected list looking for a match
4795 */
4796 if (list != NULL) {
4797 SLIST_FOREACH(kn, list, kn_link) {
4798 if (kq == knote_get_kq(kn) &&
4799 kev->ident == kn->kn_id &&
4800 kev->filter == kn->kn_filter) {
4801 if (kev->flags & EV_UDATA_SPECIFIC) {
4802 if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
4803 kev->udata == kn->kn_udata) {
4804 break; /* matching udata-specific knote */
4805 }
4806 } else if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
4807 break; /* matching non-udata-specific knote */
4808 }
4809 }
4810 }
4811 }
4812 return kn;
4813 }
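/*
 * Userspace sketch (illustrative only) of the udata-specific matching above:
 * with EV_UDATA_SPECIFIC, two registrations for the same (ident, filter)
 * pair coexist on one kqueue and are told apart here by their udata values.
 *
 *	struct kevent ev[2];
 *	EV_SET(&ev[0], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_a);
 *	EV_SET(&ev[1], fd, EVFILT_READ, EV_ADD | EV_UDATA_SPECIFIC, 0, 0, udata_b);
 *	kevent(kq, ev, 2, NULL, 0, NULL);
 */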
4814
4815 /*
4816 * knote_drop - disconnect and drop the knote
4817 *
4818 * Called with the kqueue unlocked and holding a
4819 * "drop reference" on the knote in question.
4820 * This reference is most often acquired through a call
4821 * to kqlock2knotedrop(). But it can also be acquired
4822 * through stealing a drop reference via a call to
4823 * knoteuse2knotedrop() or during the initial attach
4824 * of the knote.
4825 *
4826 * The knote may have already been detached from
4827 * (or not yet attached to) its source object.
4828 *
4829 * should be called at spl == 0, since we don't want to hold spl
4830 * while calling fdrop and free.
4831 */
4832 static void
4833 knote_drop(struct knote *kn, __unused struct proc *ctxp)
4834 {
4835 struct kqueue *kq = knote_get_kq(kn);
4836 struct proc *p = kq->kq_p;
4837 int needswakeup;
4838
4839 /* We have to have a dropping reference on the knote */
4840 assert(kn->kn_status & KN_DROPPING);
4841
4842 /* If we are attached, disconnect from the source first */
4843 if (kn->kn_status & KN_ATTACHED) {
4844 knote_fops(kn)->f_detach(kn);
4845 }
4846
4847 proc_fdlock(p);
4848
4849 /* Remove the source from the appropriate hash */
4850 knote_fdremove(kn, p);
4851
4852 /* trade fdlock for kq lock */
4853 kqlock(kq);
4854 proc_fdunlock(p);
4855
4856 /* determine if anyone needs to know about the drop */
4857 assert((kn->kn_status & (KN_SUPPRESSED | KN_QUEUED)) == 0);
4858 needswakeup = (kn->kn_status & KN_USEWAIT);
4859 kqunlock(kq);
4860
4861 if (needswakeup)
4862 waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
4863 CAST_EVENT64_T(&kn->kn_status),
4864 THREAD_RESTART,
4865 WAITQ_ALL_PRIORITIES);
4866
4867 if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
4868 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
4869
4870 knote_free(kn);
4871 }
4872
4873 /* called with kqueue lock held */
4874 static void
4875 knote_activate(struct knote *kn)
4876 {
4877 if (kn->kn_status & KN_ACTIVE)
4878 return;
4879
4880 kn->kn_status |= KN_ACTIVE;
4881 if (knote_enqueue(kn))
4882 knote_wakeup(kn);
4883 }
4884
4885 /* called with kqueue lock held */
4886 static void
4887 knote_deactivate(struct knote *kn)
4888 {
4889 kn->kn_status &= ~KN_ACTIVE;
4890 if ((kn->kn_status & KN_STAYACTIVE) == 0)
4891 knote_dequeue(kn);
4892 }
4893
4894 /* called with kqueue lock held */
4895 static void
4896 knote_enable(struct knote *kn)
4897 {
4898 if ((kn->kn_status & KN_DISABLED) == 0)
4899 return;
4900
4901 kn->kn_status &= ~KN_DISABLED;
4902 if (knote_enqueue(kn))
4903 knote_wakeup(kn);
4904 }
4905
4906 /* called with kqueue lock held */
4907 static void
4908 knote_disable(struct knote *kn)
4909 {
4910 if (kn->kn_status & KN_DISABLED)
4911 return;
4912
4913 kn->kn_status |= KN_DISABLED;
4914 knote_dequeue(kn);
4915 }
4916
4917 /* called with kqueue lock held */
4918 static void
4919 knote_suppress(struct knote *kn)
4920 {
4921 struct kqtailq *suppressq;
4922
4923 if (kn->kn_status & KN_SUPPRESSED)
4924 return;
4925
4926 knote_dequeue(kn);
4927 kn->kn_status |= KN_SUPPRESSED;
4928 suppressq = knote_get_suppressed_queue(kn);
4929 TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
4930 }
4931
4932 /* called with kqueue lock held */
4933 static void
4934 knote_unsuppress(struct knote *kn)
4935 {
4936 struct kqtailq *suppressq;
4937
4938 if ((kn->kn_status & KN_SUPPRESSED) == 0)
4939 return;
4940
4941 kn->kn_status &= ~KN_SUPPRESSED;
4942 suppressq = knote_get_suppressed_queue(kn);
4943 TAILQ_REMOVE(suppressq, kn, kn_tqe);
4944
4945 /* update in-use qos to equal requested qos */
4946 kn->kn_qos_index = kn->kn_req_index;
4947
4948 /* don't wakeup if unsuppressing just a stay-active knote */
4949 if (knote_enqueue(kn) &&
4950 (kn->kn_status & KN_ACTIVE))
4951 knote_wakeup(kn);
4952 }
4953
4954 /* called with kqueue lock held */
4955 static int
4956 knote_enqueue(struct knote *kn)
4957 {
4958 if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0 ||
4959 (kn->kn_status & (KN_DISABLED | KN_SUPPRESSED | KN_DROPPING)))
4960 return 0;
4961
4962 if ((kn->kn_status & KN_QUEUED) == 0) {
4963 struct kqtailq *queue = knote_get_queue(kn);
4964 struct kqueue *kq = knote_get_kq(kn);
4965
4966 TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
4967 kn->kn_status |= KN_QUEUED;
4968 kq->kq_count++;
4969 return 1;
4970 }
4971 return ((kn->kn_status & KN_STAYACTIVE) != 0);
4972 }
4973
4974
4975 /* called with kqueue lock held */
4976 static void
4977 knote_dequeue(struct knote *kn)
4978 {
4979 struct kqueue *kq = knote_get_kq(kn);
4980 struct kqtailq *queue;
4981
4982 if ((kn->kn_status & KN_QUEUED) == 0)
4983 return;
4984
4985 queue = knote_get_queue(kn);
4986 TAILQ_REMOVE(queue, kn, kn_tqe);
4987 kn->kn_status &= ~KN_QUEUED;
4988 kq->kq_count--;
4989 }
4990
4991 void
4992 knote_init(void)
4993 {
4994 knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
4995 8192, "knote zone");
4996
4997 kqfile_zone = zinit(sizeof(struct kqfile), 8192*sizeof(struct kqfile),
4998 8192, "kqueue file zone");
4999
5000 kqworkq_zone = zinit(sizeof(struct kqworkq), 8192*sizeof(struct kqworkq),
5001 8192, "kqueue workq zone");
5002
5003 /* allocate kq lock group attribute and group */
5004 kq_lck_grp_attr = lck_grp_attr_alloc_init();
5005
5006 kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
5007
5008 /* Allocate kq lock attribute */
5009 kq_lck_attr = lck_attr_alloc_init();
5010
5011 /* Initialize the timer filter lock */
5012 lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
5013
5014 /* Initialize the user filter lock */
5015 lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr);
5016
5017 #if CONFIG_MEMORYSTATUS
5018 /* Initialize the memorystatus list lock */
5019 memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
5020 #endif
5021 }
5022 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
5023
5024 struct filterops *
5025 knote_fops(struct knote *kn)
5026 {
5027 return sysfilt_ops[kn->kn_filtid];
5028 }
5029
5030 static struct knote *
5031 knote_alloc(void)
5032 {
5033 return ((struct knote *)zalloc(knote_zone));
5034 }
5035
5036 static void
5037 knote_free(struct knote *kn)
5038 {
5039 zfree(knote_zone, kn);
5040 }
5041
5042 #if SOCKETS
5043 #include <sys/param.h>
5044 #include <sys/socket.h>
5045 #include <sys/protosw.h>
5046 #include <sys/domain.h>
5047 #include <sys/mbuf.h>
5048 #include <sys/kern_event.h>
5049 #include <sys/malloc.h>
5050 #include <sys/sys_domain.h>
5051 #include <sys/syslog.h>
5052
5053 #ifndef ROUNDUP64
5054 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
5055 #endif
5056
5057 #ifndef ADVANCE64
5058 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
5059 #endif
5060
5061 static lck_grp_attr_t *kev_lck_grp_attr;
5062 static lck_attr_t *kev_lck_attr;
5063 static lck_grp_t *kev_lck_grp;
5064 static decl_lck_rw_data(,kev_lck_data);
5065 static lck_rw_t *kev_rwlock = &kev_lck_data;
5066
5067 static int kev_attach(struct socket *so, int proto, struct proc *p);
5068 static int kev_detach(struct socket *so);
5069 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
5070 struct ifnet *ifp, struct proc *p);
5071 static lck_mtx_t * event_getlock(struct socket *, int);
5072 static int event_lock(struct socket *, int, void *);
5073 static int event_unlock(struct socket *, int, void *);
5074
5075 static int event_sofreelastref(struct socket *);
5076 static void kev_delete(struct kern_event_pcb *);
5077
5078 static struct pr_usrreqs event_usrreqs = {
5079 .pru_attach = kev_attach,
5080 .pru_control = kev_control,
5081 .pru_detach = kev_detach,
5082 .pru_soreceive = soreceive,
5083 };
5084
5085 static struct protosw eventsw[] = {
5086 {
5087 .pr_type = SOCK_RAW,
5088 .pr_protocol = SYSPROTO_EVENT,
5089 .pr_flags = PR_ATOMIC,
5090 .pr_usrreqs = &event_usrreqs,
5091 .pr_lock = event_lock,
5092 .pr_unlock = event_unlock,
5093 .pr_getlock = event_getlock,
5094 }
5095 };
5096
5097 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
5098 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
5099
5100 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
5101 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
5102
5103 struct kevtstat kevtstat;
5104 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
5105 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5106 kevt_getstat, "S,kevtstat", "");
5107
5108 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
5109 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
5110 kevt_pcblist, "S,xkevtpcb", "");
5111
5112 static lck_mtx_t *
5113 event_getlock(struct socket *so, int locktype)
5114 {
5115 #pragma unused(locktype)
5116 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
5117
5118 if (so->so_pcb != NULL) {
5119 if (so->so_usecount < 0)
5120 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
5121 so, so->so_usecount, solockhistory_nr(so));
5122 /* NOTREACHED */
5123 } else {
5124 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
5125 so, solockhistory_nr(so));
5126 /* NOTREACHED */
5127 }
5128 return (&ev_pcb->evp_mtx);
5129 }
5130
5131 static int
5132 event_lock(struct socket *so, int refcount, void *lr)
5133 {
5134 void *lr_saved;
5135
5136 if (lr == NULL)
5137 lr_saved = __builtin_return_address(0);
5138 else
5139 lr_saved = lr;
5140
5141 if (so->so_pcb != NULL) {
5142 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
5143 } else {
5144 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
5145 so, lr_saved, solockhistory_nr(so));
5146 /* NOTREACHED */
5147 }
5148
5149 if (so->so_usecount < 0) {
5150 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
5151 so, so->so_pcb, lr_saved, so->so_usecount,
5152 solockhistory_nr(so));
5153 /* NOTREACHED */
5154 }
5155
5156 if (refcount)
5157 so->so_usecount++;
5158
5159 so->lock_lr[so->next_lock_lr] = lr_saved;
5160 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
5161 return (0);
5162 }
5163
5164 static int
5165 event_unlock(struct socket *so, int refcount, void *lr)
5166 {
5167 void *lr_saved;
5168 lck_mtx_t *mutex_held;
5169
5170 if (lr == NULL)
5171 lr_saved = __builtin_return_address(0);
5172 else
5173 lr_saved = lr;
5174
5175 if (refcount)
5176 so->so_usecount--;
5177
5178 if (so->so_usecount < 0) {
5179 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
5180 so, so->so_usecount, solockhistory_nr(so));
5181 /* NOTREACHED */
5182 }
5183 if (so->so_pcb == NULL) {
5184 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
5185 so, so->so_usecount, (void *)lr_saved,
5186 solockhistory_nr(so));
5187 /* NOTREACHED */
5188 }
5189 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
5190
5191 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5192 so->unlock_lr[so->next_unlock_lr] = lr_saved;
5193 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5194
5195 if (so->so_usecount == 0) {
5196 VERIFY(so->so_flags & SOF_PCBCLEARING);
5197 event_sofreelastref(so);
5198 } else {
5199 lck_mtx_unlock(mutex_held);
5200 }
5201
5202 return (0);
5203 }
5204
5205 static int
5206 event_sofreelastref(struct socket *so)
5207 {
5208 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
5209
5210 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
5211
5212 so->so_pcb = NULL;
5213
5214 /*
5215 * Disable upcall in the event another thread is in kev_post_msg()
5216 * appending a record to the receive socket buffer, since sbwakeup()
5217 * may release the socket lock otherwise.
5218 */
5219 so->so_rcv.sb_flags &= ~SB_UPCALL;
5220 so->so_snd.sb_flags &= ~SB_UPCALL;
5221 so->so_event = sonullevent;
5222 lck_mtx_unlock(&(ev_pcb->evp_mtx));
5223
5224 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
5225 lck_rw_lock_exclusive(kev_rwlock);
5226 LIST_REMOVE(ev_pcb, evp_link);
5227 kevtstat.kes_pcbcount--;
5228 kevtstat.kes_gencnt++;
5229 lck_rw_done(kev_rwlock);
5230 kev_delete(ev_pcb);
5231
5232 sofreelastref(so, 1);
5233 return (0);
5234 }
5235
5236 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
5237
5238 static
5239 struct kern_event_head kern_event_head;
5240
5241 static u_int32_t static_event_id = 0;
5242
5243 #define EVPCB_ZONE_MAX 65536
5244 #define EVPCB_ZONE_NAME "kerneventpcb"
5245 static struct zone *ev_pcb_zone;
5246
5247 /*
5248 * Install the protosw's for the NKE manager. Invoked at extension load time
5249 */
5250 void
5251 kern_event_init(struct domain *dp)
5252 {
5253 struct protosw *pr;
5254 int i;
5255
5256 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
5257 VERIFY(dp == systemdomain);
5258
5259 kev_lck_grp_attr = lck_grp_attr_alloc_init();
5260 if (kev_lck_grp_attr == NULL) {
5261 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
5262 /* NOTREACHED */
5263 }
5264
5265 kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
5266 kev_lck_grp_attr);
5267 if (kev_lck_grp == NULL) {
5268 panic("%s: lck_grp_alloc_init failed\n", __func__);
5269 /* NOTREACHED */
5270 }
5271
5272 kev_lck_attr = lck_attr_alloc_init();
5273 if (kev_lck_attr == NULL) {
5274 panic("%s: lck_attr_alloc_init failed\n", __func__);
5275 /* NOTREACHED */
5276 }
5277
5278 lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
5279 if (kev_rwlock == NULL) {
5280 panic("%s: lck_mtx_alloc_init failed\n", __func__);
5281 /* NOTREACHED */
5282 }
5283
5284 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
5285 net_add_proto(pr, dp, 1);
5286
5287 ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
5288 EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
5289 if (ev_pcb_zone == NULL) {
5290 panic("%s: failed allocating ev_pcb_zone", __func__);
5291 /* NOTREACHED */
5292 }
5293 zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
5294 zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
5295 }
5296
5297 static int
5298 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
5299 {
5300 int error = 0;
5301 struct kern_event_pcb *ev_pcb;
5302
5303 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
5304 if (error != 0)
5305 return (error);
5306
5307 if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
5308 return (ENOBUFS);
5309 }
5310 bzero(ev_pcb, sizeof(struct kern_event_pcb));
5311 lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
5312
5313 ev_pcb->evp_socket = so;
5314 ev_pcb->evp_vendor_code_filter = 0xffffffff;
5315
5316 so->so_pcb = (caddr_t) ev_pcb;
5317 lck_rw_lock_exclusive(kev_rwlock);
5318 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
5319 kevtstat.kes_pcbcount++;
5320 kevtstat.kes_gencnt++;
5321 lck_rw_done(kev_rwlock);
5322
5323 return (error);
5324 }
5325
5326 static void
5327 kev_delete(struct kern_event_pcb *ev_pcb)
5328 {
5329 VERIFY(ev_pcb != NULL);
5330 lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
5331 zfree(ev_pcb_zone, ev_pcb);
5332 }
5333
5334 static int
5335 kev_detach(struct socket *so)
5336 {
5337 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5338
5339 if (ev_pcb != NULL) {
5340 soisdisconnected(so);
5341 so->so_flags |= SOF_PCBCLEARING;
5342 }
5343
5344 return (0);
5345 }
5346
5347 /*
5348 * For now, kev_vendor_code and mbuf_tags use the same
5349 * mechanism.
5350 */
5351 errno_t kev_vendor_code_find(
5352 const char *string,
5353 u_int32_t *out_vendor_code)
5354 {
5355 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
5356 return (EINVAL);
5357 }
5358 return (net_str_id_find_internal(string, out_vendor_code,
5359 NSI_VENDOR_CODE, 1));
5360 }
5361
5362 errno_t
5363 kev_msg_post(struct kev_msg *event_msg)
5364 {
5365 mbuf_tag_id_t min_vendor, max_vendor;
5366
5367 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
5368
5369 if (event_msg == NULL)
5370 return (EINVAL);
5371
5372 /*
5373 * Limit third parties to posting events for registered vendor codes
5374 * only
5375 */
5376 if (event_msg->vendor_code < min_vendor ||
5377 event_msg->vendor_code > max_vendor) {
5378 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
5379 return (EINVAL);
5380 }
5381 return (kev_post_msg(event_msg));
5382 }
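/*
 * Minimal in-kernel sketch (illustrative; my_vendor_code, my_class,
 * my_subclass and my_payload are hypothetical): a caller fills in only the
 * struct kev_msg fields consumed by kev_post_msg() below -- vendor_code,
 * kev_class, kev_subclass, event_code and up to five data vectors -- and
 * hands the message off.
 *
 *	struct kev_msg ev_msg;
 *
 *	bzero(&ev_msg, sizeof(ev_msg));
 *	ev_msg.vendor_code  = my_vendor_code;	// from kev_vendor_code_find()
 *	ev_msg.kev_class    = my_class;
 *	ev_msg.kev_subclass = my_subclass;
 *	ev_msg.event_code   = 1;
 *	ev_msg.dv[0].data_ptr    = &my_payload;
 *	ev_msg.dv[0].data_length = sizeof(my_payload);
 *	(void)kev_msg_post(&ev_msg);
 */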
5383
5384 int
5385 kev_post_msg(struct kev_msg *event_msg)
5386 {
5387 struct mbuf *m, *m2;
5388 struct kern_event_pcb *ev_pcb;
5389 struct kern_event_msg *ev;
5390 char *tmp;
5391 u_int32_t total_size;
5392 int i;
5393
5394 /* Verify the message is small enough to fit in one mbuf w/o cluster */
5395 total_size = KEV_MSG_HEADER_SIZE;
5396
5397 for (i = 0; i < 5; i++) {
5398 if (event_msg->dv[i].data_length == 0)
5399 break;
5400 total_size += event_msg->dv[i].data_length;
5401 }
5402
5403 if (total_size > MLEN) {
5404 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
5405 return (EMSGSIZE);
5406 }
5407
5408 m = m_get(M_DONTWAIT, MT_DATA);
5409 if (m == 0) {
5410 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
5411 return (ENOMEM);
5412 }
5413 ev = mtod(m, struct kern_event_msg *);
5414 total_size = KEV_MSG_HEADER_SIZE;
5415
5416 tmp = (char *) &ev->event_data[0];
5417 for (i = 0; i < 5; i++) {
5418 if (event_msg->dv[i].data_length == 0)
5419 break;
5420
5421 total_size += event_msg->dv[i].data_length;
5422 bcopy(event_msg->dv[i].data_ptr, tmp,
5423 event_msg->dv[i].data_length);
5424 tmp += event_msg->dv[i].data_length;
5425 }
5426
5427 ev->id = ++static_event_id;
5428 ev->total_size = total_size;
5429 ev->vendor_code = event_msg->vendor_code;
5430 ev->kev_class = event_msg->kev_class;
5431 ev->kev_subclass = event_msg->kev_subclass;
5432 ev->event_code = event_msg->event_code;
5433
5434 m->m_len = total_size;
5435 lck_rw_lock_shared(kev_rwlock);
5436 for (ev_pcb = LIST_FIRST(&kern_event_head);
5437 ev_pcb;
5438 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
5439 lck_mtx_lock(&ev_pcb->evp_mtx);
5440 if (ev_pcb->evp_socket->so_pcb == NULL) {
5441 lck_mtx_unlock(&ev_pcb->evp_mtx);
5442 continue;
5443 }
5444 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
5445 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
5446 lck_mtx_unlock(&ev_pcb->evp_mtx);
5447 continue;
5448 }
5449
5450 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
5451 if (ev_pcb->evp_class_filter != ev->kev_class) {
5452 lck_mtx_unlock(&ev_pcb->evp_mtx);
5453 continue;
5454 }
5455
5456 if ((ev_pcb->evp_subclass_filter !=
5457 KEV_ANY_SUBCLASS) &&
5458 (ev_pcb->evp_subclass_filter !=
5459 ev->kev_subclass)) {
5460 lck_mtx_unlock(&ev_pcb->evp_mtx);
5461 continue;
5462 }
5463 }
5464 }
5465
5466 m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
5467 if (m2 == 0) {
5468 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
5469 m_free(m);
5470 lck_mtx_unlock(&ev_pcb->evp_mtx);
5471 lck_rw_done(kev_rwlock);
5472 return (ENOMEM);
5473 }
5474 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
5475 /*
5476 * We use "m" for the socket stats as it would be
5477 * unsafe to use "m2"
5478 */
5479 so_inc_recv_data_stat(ev_pcb->evp_socket,
5480 1, m->m_len, MBUF_TC_BE);
5481
5482 sorwakeup(ev_pcb->evp_socket);
5483 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
5484 } else {
5485 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
5486 }
5487 lck_mtx_unlock(&ev_pcb->evp_mtx);
5488 }
5489 m_free(m);
5490 lck_rw_done(kev_rwlock);
5491
5492 return (0);
5493 }
5494
5495 static int
5496 kev_control(struct socket *so,
5497 u_long cmd,
5498 caddr_t data,
5499 __unused struct ifnet *ifp,
5500 __unused struct proc *p)
5501 {
5502 struct kev_request *kev_req = (struct kev_request *) data;
5503 struct kern_event_pcb *ev_pcb;
5504 struct kev_vendor_code *kev_vendor;
5505 u_int32_t *id_value = (u_int32_t *) data;
5506
5507 switch (cmd) {
5508 case SIOCGKEVID:
5509 *id_value = static_event_id;
5510 break;
5511 case SIOCSKEVFILT:
5512 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5513 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
5514 ev_pcb->evp_class_filter = kev_req->kev_class;
5515 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
5516 break;
5517 case SIOCGKEVFILT:
5518 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
5519 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
5520 kev_req->kev_class = ev_pcb->evp_class_filter;
5521 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
5522 break;
5523 case SIOCGKEVVENDOR:
5524 kev_vendor = (struct kev_vendor_code *)data;
5525 /* Make sure string is NULL terminated */
5526 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
5527 return (net_str_id_find_internal(kev_vendor->vendor_string,
5528 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
5529 default:
5530 return (ENOTSUP);
5531 }
5532
5533 return (0);
5534 }
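/*
 * Userspace sketch (illustrative only) of driving kev_control() above: a
 * kernel event socket is opened in the system domain and SIOCSKEVFILT
 * narrows delivery to one vendor/class/subclass tuple (the KEV_ANY_* values
 * leave a dimension unfiltered).
 *
 *	struct kev_request req;
 *	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *
 *	bzero(&req, sizeof(req));
 *	req.vendor_code  = KEV_VENDOR_APPLE;
 *	req.kev_class    = KEV_NETWORK_CLASS;
 *	req.kev_subclass = KEV_ANY_SUBCLASS;
 *	ioctl(s, SIOCSKEVFILT, &req);
 */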
5535
5536 int
5537 kevt_getstat SYSCTL_HANDLER_ARGS
5538 {
5539 #pragma unused(oidp, arg1, arg2)
5540 int error = 0;
5541
5542 lck_rw_lock_shared(kev_rwlock);
5543
5544 if (req->newptr != USER_ADDR_NULL) {
5545 error = EPERM;
5546 goto done;
5547 }
5548 if (req->oldptr == USER_ADDR_NULL) {
5549 req->oldidx = sizeof(struct kevtstat);
5550 goto done;
5551 }
5552
5553 error = SYSCTL_OUT(req, &kevtstat,
5554 MIN(sizeof(struct kevtstat), req->oldlen));
5555 done:
5556 lck_rw_done(kev_rwlock);
5557
5558 return (error);
5559 }
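/*
 * Illustrative sketch (assuming the nodes registered above resolve to the
 * MIB name "net.systm.kevt.stats" and that struct kevtstat is visible to
 * the caller): the counters can be read from userspace with sysctlbyname().
 *
 *	struct kevtstat st;
 *	size_t len = sizeof(st);
 *	if (sysctlbyname("net.systm.kevt.stats", &st, &len, NULL, 0) == 0)
 *		printf("posted=%llu fullsock=%llu\n",
 *		    (unsigned long long)st.kes_posted,
 *		    (unsigned long long)st.kes_fullsock);
 */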
5560
5561 __private_extern__ int
5562 kevt_pcblist SYSCTL_HANDLER_ARGS
5563 {
5564 #pragma unused(oidp, arg1, arg2)
5565 int error = 0;
5566 int n, i;
5567 struct xsystmgen xsg;
5568 void *buf = NULL;
5569 size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
5570 ROUNDUP64(sizeof (struct xsocket_n)) +
5571 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
5572 ROUNDUP64(sizeof (struct xsockstat_n));
5573 struct kern_event_pcb *ev_pcb;
5574
5575 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
5576 if (buf == NULL)
5577 return (ENOMEM);
5578
5579 lck_rw_lock_shared(kev_rwlock);
5580
5581 n = kevtstat.kes_pcbcount;
5582
5583 if (req->oldptr == USER_ADDR_NULL) {
5584 req->oldidx = (n + n/8) * item_size;
5585 goto done;
5586 }
5587 if (req->newptr != USER_ADDR_NULL) {
5588 error = EPERM;
5589 goto done;
5590 }
5591 bzero(&xsg, sizeof (xsg));
5592 xsg.xg_len = sizeof (xsg);
5593 xsg.xg_count = n;
5594 xsg.xg_gen = kevtstat.kes_gencnt;
5595 xsg.xg_sogen = so_gencnt;
5596 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
5597 if (error) {
5598 goto done;
5599 }
5600 /*
5601 * We are done if there is no pcb
5602 */
5603 if (n == 0) {
5604 goto done;
5605 }
5606
5607 i = 0;
5608 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
5609 i < n && ev_pcb != NULL;
5610 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
5611 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
5612 struct xsocket_n *xso = (struct xsocket_n *)
5613 ADVANCE64(xk, sizeof (*xk));
5614 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
5615 ADVANCE64(xso, sizeof (*xso));
5616 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
5617 ADVANCE64(xsbrcv, sizeof (*xsbrcv));
5618 struct xsockstat_n *xsostats = (struct xsockstat_n *)
5619 ADVANCE64(xsbsnd, sizeof (*xsbsnd));
5620
5621 bzero(buf, item_size);
5622
5623 lck_mtx_lock(&ev_pcb->evp_mtx);
5624
5625 xk->kep_len = sizeof(struct xkevtpcb);
5626 xk->kep_kind = XSO_EVT;
5627 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
5628 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
5629 xk->kep_class_filter = ev_pcb->evp_class_filter;
5630 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
5631
5632 sotoxsocket_n(ev_pcb->evp_socket, xso);
5633 sbtoxsockbuf_n(ev_pcb->evp_socket ?
5634 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
5635 sbtoxsockbuf_n(ev_pcb->evp_socket ?
5636 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
5637 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
5638
5639 lck_mtx_unlock(&ev_pcb->evp_mtx);
5640
5641 error = SYSCTL_OUT(req, buf, item_size);
5642 }
5643
5644 if (error == 0) {
5645 /*
5646 * Give the user an updated idea of our state.
5647 * If the generation differs from what we told
5648 * her before, she knows that something happened
5649 * while we were processing this request, and it
5650 * might be necessary to retry.
5651 */
5652 bzero(&xsg, sizeof (xsg));
5653 xsg.xg_len = sizeof (xsg);
5654 xsg.xg_count = n;
5655 xsg.xg_gen = kevtstat.kes_gencnt;
5656 xsg.xg_sogen = so_gencnt;
5657 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
5658 if (error) {
5659 goto done;
5660 }
5661 }
5662
5663 done:
5664 lck_rw_done(kev_rwlock);
5665
5666 return (error);
5667 }
5668
5669 #endif /* SOCKETS */
5670
5671
5672 int
5673 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
5674 {
5675 struct vinfo_stat * st;
5676
5677 st = &kinfo->kq_stat;
5678
5679 st->vst_size = kq->kq_count;
5680 if (kq->kq_state & KQ_KEV_QOS)
5681 st->vst_blksize = sizeof(struct kevent_qos_s);
5682 else if (kq->kq_state & KQ_KEV64)
5683 st->vst_blksize = sizeof(struct kevent64_s);
5684 else
5685 st->vst_blksize = sizeof(struct kevent);
5686 st->vst_mode = S_IFIFO;
5687
5688 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
5689 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS)
5690 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
5691
5692 return (0);
5693 }
5694
5695
5696 void
5697 knote_markstayactive(struct knote *kn)
5698 {
5699 kqlock(knote_get_kq(kn));
5700 kn->kn_status |= KN_STAYACTIVE;
5701
5702 /* handle all stayactive knotes on the manager */
5703 if (knote_get_kq(kn)->kq_state & KQ_WORKQ)
5704 knote_set_qos_index(kn, KQWQ_QOS_MANAGER);
5705
5706 knote_activate(kn);
5707 kqunlock(knote_get_kq(kn));
5708 }
5709
5710 void
5711 knote_clearstayactive(struct knote *kn)
5712 {
5713 kqlock(knote_get_kq(kn));
5714 kn->kn_status &= ~KN_STAYACTIVE;
5715 knote_deactivate(kn);
5716 kqunlock(knote_get_kq(kn));
5717 }
5718
5719 static unsigned long
5720 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
5721 unsigned long buflen, unsigned long nknotes)
5722 {
5723 struct kevent_internal_s *kevp;
5724 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
5725 if (kq == knote_get_kq(kn)) {
5726 if (nknotes < buflen) {
5727 struct kevent_extinfo *info = &buf[nknotes];
5728 struct kevent_qos_s kevqos;
5729
5730 kqlock(kq);
5731 kevp = &(kn->kn_kevent);
5732
5733 bzero(&kevqos, sizeof(kevqos));
5734 kevqos.ident = kevp->ident;
5735 kevqos.filter = kevp->filter;
5736 kevqos.flags = kevp->flags;
5737 kevqos.fflags = kevp->fflags;
5738 kevqos.data = (int64_t) kevp->data;
5739 kevqos.udata = kevp->udata;
5740 kevqos.ext[0] = kevp->ext[0];
5741 kevqos.ext[1] = kevp->ext[1];
5742
5743 memcpy(&info->kqext_kev, &kevqos, sizeof(info->kqext_kev));
5744 info->kqext_sdata = kn->kn_sdata;
5745 info->kqext_status = kn->kn_status;
5746 info->kqext_sfflags = kn->kn_sfflags;
5747
5748 kqunlock(kq);
5749 }
5750
5751 /* we return the total number of knotes, which may be more than requested */
5752 nknotes++;
5753 }
5754 }
5755
5756 return nknotes;
5757 }
5758
5759 int
5760 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
5761 uint32_t bufsize, int32_t *retval)
5762 {
5763 struct knote *kn;
5764 int i;
5765 int err = 0;
5766 struct filedesc *fdp = p->p_fd;
5767 unsigned long nknotes = 0;
5768 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
5769 struct kevent_extinfo *kqext = NULL;
5770
5771 /* arbitrary upper limit to cap kernel memory usage, copyout size, etc. */
5772 buflen = min(buflen, PROC_PIDFDKQUEUE_KNOTES_MAX);
5773
5774 kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
5775 if (kqext == NULL) {
5776 err = ENOMEM;
5777 goto out;
5778 }
5779 bzero(kqext, buflen * sizeof(struct kevent_extinfo));
5780
5781 proc_fdlock(p);
5782
5783 for (i = 0; i < fdp->fd_knlistsize; i++) {
5784 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
5785 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
5786 }
5787
5788 if (fdp->fd_knhashmask != 0) {
5789 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
5790 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
5791 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
5792 }
5793 }
5794
5795 proc_fdunlock(p);
5796
5797 assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
5798 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
5799
5800 out:
5801 if (kqext) {
5802 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
5803 kqext = NULL;
5804 }
5805
5806 if (!err) {
5807 *retval = min(nknotes, PROC_PIDFDKQUEUE_KNOTES_MAX);
5808 }
5809 return err;
5810 }
5811
5812 static unsigned long
5813 kevent_udatainfo_emit(struct kqueue *kq, struct knote *kn, uint64_t *buf,
5814 unsigned long buflen, unsigned long nknotes)
5815 {
5816 struct kevent_internal_s *kevp;
5817 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
5818 if (kq == knote_get_kq(kn)) {
5819 if (nknotes < buflen) {
5820 kqlock(kq);
5821 kevp = &(kn->kn_kevent);
5822 buf[nknotes] = kevp->udata;
5823 kqunlock(kq);
5824 }
5825
5826 /* we return the total number of knotes, which may be more than requested */
5827 nknotes++;
5828 }
5829 }
5830
5831 return nknotes;
5832 }
5833
5834 int
5835 pid_kqueue_udatainfo(proc_t p, struct kqueue *kq, uint64_t *buf,
5836 uint32_t bufsize)
5837 {
5838 struct knote *kn;
5839 int i;
5840 struct filedesc *fdp = p->p_fd;
5841 unsigned long nknotes = 0;
5842 unsigned long buflen = bufsize / sizeof(uint64_t);
5843
5844 proc_fdlock(p);
5845
5846 for (i = 0; i < fdp->fd_knlistsize; i++) {
5847 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
5848 nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes);
5849 }
5850
5851 if (fdp->fd_knhashmask != 0) {
5852 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
5853 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
5854 nknotes = kevent_udatainfo_emit(kq, kn, buf, buflen, nknotes);
5855 }
5856 }
5857
5858 proc_fdunlock(p);
5859 return (int)nknotes;
5860 }
5861