]> git.saurik.com Git - apple/xnu.git/blob - bsd/kern/kern_event.c
d80579a350d0045fd778141825b923ba4073f15a
[apple/xnu.git] / bsd / kern / kern_event.c
1 /*
2 * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29 /*-
30 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
31 * All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 */
54 /*
55 * @(#)kern_event.c 1.0 (3/31/2000)
56 */
57 #include <stdint.h>
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/filedesc.h>
62 #include <sys/kernel.h>
63 #include <sys/proc_internal.h>
64 #include <sys/kauth.h>
65 #include <sys/malloc.h>
66 #include <sys/unistd.h>
67 #include <sys/file_internal.h>
68 #include <sys/fcntl.h>
69 #include <sys/select.h>
70 #include <sys/queue.h>
71 #include <sys/event.h>
72 #include <sys/eventvar.h>
73 #include <sys/protosw.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/stat.h>
77 #include <sys/sysctl.h>
78 #include <sys/uio.h>
79 #include <sys/sysproto.h>
80 #include <sys/user.h>
81 #include <sys/vnode_internal.h>
82 #include <string.h>
83 #include <sys/proc_info.h>
84 #include <sys/codesign.h>
85 #include <sys/pthread_shims.h>
86
87 #include <kern/locks.h>
88 #include <kern/clock.h>
89 #include <kern/thread_call.h>
90 #include <kern/sched_prim.h>
91 #include <kern/waitq.h>
92 #include <kern/zalloc.h>
93 #include <kern/kalloc.h>
94 #include <kern/assert.h>
95
96 #include <libkern/libkern.h>
97 #include "net/net_str_id.h"
98
99 #include <mach/task.h>
100
101 #if VM_PRESSURE_EVENTS
102 #include <kern/vm_pressure.h>
103 #endif
104
105 #if CONFIG_MEMORYSTATUS
106 #include <sys/kern_memorystatus.h>
107 #endif
108
109 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
110
111 #define KQ_EVENT NO_EVENT64
112
113 static inline void kqlock(struct kqueue *kq);
114 static inline void kqunlock(struct kqueue *kq);
115
116 static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn);
117 static int kqlock2knoteusewait(struct kqueue *kq, struct knote *kn);
118 static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
119 static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn);
120
121 static void kqueue_wakeup(struct kqueue *kq, int closed);
122 static int kqueue_read(struct fileproc *fp, struct uio *uio,
123 int flags, vfs_context_t ctx);
124 static int kqueue_write(struct fileproc *fp, struct uio *uio,
125 int flags, vfs_context_t ctx);
126 static int kqueue_ioctl(struct fileproc *fp, u_long com, caddr_t data,
127 vfs_context_t ctx);
128 static int kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
129 vfs_context_t ctx);
130 static int kqueue_close(struct fileglob *fg, vfs_context_t ctx);
131 static int kqueue_kqfilter(struct fileproc *fp, struct knote *kn,
132 vfs_context_t ctx);
133 static int kqueue_drain(struct fileproc *fp, vfs_context_t ctx);
134
135 static const struct fileops kqueueops = {
136 .fo_type = DTYPE_KQUEUE,
137 .fo_read = kqueue_read,
138 .fo_write = kqueue_write,
139 .fo_ioctl = kqueue_ioctl,
140 .fo_select = kqueue_select,
141 .fo_close = kqueue_close,
142 .fo_kqfilter = kqueue_kqfilter,
143 .fo_drain = kqueue_drain,
144 };
145
146 static int kevent_internal(struct proc *p, int fd,
147 user_addr_t changelist, int nchanges,
148 user_addr_t eventlist, int nevents,
149 user_addr_t data_out, user_size_t *data_available,
150 unsigned int flags, user_addr_t utimeout,
151 kqueue_continue_t continuation,
152 int32_t *retval);
153 static int kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp,
154 struct proc *p, unsigned int flags);
155 static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
156 struct proc *p, unsigned int flags);
157 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
158
159 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
160 void *data);
161 static void kevent_continue(struct kqueue *kq, void *data, int error);
162 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
163 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback,
164 void *data, int *countp, struct proc *p);
165 static int kqueue_begin_processing(struct kqueue *kq);
166 static void kqueue_end_processing(struct kqueue *kq);
167 static int knote_process(struct knote *kn, kevent_callback_t callback,
168 void *data, struct kqtailq *inprocessp, struct proc *p);
169 static void knote_put(struct knote *kn);
170 static int knote_fdpattach(struct knote *kn, struct filedesc *fdp,
171 struct proc *p);
172 static void knote_drop(struct knote *kn, struct proc *p);
173 static void knote_activate(struct knote *kn, int);
174 static void knote_deactivate(struct knote *kn);
175 static void knote_enqueue(struct knote *kn);
176 static void knote_dequeue(struct knote *kn);
177 static struct knote *knote_alloc(void);
178 static void knote_free(struct knote *kn);
179
180 static int filt_fileattach(struct knote *kn);
181 static struct filterops file_filtops = {
182 .f_isfd = 1,
183 .f_attach = filt_fileattach,
184 };
185
186 static void filt_kqdetach(struct knote *kn);
187 static int filt_kqueue(struct knote *kn, long hint);
188 static struct filterops kqread_filtops = {
189 .f_isfd = 1,
190 .f_detach = filt_kqdetach,
191 .f_event = filt_kqueue,
192 };
193
194 /* placeholder for not-yet-implemented filters */
195 static int filt_badattach(struct knote *kn);
196 static struct filterops bad_filtops = {
197 .f_attach = filt_badattach,
198 };
199
200 static int filt_procattach(struct knote *kn);
201 static void filt_procdetach(struct knote *kn);
202 static int filt_proc(struct knote *kn, long hint);
203 static struct filterops proc_filtops = {
204 .f_attach = filt_procattach,
205 .f_detach = filt_procdetach,
206 .f_event = filt_proc,
207 };
208
209 #if VM_PRESSURE_EVENTS
210 static int filt_vmattach(struct knote *kn);
211 static void filt_vmdetach(struct knote *kn);
212 static int filt_vm(struct knote *kn, long hint);
213 static struct filterops vm_filtops = {
214 .f_attach = filt_vmattach,
215 .f_detach = filt_vmdetach,
216 .f_event = filt_vm,
217 };
218 #endif /* VM_PRESSURE_EVENTS */
219
220 #if CONFIG_MEMORYSTATUS
221 extern struct filterops memorystatus_filtops;
222 #endif /* CONFIG_MEMORYSTATUS */
223
224 extern struct filterops fs_filtops;
225
226 extern struct filterops sig_filtops;
227
228 /* Timer filter */
229 static int filt_timerattach(struct knote *kn);
230 static void filt_timerdetach(struct knote *kn);
231 static int filt_timer(struct knote *kn, long hint);
232 static void filt_timertouch(struct knote *kn, struct kevent_internal_s *kev,
233 long type);
234 static struct filterops timer_filtops = {
235 .f_attach = filt_timerattach,
236 .f_detach = filt_timerdetach,
237 .f_event = filt_timer,
238 .f_touch = filt_timertouch,
239 };
240
241 /* Helpers */
242 static void filt_timerexpire(void *knx, void *param1);
243 static int filt_timervalidate(struct knote *kn);
244 static void filt_timerupdate(struct knote *kn);
245 static void filt_timercancel(struct knote *kn);
246
247 #define TIMER_RUNNING 0x1
248 #define TIMER_CANCELWAIT 0x2
249
250 static lck_mtx_t _filt_timerlock;
251 static void filt_timerlock(void);
252 static void filt_timerunlock(void);
253
254 static zone_t knote_zone;
255
256 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
257
258 #if 0
259 extern struct filterops aio_filtops;
260 #endif
261
262 /* Mach portset filter */
263 extern struct filterops machport_filtops;
264
265 /* User filter */
266 static int filt_userattach(struct knote *kn);
267 static void filt_userdetach(struct knote *kn);
268 static int filt_user(struct knote *kn, long hint);
269 static void filt_usertouch(struct knote *kn, struct kevent_internal_s *kev,
270 long type);
271 static struct filterops user_filtops = {
272 .f_attach = filt_userattach,
273 .f_detach = filt_userdetach,
274 .f_event = filt_user,
275 .f_touch = filt_usertouch,
276 };
277
278 /*
279 * Table for all system-defined filters.
280 */
281 static struct filterops *sysfilt_ops[] = {
282 &file_filtops, /* EVFILT_READ */
283 &file_filtops, /* EVFILT_WRITE */
284 #if 0
285 &aio_filtops, /* EVFILT_AIO */
286 #else
287 &bad_filtops, /* EVFILT_AIO */
288 #endif
289 &file_filtops, /* EVFILT_VNODE */
290 &proc_filtops, /* EVFILT_PROC */
291 &sig_filtops, /* EVFILT_SIGNAL */
292 &timer_filtops, /* EVFILT_TIMER */
293 &machport_filtops, /* EVFILT_MACHPORT */
294 &fs_filtops, /* EVFILT_FS */
295 &user_filtops, /* EVFILT_USER */
296 &bad_filtops, /* unused */
297 #if VM_PRESSURE_EVENTS
298 &vm_filtops, /* EVFILT_VM */
299 #else
300 &bad_filtops, /* EVFILT_VM */
301 #endif
302 &file_filtops, /* EVFILT_SOCK */
303 #if CONFIG_MEMORYSTATUS
304 &memorystatus_filtops, /* EVFILT_MEMORYSTATUS */
305 #else
306 &bad_filtops, /* EVFILT_MEMORYSTATUS */
307 #endif
308 };
309
310 /*
311 * kqueue/note lock attributes and implementations
312 *
313 * kqueues have locks, while knotes have use counts
314 * Most of the knote state is guarded by the object lock.
315 * the knote "inuse" count and status use the kqueue lock.
316 */
317 lck_grp_attr_t * kq_lck_grp_attr;
318 lck_grp_t * kq_lck_grp;
319 lck_attr_t * kq_lck_attr;
320
321 static inline void
322 kqlock(struct kqueue *kq)
323 {
324 lck_spin_lock(&kq->kq_lock);
325 }
326
327 static inline void
328 kqunlock(struct kqueue *kq)
329 {
330 lck_spin_unlock(&kq->kq_lock);
331 }
332
333 /*
334 * Convert a kq lock to a knote use referece.
335 *
336 * If the knote is being dropped, we can't get
337 * a use reference, so just return with it
338 * still locked.
339 * - kq locked at entry
340 * - unlock on exit if we get the use reference
341 */
342 static int
343 kqlock2knoteuse(struct kqueue *kq, struct knote *kn)
344 {
345 if (kn->kn_status & KN_DROPPING)
346 return (0);
347 kn->kn_inuse++;
348 kqunlock(kq);
349 return (1);
350 }
351
352 /*
353 * Convert a kq lock to a knote use referece,
354 * but wait for attach and drop events to complete.
355 *
356 * If the knote is being dropped, we can't get
357 * a use reference, so just return with it
358 * still locked.
359 * - kq locked at entry
360 * - kq always unlocked on exit
361 */
362 static int
363 kqlock2knoteusewait(struct kqueue *kq, struct knote *kn)
364 {
365 if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
366 kn->kn_status |= KN_USEWAIT;
367 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
368 CAST_EVENT64_T(&kn->kn_status),
369 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
370 kqunlock(kq);
371 thread_block(THREAD_CONTINUE_NULL);
372 return (0);
373 }
374 kn->kn_inuse++;
375 kqunlock(kq);
376 return (1);
377 }
378
379 /*
380 * Convert from a knote use reference back to kq lock.
381 *
382 * Drop a use reference and wake any waiters if
383 * this is the last one.
384 *
385 * The exit return indicates if the knote is
386 * still alive - but the kqueue lock is taken
387 * unconditionally.
388 */
389 static int
390 knoteuse2kqlock(struct kqueue *kq, struct knote *kn)
391 {
392 kqlock(kq);
393 if (--kn->kn_inuse == 0) {
394 if ((kn->kn_status & KN_ATTACHING) != 0) {
395 kn->kn_status &= ~KN_ATTACHING;
396 }
397 if ((kn->kn_status & KN_USEWAIT) != 0) {
398 kn->kn_status &= ~KN_USEWAIT;
399 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
400 CAST_EVENT64_T(&kn->kn_status),
401 THREAD_AWAKENED,
402 WAITQ_ALL_PRIORITIES);
403 }
404 }
405 return ((kn->kn_status & KN_DROPPING) == 0);
406 }
407
408 /*
409 * Convert a kq lock to a knote drop reference.
410 *
411 * If the knote is in use, wait for the use count
412 * to subside. We first mark our intention to drop
413 * it - keeping other users from "piling on."
414 * If we are too late, we have to wait for the
415 * other drop to complete.
416 *
417 * - kq locked at entry
418 * - always unlocked on exit.
419 * - caller can't hold any locks that would prevent
420 * the other dropper from completing.
421 */
422 static int
423 kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
424 {
425 int oktodrop;
426
427 oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
428 kn->kn_status &= ~KN_STAYQUEUED;
429 kn->kn_status |= KN_DROPPING;
430 if (oktodrop) {
431 if (kn->kn_inuse == 0) {
432 kqunlock(kq);
433 return (oktodrop);
434 }
435 }
436 kn->kn_status |= KN_USEWAIT;
437 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
438 CAST_EVENT64_T(&kn->kn_status),
439 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
440 kqunlock(kq);
441 thread_block(THREAD_CONTINUE_NULL);
442 return (oktodrop);
443 }
444
445 /*
446 * Release a knote use count reference.
447 */
448 static void
449 knote_put(struct knote *kn)
450 {
451 struct kqueue *kq = kn->kn_kq;
452
453 kqlock(kq);
454 if (--kn->kn_inuse == 0) {
455 if ((kn->kn_status & KN_USEWAIT) != 0) {
456 kn->kn_status &= ~KN_USEWAIT;
457 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
458 CAST_EVENT64_T(&kn->kn_status),
459 THREAD_AWAKENED,
460 WAITQ_ALL_PRIORITIES);
461 }
462 }
463 kqunlock(kq);
464 }
465
466 static int
467 filt_fileattach(struct knote *kn)
468 {
469 return (fo_kqfilter(kn->kn_fp, kn, vfs_context_current()));
470 }
471
472 #define f_flag f_fglob->fg_flag
473 #define f_msgcount f_fglob->fg_msgcount
474 #define f_cred f_fglob->fg_cred
475 #define f_ops f_fglob->fg_ops
476 #define f_offset f_fglob->fg_offset
477 #define f_data f_fglob->fg_data
478
479 static void
480 filt_kqdetach(struct knote *kn)
481 {
482 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
483
484 kqlock(kq);
485 KNOTE_DETACH(&kq->kq_sel.si_note, kn);
486 kqunlock(kq);
487 }
488
489 /*ARGSUSED*/
490 static int
491 filt_kqueue(struct knote *kn, __unused long hint)
492 {
493 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
494
495 kn->kn_data = kq->kq_count;
496 return (kn->kn_data > 0);
497 }
498
499 static int
500 filt_procattach(struct knote *kn)
501 {
502 struct proc *p;
503
504 assert(PID_MAX < NOTE_PDATAMASK);
505
506 if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0)
507 return (ENOTSUP);
508
509 p = proc_find(kn->kn_id);
510 if (p == NULL) {
511 return (ESRCH);
512 }
513
514 const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
515
516 if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
517 do {
518 pid_t selfpid = proc_selfpid();
519
520 if (p->p_ppid == selfpid)
521 break; /* parent => ok */
522
523 if ((p->p_lflag & P_LTRACED) != 0 &&
524 (p->p_oppid == selfpid))
525 break; /* parent-in-waiting => ok */
526
527 proc_rele(p);
528 return (EACCES);
529 } while (0);
530
531 proc_klist_lock();
532
533 kn->kn_flags |= EV_CLEAR; /* automatically set */
534 kn->kn_ptr.p_proc = p; /* store the proc handle */
535
536 KNOTE_ATTACH(&p->p_klist, kn);
537
538 proc_klist_unlock();
539
540 proc_rele(p);
541
542 return (0);
543 }
544
545 /*
546 * The knote may be attached to a different process, which may exit,
547 * leaving nothing for the knote to be attached to. In that case,
548 * the pointer to the process will have already been nulled out.
549 */
550 static void
551 filt_procdetach(struct knote *kn)
552 {
553 struct proc *p;
554
555 proc_klist_lock();
556
557 p = kn->kn_ptr.p_proc;
558 if (p != PROC_NULL) {
559 kn->kn_ptr.p_proc = PROC_NULL;
560 KNOTE_DETACH(&p->p_klist, kn);
561 }
562
563 proc_klist_unlock();
564 }
565
566 static int
567 filt_proc(struct knote *kn, long hint)
568 {
569 /*
570 * Note: a lot of bits in hint may be obtained from the knote
571 * To free some of those bits, see <rdar://problem/12592988> Freeing up
572 * bits in hint for filt_proc
573 */
574 /* hint is 0 when called from above */
575 if (hint != 0) {
576 u_int event;
577
578 /* ALWAYS CALLED WITH proc_klist_lock when (hint != 0) */
579
580 /*
581 * mask off extra data
582 */
583 event = (u_int)hint & NOTE_PCTRLMASK;
584
585 /*
586 * termination lifecycle events can happen while a debugger
587 * has reparented a process, in which case notifications
588 * should be quashed except to the tracing parent. When
589 * the debugger reaps the child (either via wait4(2) or
590 * process exit), the child will be reparented to the original
591 * parent and these knotes re-fired.
592 */
593 if (event & NOTE_EXIT) {
594 if ((kn->kn_ptr.p_proc->p_oppid != 0)
595 && (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
596 /*
597 * This knote is not for the current ptrace(2) parent, ignore.
598 */
599 return 0;
600 }
601 }
602
603 /*
604 * if the user is interested in this event, record it.
605 */
606 if (kn->kn_sfflags & event)
607 kn->kn_fflags |= event;
608
609 #pragma clang diagnostic push
610 #pragma clang diagnostic ignored "-Wdeprecated-declarations"
611 if ((event == NOTE_REAP) || ((event == NOTE_EXIT) && !(kn->kn_sfflags & NOTE_REAP))) {
612 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
613 }
614 #pragma clang diagnostic pop
615
616
617 /*
618 * The kernel has a wrapper in place that returns the same data
619 * as is collected here, in kn_data. Any changes to how
620 * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
621 * should also be reflected in the proc_pidnoteexit() wrapper.
622 */
623 if (event == NOTE_EXIT) {
624 kn->kn_data = 0;
625 if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
626 kn->kn_fflags |= NOTE_EXITSTATUS;
627 kn->kn_data |= (hint & NOTE_PDATAMASK);
628 }
629 if ((kn->kn_sfflags & NOTE_EXIT_DETAIL) != 0) {
630 kn->kn_fflags |= NOTE_EXIT_DETAIL;
631 if ((kn->kn_ptr.p_proc->p_lflag &
632 P_LTERM_DECRYPTFAIL) != 0) {
633 kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
634 }
635 if ((kn->kn_ptr.p_proc->p_lflag &
636 P_LTERM_JETSAM) != 0) {
637 kn->kn_data |= NOTE_EXIT_MEMORY;
638 switch (kn->kn_ptr.p_proc->p_lflag &
639 P_JETSAM_MASK) {
640 case P_JETSAM_VMPAGESHORTAGE:
641 kn->kn_data |= NOTE_EXIT_MEMORY_VMPAGESHORTAGE;
642 break;
643 case P_JETSAM_VMTHRASHING:
644 kn->kn_data |= NOTE_EXIT_MEMORY_VMTHRASHING;
645 break;
646 case P_JETSAM_FCTHRASHING:
647 kn->kn_data |= NOTE_EXIT_MEMORY_FCTHRASHING;
648 break;
649 case P_JETSAM_VNODE:
650 kn->kn_data |= NOTE_EXIT_MEMORY_VNODE;
651 break;
652 case P_JETSAM_HIWAT:
653 kn->kn_data |= NOTE_EXIT_MEMORY_HIWAT;
654 break;
655 case P_JETSAM_PID:
656 kn->kn_data |= NOTE_EXIT_MEMORY_PID;
657 break;
658 case P_JETSAM_IDLEEXIT:
659 kn->kn_data |= NOTE_EXIT_MEMORY_IDLE;
660 break;
661 }
662 }
663 if ((kn->kn_ptr.p_proc->p_csflags &
664 CS_KILLED) != 0) {
665 kn->kn_data |= NOTE_EXIT_CSERROR;
666 }
667 }
668 }
669 }
670
671 /* atomic check, no locking need when called from above */
672 return (kn->kn_fflags != 0);
673 }
674
675 #if VM_PRESSURE_EVENTS
676 /*
677 * Virtual memory kevents
678 *
679 * author: Matt Jacobson [matthew_jacobson@apple.com]
680 */
681
682 static int
683 filt_vmattach(struct knote *kn)
684 {
685 /*
686 * The note will be cleared once the information has been flushed to
687 * the client. If there is still pressure, we will be re-alerted.
688 */
689 kn->kn_flags |= EV_CLEAR;
690 return (vm_knote_register(kn));
691 }
692
693 static void
694 filt_vmdetach(struct knote *kn)
695 {
696 vm_knote_unregister(kn);
697 }
698
699 static int
700 filt_vm(struct knote *kn, long hint)
701 {
702 /* hint == 0 means this is just an alive? check (always true) */
703 if (hint != 0) {
704 const pid_t pid = (pid_t)hint;
705 if ((kn->kn_sfflags & NOTE_VM_PRESSURE) &&
706 (kn->kn_kq->kq_p->p_pid == pid)) {
707 kn->kn_fflags |= NOTE_VM_PRESSURE;
708 }
709 }
710
711 return (kn->kn_fflags != 0);
712 }
713 #endif /* VM_PRESSURE_EVENTS */
714
715 /*
716 * filt_timervalidate - process data from user
717 *
718 * Converts to either interval or deadline format.
719 *
720 * The saved-data field in the knote contains the
721 * time value. The saved filter-flags indicates
722 * the unit of measurement.
723 *
724 * After validation, either the saved-data field
725 * contains the interval in absolute time, or ext[0]
726 * contains the expected deadline. If that deadline
727 * is in the past, ext[0] is 0.
728 *
729 * Returns EINVAL for unrecognized units of time.
730 *
731 * Timer filter lock is held.
732 *
733 */
734 static int
735 filt_timervalidate(struct knote *kn)
736 {
737 uint64_t multiplier;
738 uint64_t raw = 0;
739
740 switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS)) {
741 case NOTE_SECONDS:
742 multiplier = NSEC_PER_SEC;
743 break;
744 case NOTE_USECONDS:
745 multiplier = NSEC_PER_USEC;
746 break;
747 case NOTE_NSECONDS:
748 multiplier = 1;
749 break;
750 case 0: /* milliseconds (default) */
751 multiplier = NSEC_PER_SEC / 1000;
752 break;
753 default:
754 return (EINVAL);
755 }
756
757 /* transform the slop delta(leeway) in kn_ext[1] if passed to same time scale */
758 if(kn->kn_sfflags & NOTE_LEEWAY){
759 nanoseconds_to_absolutetime((uint64_t)kn->kn_ext[1] * multiplier, &raw);
760 kn->kn_ext[1] = raw;
761 }
762
763 nanoseconds_to_absolutetime((uint64_t)kn->kn_sdata * multiplier, &raw);
764
765 kn->kn_ext[0] = 0;
766 kn->kn_sdata = 0;
767
768 if (kn->kn_sfflags & NOTE_ABSOLUTE) {
769 clock_sec_t seconds;
770 clock_nsec_t nanoseconds;
771 uint64_t now;
772
773 clock_get_calendar_nanotime(&seconds, &nanoseconds);
774 nanoseconds_to_absolutetime((uint64_t)seconds * NSEC_PER_SEC +
775 nanoseconds, &now);
776
777 if (raw < now) {
778 /* time has already passed */
779 kn->kn_ext[0] = 0;
780 } else {
781 raw -= now;
782 clock_absolutetime_interval_to_deadline(raw,
783 &kn->kn_ext[0]);
784 }
785 } else {
786 kn->kn_sdata = raw;
787 }
788
789 return (0);
790 }
791
792 /*
793 * filt_timerupdate - compute the next deadline
794 *
795 * Repeating timers store their interval in kn_sdata. Absolute
796 * timers have already calculated the deadline, stored in ext[0].
797 *
798 * On return, the next deadline (or zero if no deadline is needed)
799 * is stored in kn_ext[0].
800 *
801 * Timer filter lock is held.
802 */
803 static void
804 filt_timerupdate(struct knote *kn)
805 {
806 /* if there's no interval, deadline is just in kn_ext[0] */
807 if (kn->kn_sdata == 0)
808 return;
809
810 /* if timer hasn't fired before, fire in interval nsecs */
811 if (kn->kn_ext[0] == 0) {
812 clock_absolutetime_interval_to_deadline(kn->kn_sdata,
813 &kn->kn_ext[0]);
814 } else {
815 /*
816 * If timer has fired before, schedule the next pop
817 * relative to the last intended deadline.
818 *
819 * We could check for whether the deadline has expired,
820 * but the thread call layer can handle that.
821 */
822 kn->kn_ext[0] += kn->kn_sdata;
823 }
824 }
825
826 /*
827 * filt_timerexpire - the timer callout routine
828 *
829 * Just propagate the timer event into the knote
830 * filter routine (by going through the knote
831 * synchronization point). Pass a hint to
832 * indicate this is a real event, not just a
833 * query from above.
834 */
835 static void
836 filt_timerexpire(void *knx, __unused void *spare)
837 {
838 struct klist timer_list;
839 struct knote *kn = knx;
840
841 filt_timerlock();
842
843 kn->kn_hookid &= ~TIMER_RUNNING;
844
845 /* no "object" for timers, so fake a list */
846 SLIST_INIT(&timer_list);
847 SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
848 KNOTE(&timer_list, 1);
849
850 /* if someone is waiting for timer to pop */
851 if (kn->kn_hookid & TIMER_CANCELWAIT) {
852 struct kqueue *kq = kn->kn_kq;
853 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
854 CAST_EVENT64_T(&kn->kn_hook),
855 THREAD_AWAKENED,
856 WAITQ_ALL_PRIORITIES);
857 }
858
859 filt_timerunlock();
860 }
861
862 /*
863 * Cancel a running timer (or wait for the pop).
864 * Timer filter lock is held.
865 */
866 static void
867 filt_timercancel(struct knote *kn)
868 {
869 struct kqueue *kq = kn->kn_kq;
870 thread_call_t callout = kn->kn_hook;
871 boolean_t cancelled;
872
873 if (kn->kn_hookid & TIMER_RUNNING) {
874 /* cancel the callout if we can */
875 cancelled = thread_call_cancel(callout);
876 if (cancelled) {
877 kn->kn_hookid &= ~TIMER_RUNNING;
878 } else {
879 /* we have to wait for the expire routine. */
880 kn->kn_hookid |= TIMER_CANCELWAIT;
881 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
882 CAST_EVENT64_T(&kn->kn_hook),
883 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
884 filt_timerunlock();
885 thread_block(THREAD_CONTINUE_NULL);
886 filt_timerlock();
887 assert((kn->kn_hookid & TIMER_RUNNING) == 0);
888 }
889 }
890 }
891
892 /*
893 * Allocate a thread call for the knote's lifetime, and kick off the timer.
894 */
895 static int
896 filt_timerattach(struct knote *kn)
897 {
898 thread_call_t callout;
899 int error;
900
901 callout = thread_call_allocate(filt_timerexpire, kn);
902 if (NULL == callout)
903 return (ENOMEM);
904
905 filt_timerlock();
906 error = filt_timervalidate(kn);
907 if (error != 0) {
908 filt_timerunlock();
909 return (error);
910 }
911
912 kn->kn_hook = (void*)callout;
913 kn->kn_hookid = 0;
914
915 /* absolute=EV_ONESHOT */
916 if (kn->kn_sfflags & NOTE_ABSOLUTE)
917 kn->kn_flags |= EV_ONESHOT;
918
919 filt_timerupdate(kn);
920 if (kn->kn_ext[0]) {
921 kn->kn_flags |= EV_CLEAR;
922 unsigned int timer_flags = 0;
923 if (kn->kn_sfflags & NOTE_CRITICAL)
924 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
925 else if (kn->kn_sfflags & NOTE_BACKGROUND)
926 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
927 else
928 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
929
930 if (kn->kn_sfflags & NOTE_LEEWAY)
931 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
932
933 thread_call_enter_delayed_with_leeway(callout, NULL,
934 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
935
936 kn->kn_hookid |= TIMER_RUNNING;
937 } else {
938 /* fake immediate */
939 kn->kn_data = 1;
940 }
941
942 filt_timerunlock();
943 return (0);
944 }
945
946 /*
947 * Shut down the timer if it's running, and free the callout.
948 */
949 static void
950 filt_timerdetach(struct knote *kn)
951 {
952 thread_call_t callout;
953
954 filt_timerlock();
955
956 callout = (thread_call_t)kn->kn_hook;
957 filt_timercancel(kn);
958
959 filt_timerunlock();
960
961 thread_call_free(callout);
962 }
963
964
965
966 static int
967 filt_timer(struct knote *kn, long hint)
968 {
969 int result;
970
971 if (hint) {
972 /* real timer pop -- timer lock held by filt_timerexpire */
973 kn->kn_data++;
974
975 if (((kn->kn_hookid & TIMER_CANCELWAIT) == 0) &&
976 ((kn->kn_flags & EV_ONESHOT) == 0)) {
977
978 /* evaluate next time to fire */
979 filt_timerupdate(kn);
980
981 if (kn->kn_ext[0]) {
982 unsigned int timer_flags = 0;
983
984 /* keep the callout and re-arm */
985 if (kn->kn_sfflags & NOTE_CRITICAL)
986 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
987 else if (kn->kn_sfflags & NOTE_BACKGROUND)
988 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
989 else
990 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
991
992 if (kn->kn_sfflags & NOTE_LEEWAY)
993 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
994
995 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
996 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
997
998 kn->kn_hookid |= TIMER_RUNNING;
999 }
1000 }
1001
1002 return (1);
1003 }
1004
1005 /* user-query */
1006 filt_timerlock();
1007
1008 result = (kn->kn_data != 0);
1009
1010 filt_timerunlock();
1011
1012 return (result);
1013 }
1014
1015
1016 /*
1017 * filt_timertouch - update knote with new user input
1018 *
1019 * Cancel and restart the timer based on new user data. When
1020 * the user picks up a knote, clear the count of how many timer
1021 * pops have gone off (in kn_data).
1022 */
1023 static void
1024 filt_timertouch(struct knote *kn, struct kevent_internal_s *kev, long type)
1025 {
1026 int error;
1027 filt_timerlock();
1028
1029 switch (type) {
1030 case EVENT_REGISTER:
1031 /* cancel current call */
1032 filt_timercancel(kn);
1033
1034 /* recalculate deadline */
1035 kn->kn_sdata = kev->data;
1036 kn->kn_sfflags = kev->fflags;
1037 kn->kn_ext[0] = kev->ext[0];
1038 kn->kn_ext[1] = kev->ext[1];
1039
1040 error = filt_timervalidate(kn);
1041 if (error) {
1042 /* no way to report error, so mark it in the knote */
1043 kn->kn_flags |= EV_ERROR;
1044 kn->kn_data = error;
1045 break;
1046 }
1047
1048 /* start timer if necessary */
1049 filt_timerupdate(kn);
1050
1051 if (kn->kn_ext[0]) {
1052 unsigned int timer_flags = 0;
1053 if (kn->kn_sfflags & NOTE_CRITICAL)
1054 timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
1055 else if (kn->kn_sfflags & NOTE_BACKGROUND)
1056 timer_flags |= THREAD_CALL_DELAY_USER_BACKGROUND;
1057 else
1058 timer_flags |= THREAD_CALL_DELAY_USER_NORMAL;
1059
1060 if (kn->kn_sfflags & NOTE_LEEWAY)
1061 timer_flags |= THREAD_CALL_DELAY_LEEWAY;
1062
1063 thread_call_enter_delayed_with_leeway(kn->kn_hook, NULL,
1064 kn->kn_ext[0], kn->kn_ext[1], timer_flags);
1065
1066 kn->kn_hookid |= TIMER_RUNNING;
1067 } else {
1068 /* pretend the timer has fired */
1069 kn->kn_data = 1;
1070 }
1071
1072 break;
1073
1074 case EVENT_PROCESS:
1075 /* reset the timer pop count in kn_data */
1076 *kev = kn->kn_kevent;
1077 kev->ext[0] = 0;
1078 kn->kn_data = 0;
1079 if (kn->kn_flags & EV_CLEAR)
1080 kn->kn_fflags = 0;
1081 break;
1082 default:
1083 panic("%s: - invalid type (%ld)", __func__, type);
1084 break;
1085 }
1086
1087 filt_timerunlock();
1088 }
1089
1090 static void
1091 filt_timerlock(void)
1092 {
1093 lck_mtx_lock(&_filt_timerlock);
1094 }
1095
1096 static void
1097 filt_timerunlock(void)
1098 {
1099 lck_mtx_unlock(&_filt_timerlock);
1100 }
1101
1102 static int
1103 filt_userattach(struct knote *kn)
1104 {
1105 /* EVFILT_USER knotes are not attached to anything in the kernel */
1106 kn->kn_hook = NULL;
1107 if (kn->kn_fflags & NOTE_TRIGGER) {
1108 kn->kn_hookid = 1;
1109 } else {
1110 kn->kn_hookid = 0;
1111 }
1112 return (0);
1113 }
1114
1115 static void
1116 filt_userdetach(__unused struct knote *kn)
1117 {
1118 /* EVFILT_USER knotes are not attached to anything in the kernel */
1119 }
1120
1121 static int
1122 filt_user(struct knote *kn, __unused long hint)
1123 {
1124 return (kn->kn_hookid);
1125 }
1126
1127 static void
1128 filt_usertouch(struct knote *kn, struct kevent_internal_s *kev, long type)
1129 {
1130 uint32_t ffctrl;
1131 switch (type) {
1132 case EVENT_REGISTER:
1133 if (kev->fflags & NOTE_TRIGGER) {
1134 kn->kn_hookid = 1;
1135 }
1136
1137 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1138 kev->fflags &= NOTE_FFLAGSMASK;
1139 switch (ffctrl) {
1140 case NOTE_FFNOP:
1141 break;
1142 case NOTE_FFAND:
1143 OSBitAndAtomic(kev->fflags, &kn->kn_sfflags);
1144 break;
1145 case NOTE_FFOR:
1146 OSBitOrAtomic(kev->fflags, &kn->kn_sfflags);
1147 break;
1148 case NOTE_FFCOPY:
1149 kn->kn_sfflags = kev->fflags;
1150 break;
1151 }
1152 kn->kn_sdata = kev->data;
1153 break;
1154 case EVENT_PROCESS:
1155 *kev = kn->kn_kevent;
1156 kev->fflags = (volatile UInt32)kn->kn_sfflags;
1157 kev->data = kn->kn_sdata;
1158 if (kn->kn_flags & EV_CLEAR) {
1159 kn->kn_hookid = 0;
1160 kn->kn_data = 0;
1161 kn->kn_fflags = 0;
1162 }
1163 break;
1164 default:
1165 panic("%s: - invalid type (%ld)", __func__, type);
1166 break;
1167 }
1168 }
1169
1170 /*
1171 * JMM - placeholder for not-yet-implemented filters
1172 */
1173 static int
1174 filt_badattach(__unused struct knote *kn)
1175 {
1176 return (ENOTSUP);
1177 }
1178
1179 struct kqueue *
1180 kqueue_alloc(struct proc *p)
1181 {
1182 struct filedesc *fdp = p->p_fd;
1183 struct kqueue *kq;
1184
1185 MALLOC_ZONE(kq, struct kqueue *, sizeof (struct kqueue), M_KQUEUE,
1186 M_WAITOK);
1187 if (kq != NULL) {
1188 struct waitq_set *wqs;
1189
1190 wqs = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST | SYNC_POLICY_DISABLE_IRQ);
1191 if (wqs != NULL) {
1192 bzero(kq, sizeof (struct kqueue));
1193 lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
1194 TAILQ_INIT(&kq->kq_head);
1195 kq->kq_wqs = wqs;
1196 kq->kq_p = p;
1197 } else {
1198 FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
1199 kq = NULL;
1200 }
1201 }
1202
1203 if (fdp->fd_knlistsize < 0) {
1204 proc_fdlock(p);
1205 if (fdp->fd_knlistsize < 0)
1206 fdp->fd_knlistsize = 0; /* this process has had a kq */
1207 proc_fdunlock(p);
1208 }
1209
1210 return (kq);
1211 }
1212
1213 /*
1214 * kqueue_dealloc - detach all knotes from a kqueue and free it
1215 *
1216 * We walk each list looking for knotes referencing this
1217 * this kqueue. If we find one, we try to drop it. But
1218 * if we fail to get a drop reference, that will wait
1219 * until it is dropped. So, we can just restart again
1220 * safe in the assumption that the list will eventually
1221 * not contain any more references to this kqueue (either
1222 * we dropped them all, or someone else did).
1223 *
1224 * Assumes no new events are being added to the kqueue.
1225 * Nothing locked on entry or exit.
1226 */
1227 void
1228 kqueue_dealloc(struct kqueue *kq)
1229 {
1230 struct proc *p;
1231 struct filedesc *fdp;
1232 struct knote *kn;
1233 int i;
1234
1235 if (kq == NULL)
1236 return;
1237
1238 p = kq->kq_p;
1239 fdp = p->p_fd;
1240
1241 proc_fdlock(p);
1242 for (i = 0; i < fdp->fd_knlistsize; i++) {
1243 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1244 while (kn != NULL) {
1245 if (kq == kn->kn_kq) {
1246 kqlock(kq);
1247 proc_fdunlock(p);
1248 /* drop it ourselves or wait */
1249 if (kqlock2knotedrop(kq, kn)) {
1250 kn->kn_fop->f_detach(kn);
1251 knote_drop(kn, p);
1252 }
1253 proc_fdlock(p);
1254 /* start over at beginning of list */
1255 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
1256 continue;
1257 }
1258 kn = SLIST_NEXT(kn, kn_link);
1259 }
1260 }
1261 if (fdp->fd_knhashmask != 0) {
1262 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
1263 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1264 while (kn != NULL) {
1265 if (kq == kn->kn_kq) {
1266 kqlock(kq);
1267 proc_fdunlock(p);
1268 /* drop it ourselves or wait */
1269 if (kqlock2knotedrop(kq, kn)) {
1270 kn->kn_fop->f_detach(kn);
1271 knote_drop(kn, p);
1272 }
1273 proc_fdlock(p);
1274 /* start over at beginning of list */
1275 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
1276 continue;
1277 }
1278 kn = SLIST_NEXT(kn, kn_link);
1279 }
1280 }
1281 }
1282 proc_fdunlock(p);
1283
1284 /*
1285 * waitq_set_free() clears all preposts and also remove the KQ's
1286 * waitq set from any select sets to which it may belong.
1287 */
1288 waitq_set_free(kq->kq_wqs);
1289 kq->kq_wqs = NULL;
1290 lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
1291 FREE_ZONE(kq, sizeof (struct kqueue), M_KQUEUE);
1292 }
1293
1294 int
1295 kqueue_body(struct proc *p, fp_allocfn_t fp_zalloc, void *cra, int32_t *retval)
1296 {
1297 struct kqueue *kq;
1298 struct fileproc *fp;
1299 int fd, error;
1300
1301 error = falloc_withalloc(p,
1302 &fp, &fd, vfs_context_current(), fp_zalloc, cra);
1303 if (error) {
1304 return (error);
1305 }
1306
1307 kq = kqueue_alloc(p);
1308 if (kq == NULL) {
1309 fp_free(p, fd, fp);
1310 return (ENOMEM);
1311 }
1312
1313 fp->f_flag = FREAD | FWRITE;
1314 fp->f_ops = &kqueueops;
1315 fp->f_data = kq;
1316
1317 proc_fdlock(p);
1318 *fdflags(p, fd) |= UF_EXCLOSE;
1319 procfdtbl_releasefd(p, fd, NULL);
1320 fp_drop(p, fd, fp, 1);
1321 proc_fdunlock(p);
1322
1323 *retval = fd;
1324 return (error);
1325 }
1326
1327 int
1328 kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
1329 {
1330 return (kqueue_body(p, fileproc_alloc_init, NULL, retval));
1331 }
1332
1333 static int
1334 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
1335 unsigned int flags)
1336 {
1337 int advance;
1338 int error;
1339
1340 if (flags & KEVENT_FLAG_LEGACY32) {
1341 bzero(kevp, sizeof (*kevp));
1342
1343 if (IS_64BIT_PROCESS(p)) {
1344 struct user64_kevent kev64;
1345
1346 advance = sizeof (kev64);
1347 error = copyin(*addrp, (caddr_t)&kev64, advance);
1348 if (error)
1349 return (error);
1350 kevp->ident = kev64.ident;
1351 kevp->filter = kev64.filter;
1352 kevp->flags = kev64.flags;
1353 kevp->udata = kev64.udata;
1354 kevp->fflags = kev64.fflags;
1355 kevp->data = kev64.data;
1356 } else {
1357 struct user32_kevent kev32;
1358
1359 advance = sizeof (kev32);
1360 error = copyin(*addrp, (caddr_t)&kev32, advance);
1361 if (error)
1362 return (error);
1363 kevp->ident = (uintptr_t)kev32.ident;
1364 kevp->filter = kev32.filter;
1365 kevp->flags = kev32.flags;
1366 kevp->udata = CAST_USER_ADDR_T(kev32.udata);
1367 kevp->fflags = kev32.fflags;
1368 kevp->data = (intptr_t)kev32.data;
1369 }
1370 } else if (flags & KEVENT_FLAG_LEGACY64) {
1371 struct kevent64_s kev64;
1372
1373 bzero(kevp, sizeof (*kevp));
1374
1375 advance = sizeof (struct kevent64_s);
1376 error = copyin(*addrp, (caddr_t)&kev64, advance);
1377 if (error)
1378 return(error);
1379 kevp->ident = kev64.ident;
1380 kevp->filter = kev64.filter;
1381 kevp->flags = kev64.flags;
1382 kevp->udata = kev64.udata;
1383 kevp->fflags = kev64.fflags;
1384 kevp->data = kev64.data;
1385 kevp->ext[0] = kev64.ext[0];
1386 kevp->ext[1] = kev64.ext[1];
1387
1388 } else {
1389 struct kevent_qos_s kevqos;
1390
1391 bzero(kevp, sizeof (*kevp));
1392
1393 advance = sizeof (struct kevent_qos_s);
1394 error = copyin(*addrp, (caddr_t)&kevqos, advance);
1395 if (error)
1396 return error;
1397 kevp->ident = kevqos.ident;
1398 kevp->filter = kevqos.filter;
1399 kevp->flags = kevqos.flags;
1400 kevp->udata = kevqos.udata;
1401 kevp->fflags = kevqos.fflags;
1402 kevp->data = kevqos.data;
1403 kevp->ext[0] = kevqos.ext[0];
1404 kevp->ext[1] = kevqos.ext[1];
1405 }
1406 if (!error)
1407 *addrp += advance;
1408 return (error);
1409 }
1410
1411 static int
1412 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
1413 unsigned int flags)
1414 {
1415 user_addr_t addr = *addrp;
1416 int advance;
1417 int error;
1418
1419 if (flags & KEVENT_FLAG_LEGACY32) {
1420 assert((flags & KEVENT_FLAG_STACK_EVENTS) == 0);
1421
1422 if (IS_64BIT_PROCESS(p)) {
1423 struct user64_kevent kev64;
1424
1425 /*
1426 * deal with the special case of a user-supplied
1427 * value of (uintptr_t)-1.
1428 */
1429 kev64.ident = (kevp->ident == (uintptr_t)-1) ?
1430 (uint64_t)-1LL : (uint64_t)kevp->ident;
1431
1432 kev64.filter = kevp->filter;
1433 kev64.flags = kevp->flags;
1434 kev64.fflags = kevp->fflags;
1435 kev64.data = (int64_t) kevp->data;
1436 kev64.udata = kevp->udata;
1437 advance = sizeof (kev64);
1438 error = copyout((caddr_t)&kev64, addr, advance);
1439 } else {
1440 struct user32_kevent kev32;
1441
1442 kev32.ident = (uint32_t)kevp->ident;
1443 kev32.filter = kevp->filter;
1444 kev32.flags = kevp->flags;
1445 kev32.fflags = kevp->fflags;
1446 kev32.data = (int32_t)kevp->data;
1447 kev32.udata = kevp->udata;
1448 advance = sizeof (kev32);
1449 error = copyout((caddr_t)&kev32, addr, advance);
1450 }
1451 } else if (flags & KEVENT_FLAG_LEGACY64) {
1452 struct kevent64_s kev64;
1453
1454 advance = sizeof (struct kevent64_s);
1455 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1456 addr -= advance;
1457 }
1458 kev64.ident = kevp->ident;
1459 kev64.filter = kevp->filter;
1460 kev64.flags = kevp->flags;
1461 kev64.fflags = kevp->fflags;
1462 kev64.data = (int64_t) kevp->data;
1463 kev64.udata = kevp->udata;
1464 kev64.ext[0] = kevp->ext[0];
1465 kev64.ext[1] = kevp->ext[1];
1466 error = copyout((caddr_t)&kev64, addr, advance);
1467 } else {
1468 struct kevent_qos_s kevqos;
1469
1470 bzero(&kevqos, sizeof (struct kevent_qos_s));
1471 advance = sizeof (struct kevent_qos_s);
1472 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1473 addr -= advance;
1474 }
1475 kevqos.ident = kevp->ident;
1476 kevqos.filter = kevp->filter;
1477 kevqos.flags = kevp->flags;
1478 kevqos.fflags = kevp->fflags;
1479 kevqos.data = (int64_t) kevp->data;
1480 kevqos.udata = kevp->udata;
1481 kevqos.ext[0] = kevp->ext[0];
1482 kevqos.ext[1] = kevp->ext[1];
1483 error = copyout((caddr_t)&kevqos, addr, advance);
1484 }
1485 if (!error) {
1486 if (flags & KEVENT_FLAG_STACK_EVENTS)
1487 *addrp = addr;
1488 else
1489 *addrp = addr + advance;
1490 }
1491 return (error);
1492 }
1493
1494 /*
1495 * kevent_continue - continue a kevent syscall after blocking
1496 *
1497 * assume we inherit a use count on the kq fileglob.
1498 */
1499
1500 static void
1501 kevent_continue(__unused struct kqueue *kq, void *data, int error)
1502 {
1503 struct _kevent *cont_args;
1504 struct fileproc *fp;
1505 int32_t *retval;
1506 int noutputs;
1507 int fd;
1508 struct proc *p = current_proc();
1509
1510 cont_args = (struct _kevent *)data;
1511 noutputs = cont_args->eventout;
1512 retval = cont_args->retval;
1513 fd = cont_args->fd;
1514 fp = cont_args->fp;
1515
1516 if (fp != NULL)
1517 fp_drop(p, fd, fp, 0);
1518
1519 /* don't restart after signals... */
1520 if (error == ERESTART)
1521 error = EINTR;
1522 else if (error == EWOULDBLOCK)
1523 error = 0;
1524 if (error == 0)
1525 *retval = noutputs;
1526 unix_syscall_return(error);
1527 }
1528
1529 /*
1530 * kevent - [syscall] register and wait for kernel events
1531 *
1532 */
1533 int
1534 kevent(struct proc *p, struct kevent_args *uap, int32_t *retval)
1535 {
1536 unsigned int flags = KEVENT_FLAG_LEGACY32;
1537
1538 return kevent_internal(p,
1539 uap->fd,
1540 uap->changelist, uap->nchanges,
1541 uap->eventlist, uap->nevents,
1542 0ULL, 0ULL,
1543 flags,
1544 uap->timeout,
1545 kevent_continue,
1546 retval);
1547 }
1548
1549 int
1550 kevent64(struct proc *p, struct kevent64_args *uap, int32_t *retval)
1551 {
1552 unsigned int flags;
1553
1554 /* restrict to user flags and set legacy64 */
1555 flags = uap->flags & KEVENT_FLAG_USER;
1556 flags |= KEVENT_FLAG_LEGACY64;
1557
1558 return kevent_internal(p,
1559 uap->fd,
1560 uap->changelist, uap->nchanges,
1561 uap->eventlist, uap->nevents,
1562 0ULL, 0ULL,
1563 flags,
1564 uap->timeout,
1565 kevent_continue,
1566 retval);
1567 }
1568
1569 int
1570 kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
1571 {
1572 user_size_t usize = 0;
1573 user_size_t ssize;
1574 int error;
1575
1576 /* restrict to user flags */
1577 uap->flags &= KEVENT_FLAG_USER;
1578
1579 if (uap->data_available) {
1580 if (!IS_64BIT_PROCESS(p)) {
1581 uint32_t csize;
1582
1583 error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize));
1584 if (error)
1585 return error;
1586 usize = csize;
1587 } else {
1588 uint64_t csize;
1589 error = copyin(uap->data_available, (caddr_t)&csize, sizeof(csize));
1590 if (error)
1591 return error;
1592 usize = csize;
1593 }
1594 }
1595 ssize = usize;
1596
1597 error = kevent_internal(p,
1598 uap->fd,
1599 uap->changelist, uap->nchanges,
1600 uap->eventlist, uap->nevents,
1601 uap->data_out, &usize,
1602 uap->flags,
1603 0ULL,
1604 kevent_continue,
1605 retval);
1606
1607 if (error == 0 && uap->data_available && usize != ssize) {
1608 if (!IS_64BIT_PROCESS(p)) {
1609 uint32_t csize = (uint32_t)usize;
1610
1611 error = copyout((caddr_t)&csize, uap->data_available, sizeof(csize));
1612 } else {
1613 error = copyout((caddr_t)&usize, uap->data_available, sizeof(usize));
1614 }
1615 }
1616 return error;
1617 }
1618
1619 int
1620 kevent_qos_internal(struct proc *p, int fd,
1621 user_addr_t changelist, int nchanges,
1622 user_addr_t eventlist, int nevents,
1623 user_addr_t data_out, user_size_t *data_available,
1624 unsigned int flags,
1625 int32_t *retval)
1626 {
1627 return kevent_internal(p,
1628 fd,
1629 changelist, nchanges,
1630 eventlist, nevents,
1631 data_out, data_available,
1632 flags,
1633 0ULL,
1634 NULL,
1635 retval);
1636 }
1637
1638 static int
1639 kevent_internal(struct proc *p,
1640 int fd,
1641 user_addr_t changelist, int nchanges,
1642 user_addr_t ueventlist, int nevents,
1643 user_addr_t data_out, user_size_t *data_available,
1644 unsigned int flags,
1645 user_addr_t utimeout,
1646 kqueue_continue_t continuation,
1647 int32_t *retval)
1648 {
1649 struct _kevent *cont_args;
1650 uthread_t ut;
1651 struct kqueue *kq;
1652 struct fileproc *fp = NULL;
1653 struct kevent_internal_s kev;
1654 int error, noutputs;
1655 struct timeval atv;
1656
1657 #if 1
1658 /* temporarily ignore these fields */
1659 (void)data_out;
1660 (void)data_available;
1661 #endif
1662
1663 /* prepare to deal with stack-wise allocation of out events */
1664 if (flags & KEVENT_FLAG_STACK_EVENTS) {
1665 int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
1666 (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
1667 sizeof(struct user32_kevent)) :
1668 ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
1669 sizeof(struct kevent_qos_s)));
1670 ueventlist += nevents * scale;
1671 }
1672
1673 /* convert timeout to absolute - if we have one (and not immediate) */
1674 if (flags & KEVENT_FLAG_IMMEDIATE) {
1675 getmicrouptime(&atv);
1676 } else if (utimeout != USER_ADDR_NULL) {
1677 struct timeval rtv;
1678 if (IS_64BIT_PROCESS(p)) {
1679 struct user64_timespec ts;
1680 error = copyin(utimeout, &ts, sizeof(ts));
1681 if ((ts.tv_sec & 0xFFFFFFFF00000000ull) != 0)
1682 error = EINVAL;
1683 else
1684 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1685 } else {
1686 struct user32_timespec ts;
1687 error = copyin(utimeout, &ts, sizeof(ts));
1688 TIMESPEC_TO_TIMEVAL(&rtv, &ts);
1689 }
1690 if (error)
1691 return (error);
1692 if (itimerfix(&rtv))
1693 return (EINVAL);
1694 getmicrouptime(&atv);
1695 timevaladd(&atv, &rtv);
1696 } else {
1697 /* wait forever value */
1698 atv.tv_sec = 0;
1699 atv.tv_usec = 0;
1700 }
1701
1702 if (flags & KEVENT_FLAG_WORKQ) {
1703 /*
1704 * use the private kq associated with the proc workq.
1705 * Just being a thread within the process (and not
1706 * being the exit/exec thread) is enough to hold a
1707 * reference on this special kq.
1708 */
1709 kq = p->p_wqkqueue;
1710 if (kq == NULL) {
1711 struct kqueue *alloc_kq = kqueue_alloc(p);
1712 if (alloc_kq == NULL)
1713 return ENOMEM;
1714
1715 proc_fdlock(p);
1716 if (p->p_wqkqueue == NULL) {
1717 /*
1718 * The kq is marked as special -
1719 * with unique interactions with
1720 * the workq for this process.
1721 */
1722 alloc_kq->kq_state |= KQ_WORKQ;
1723 kq = p->p_wqkqueue = alloc_kq;
1724 proc_fdunlock(p);
1725 } else {
1726 proc_fdunlock(p);
1727 kq = p->p_wqkqueue;
1728 kqueue_dealloc(alloc_kq);
1729 }
1730 }
1731 } else {
1732 /* get a usecount for the kq itself */
1733 if ((error = fp_getfkq(p, fd, &fp, &kq)) != 0)
1734 return (error);
1735 }
1736
1737 /* each kq should only be used for events of one type */
1738 kqlock(kq);
1739 if (kq->kq_state & (KQ_KEV32 | KQ_KEV64 | KQ_KEV_QOS)) {
1740 if (flags & KEVENT_FLAG_LEGACY32) {
1741 if ((kq->kq_state & KQ_KEV32) == 0) {
1742 error = EINVAL;
1743 kqunlock(kq);
1744 goto errorout;
1745 }
1746 } else if (kq->kq_state & KQ_KEV32) {
1747 error = EINVAL;
1748 kqunlock(kq);
1749 goto errorout;
1750 }
1751 } else if (flags & KEVENT_FLAG_LEGACY32) {
1752 kq->kq_state |= KQ_KEV32;
1753 } else {
1754 /* JMM - set KQ_KEVQOS when we are ready for exclusive */
1755 kq->kq_state |= KQ_KEV64;
1756 }
1757 kqunlock(kq);
1758
1759 /* register all the change requests the user provided... */
1760 noutputs = 0;
1761 while (nchanges > 0 && error == 0) {
1762 error = kevent_copyin(&changelist, &kev, p, flags);
1763 if (error)
1764 break;
1765
1766 kev.flags &= ~EV_SYSFLAGS;
1767 error = kevent_register(kq, &kev, p);
1768 if ((error || (kev.flags & EV_RECEIPT)) && nevents > 0) {
1769 kev.flags = EV_ERROR;
1770 kev.data = error;
1771 error = kevent_copyout(&kev, &ueventlist, p, flags);
1772 if (error == 0) {
1773 nevents--;
1774 noutputs++;
1775 }
1776 }
1777 nchanges--;
1778 }
1779
1780 /* short-circuit the scan if we only want error events */
1781 if (flags & KEVENT_FLAG_ERROR_EVENTS)
1782 nevents = 0;
1783
1784 if (nevents > 0 && noutputs == 0 && error == 0) {
1785
1786 /* store the continuation/completion data in the uthread */
1787 ut = (uthread_t)get_bsdthread_info(current_thread());
1788 cont_args = &ut->uu_kevent.ss_kevent;
1789 cont_args->fp = fp;
1790 cont_args->fd = fd;
1791 cont_args->retval = retval;
1792 cont_args->eventlist = ueventlist;
1793 cont_args->eventcount = nevents;
1794 cont_args->eventout = noutputs;
1795 cont_args->eventflags = flags;
1796
1797 error = kqueue_scan(kq, kevent_callback,
1798 continuation, cont_args,
1799 &atv, p);
1800
1801 noutputs = cont_args->eventout;
1802 }
1803
1804 /* don't restart after signals... */
1805 if (error == ERESTART)
1806 error = EINTR;
1807 else if (error == EWOULDBLOCK)
1808 error = 0;
1809 if (error == 0)
1810 *retval = noutputs;
1811 errorout:
1812 if (fp != NULL)
1813 fp_drop(p, fd, fp, 0);
1814 return (error);
1815 }
1816
1817
1818 /*
1819 * kevent_callback - callback for each individual event
1820 *
1821 * called with nothing locked
1822 * caller holds a reference on the kqueue
1823 */
1824 static int
1825 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
1826 void *data)
1827 {
1828 struct _kevent *cont_args;
1829 int error;
1830
1831 cont_args = (struct _kevent *)data;
1832 assert(cont_args->eventout < cont_args->eventcount);
1833
1834 /*
1835 * Copy out the appropriate amount of event data for this user.
1836 */
1837 error = kevent_copyout(kevp, &cont_args->eventlist, current_proc(),
1838 cont_args->eventflags);
1839
1840 /*
1841 * If there isn't space for additional events, return
1842 * a harmless error to stop the processing here
1843 */
1844 if (error == 0 && ++cont_args->eventout == cont_args->eventcount)
1845 error = EWOULDBLOCK;
1846 return (error);
1847 }
1848
1849 /*
1850 * kevent_description - format a description of a kevent for diagnostic output
1851 *
1852 * called with a 256-byte string buffer
1853 */
1854
1855 char *
1856 kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
1857 {
1858 snprintf(s, n,
1859 "kevent="
1860 "{.ident=%#llx, .filter=%d, .flags=%#x, .udata=%#llx, .fflags=%#x, .data=%#llx, .ext[0]=%#llx, .ext[1]=%#llx}",
1861 kevp->ident,
1862 kevp->filter,
1863 kevp->flags,
1864 kevp->udata,
1865 kevp->fflags,
1866 kevp->data,
1867 kevp->ext[0],
1868 kevp->ext[1] );
1869
1870 return (s);
1871 }
1872
1873 /*
1874 * kevent_register - add a new event to a kqueue
1875 *
1876 * Creates a mapping between the event source and
1877 * the kqueue via a knote data structure.
1878 *
1879 * Because many/most the event sources are file
1880 * descriptor related, the knote is linked off
1881 * the filedescriptor table for quick access.
1882 *
1883 * called with nothing locked
1884 * caller holds a reference on the kqueue
1885 */
1886
1887 int
1888 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
1889 __unused struct proc *ctxp)
1890 {
1891 struct proc *p = kq->kq_p;
1892 struct filedesc *fdp = p->p_fd;
1893 struct filterops *fops;
1894 struct fileproc *fp = NULL;
1895 struct knote *kn = NULL;
1896 struct klist *list;
1897 int error = 0;
1898
1899 if (kev->filter < 0) {
1900 if (kev->filter + EVFILT_SYSCOUNT < 0)
1901 return (EINVAL);
1902 fops = sysfilt_ops[~kev->filter]; /* to 0-base index */
1903 } else {
1904 return (EINVAL);
1905 }
1906
1907 restart:
1908 /* this iocount needs to be dropped if it is not registered */
1909 list = NULL;
1910 proc_fdlock(p);
1911
1912 /*
1913 * determine where to look for the knote
1914 */
1915 if (fops->f_isfd) {
1916 if ((error = fp_lookup(p, kev->ident, &fp, 1)) != 0) {
1917 proc_fdunlock(p);
1918 return (error);
1919 }
1920 /* fd-based knotes are linked off the fd table */
1921 if (kev->ident < (u_int)fdp->fd_knlistsize) {
1922 list = &fdp->fd_knlist[kev->ident];
1923 }
1924 } else if (fdp->fd_knhashmask != 0) {
1925 /* hash non-fd knotes here too */
1926 list = &fdp->fd_knhash[KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1927 }
1928
1929 /*
1930 * scan the selected list looking for a match
1931 */
1932 if (list != NULL) {
1933 SLIST_FOREACH(kn, list, kn_link) {
1934 if (kq == kn->kn_kq &&
1935 kev->ident == kn->kn_id &&
1936 kev->filter == kn->kn_filter) {
1937 if (kev->flags & EV_UDATA_SPECIFIC) {
1938 if ((kn->kn_flags & EV_UDATA_SPECIFIC) &&
1939 kev->udata == kn->kn_udata) {
1940 break; /* matching udata-specific knote */
1941 }
1942 } else if ((kn->kn_flags & EV_UDATA_SPECIFIC) == 0) {
1943 break; /* matching non-udata-specific knote */
1944 }
1945 }
1946 }
1947 }
1948
1949 /*
1950 * kn now contains the matching knote, or NULL if no match
1951 */
1952 if (kn == NULL) {
1953 if ((kev->flags & (EV_ADD|EV_DELETE)) == EV_ADD) {
1954 kn = knote_alloc();
1955 if (kn == NULL) {
1956 proc_fdunlock(p);
1957 error = ENOMEM;
1958 goto done;
1959 }
1960 kn->kn_fp = fp;
1961 kn->kn_kq = kq;
1962 kn->kn_tq = &kq->kq_head;
1963 kn->kn_fop = fops;
1964 kn->kn_sfflags = kev->fflags;
1965 kn->kn_sdata = kev->data;
1966 kev->fflags = 0;
1967 kev->data = 0;
1968 kn->kn_kevent = *kev;
1969 kn->kn_inuse = 1; /* for f_attach() */
1970 kn->kn_status = KN_ATTACHING;
1971
1972 /* before anyone can find it */
1973 if (kev->flags & EV_DISABLE)
1974 kn->kn_status |= KN_DISABLED;
1975
1976 error = knote_fdpattach(kn, fdp, p);
1977 proc_fdunlock(p);
1978
1979 if (error) {
1980 knote_free(kn);
1981 goto done;
1982 }
1983
1984 /*
1985 * apply reference count to knote structure, and
1986 * do not release it at the end of this routine.
1987 */
1988 fp = NULL;
1989
1990 error = fops->f_attach(kn);
1991
1992 kqlock(kq);
1993
1994 if (error != 0) {
1995 /*
1996 * Failed to attach correctly, so drop.
1997 * All other possible users/droppers
1998 * have deferred to us.
1999 */
2000 kn->kn_status |= KN_DROPPING;
2001 kqunlock(kq);
2002 knote_drop(kn, p);
2003 goto done;
2004 } else if (kn->kn_status & KN_DROPPING) {
2005 /*
2006 * Attach succeeded, but someone else
2007 * deferred their drop - now we have
2008 * to do it for them (after detaching).
2009 */
2010 kqunlock(kq);
2011 kn->kn_fop->f_detach(kn);
2012 knote_drop(kn, p);
2013 goto done;
2014 }
2015 kn->kn_status &= ~KN_ATTACHING;
2016 kqunlock(kq);
2017 } else {
2018 proc_fdunlock(p);
2019 error = ENOENT;
2020 goto done;
2021 }
2022 } else {
2023 /* existing knote - get kqueue lock */
2024 kqlock(kq);
2025 proc_fdunlock(p);
2026
2027 if (kev->flags & EV_DELETE) {
2028 if ((kev->flags & EV_ENABLE) == 0 &&
2029 (kev->flags & EV_DISPATCH2) == EV_DISPATCH2 &&
2030 (kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2031 /* mark for deferred drop */
2032 kn->kn_status |= KN_DEFERDROP;
2033 kqunlock(kq);
2034 error = EINPROGRESS;
2035 } else {
2036 knote_dequeue(kn);
2037 kn->kn_status |= KN_DISABLED;
2038 if (kqlock2knotedrop(kq, kn)) {
2039 kn->kn_fop->f_detach(kn);
2040 knote_drop(kn, p);
2041 } else {
2042 /* pretend we didn't find it */
2043 error = ENOENT;
2044 }
2045 }
2046 goto done;
2047 }
2048
2049 /* update status flags for existing knote */
2050 if (kev->flags & EV_DISABLE) {
2051 knote_dequeue(kn);
2052 kn->kn_status |= KN_DISABLED;
2053
2054 } else if ((kev->flags & EV_ENABLE) &&
2055 (kn->kn_status & KN_DISABLED)) {
2056 kn->kn_status &= ~KN_DISABLED;
2057
2058 /* handle deferred drop */
2059 if (kn->kn_status & KN_DEFERDROP) {
2060 kn->kn_status &= ~KN_DEFERDROP;
2061 kn->kn_flags |= (EV_DELETE | EV_ONESHOT);
2062 knote_activate(kn, 0);
2063 kqunlock(kq);
2064 goto done;
2065 }
2066
2067 if (kn->kn_status & KN_ACTIVE) {
2068 /* force re-activate if previously active */
2069 knote_activate(kn, 1);
2070 }
2071 }
2072
2073 /*
2074 * The user may change some filter values after the
2075 * initial EV_ADD, but doing so will not reset any
2076 * filter which have already been triggered.
2077 */
2078 kn->kn_kevent.udata = kev->udata;
2079 if (fops->f_isfd || fops->f_touch == NULL) {
2080 kn->kn_sfflags = kev->fflags;
2081 kn->kn_sdata = kev->data;
2082 }
2083
2084 /*
2085 * If somebody is in the middle of dropping this
2086 * knote - go find/insert a new one. But we have
2087 * wait for this one to go away first. Attaches
2088 * running in parallel may also drop/modify the
2089 * knote. Wait for those to complete as well and
2090 * then start over if we encounter one.
2091 */
2092 if (!kqlock2knoteusewait(kq, kn)) {
2093 /* kqueue, proc_fdlock both unlocked */
2094 goto restart;
2095 }
2096
2097 /*
2098 * Call touch routine to notify filter of changes
2099 * in filter values.
2100 */
2101 if (!fops->f_isfd && fops->f_touch != NULL)
2102 fops->f_touch(kn, kev, EVENT_REGISTER);
2103 }
2104 /* still have use ref on knote */
2105
2106 /*
2107 * Invoke the filter routine to see if it should be enqueued now.
2108 */
2109 #if 0
2110 if (kn->kn_fop->f_event(kn, 0)) {
2111 #else
2112 /*
2113 * JMM - temporary workaround until rdar://problem/19986199
2114 * This potentially results in extra wakeups for KN_STAYQUEUED event types,
2115 * but waking up only truly active ones (yet trying below to determine
2116 * active status, by invoking the filter routine, is having side-effects).
2117 */
2118 if ((kn->kn_status & KN_STAYQUEUED) || kn->kn_fop->f_event(kn, 0)) {
2119 #endif
2120 if (knoteuse2kqlock(kq, kn))
2121 knote_activate(kn, (kn->kn_status & KN_STAYQUEUED));
2122 kqunlock(kq);
2123 } else {
2124 knote_put(kn);
2125 }
2126
2127 done:
2128 if (fp != NULL)
2129 fp_drop(p, kev->ident, fp, 0);
2130 return (error);
2131 }
2132
2133
2134 /*
2135 * knote_process - process a triggered event
2136 *
2137 * Validate that it is really still a triggered event
2138 * by calling the filter routines (if necessary). Hold
2139 * a use reference on the knote to avoid it being detached.
2140 * If it is still considered triggered, invoke the callback
2141 * routine provided and move it to the provided inprocess
2142 * queue.
2143 *
2144 * caller holds a reference on the kqueue.
2145 * kqueue locked on entry and exit - but may be dropped
2146 */
2147 static int
2148 knote_process(struct knote *kn,
2149 kevent_callback_t callback,
2150 void *data,
2151 struct kqtailq *inprocessp,
2152 struct proc *p)
2153 {
2154 struct kqueue *kq = kn->kn_kq;
2155 struct kevent_internal_s kev;
2156 int touch;
2157 int result;
2158 int error;
2159
2160 /*
2161 * Determine the kevent state we want to return.
2162 *
2163 * Some event states need to be revalidated before returning
2164 * them, others we take the snapshot at the time the event
2165 * was enqueued.
2166 *
2167 * Events with non-NULL f_touch operations must be touched.
2168 * Triggered events must fill in kev for the callback.
2169 *
2170 * Convert our lock to a use-count and call the event's
2171 * filter routine(s) to update.
2172 */
2173 if ((kn->kn_status & KN_DISABLED) != 0) {
2174 result = 0;
2175 touch = 0;
2176 } else {
2177 int revalidate;
2178
2179 result = 1;
2180 revalidate = ((kn->kn_status & KN_STAYQUEUED) != 0 ||
2181 (kn->kn_flags & EV_ONESHOT) == 0);
2182 touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL);
2183
2184 if (revalidate || touch) {
2185 if (revalidate)
2186 knote_deactivate(kn);
2187
2188 /* call the filter/touch routines with just a ref */
2189 if (kqlock2knoteuse(kq, kn)) {
2190 /* if we have to revalidate, call the filter */
2191 if (revalidate) {
2192 result = kn->kn_fop->f_event(kn, 0);
2193 }
2194
2195 /*
2196 * capture the kevent data - using touch if
2197 * specified
2198 */
2199 if (result && touch) {
2200 kn->kn_fop->f_touch(kn, &kev,
2201 EVENT_PROCESS);
2202 }
2203 if (result && (kn->kn_status & KN_TOUCH))
2204 kn->kn_fop->f_touch(kn, &kev,
2205 EVENT_PROCESS);
2206
2207 /*
2208 * convert back to a kqlock - bail if the knote
2209 * went away
2210 */
2211 if (!knoteuse2kqlock(kq, kn)) {
2212 return (EJUSTRETURN);
2213 } else if (result) {
2214 /*
2215 * if revalidated as alive, make sure
2216 * it's active
2217 */
2218 knote_activate(kn, 0);
2219
2220 /*
2221 * capture all events that occurred
2222 * during filter
2223 */
2224 if (!touch) {
2225 kev = kn->kn_kevent;
2226 }
2227
2228 } else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2229 /*
2230 * was already dequeued, so just bail on
2231 * this one
2232 */
2233 return (EJUSTRETURN);
2234 }
2235 } else {
2236 return (EJUSTRETURN);
2237 }
2238 } else {
2239 kev = kn->kn_kevent;
2240 }
2241 }
2242
2243 /* move knote onto inprocess queue */
2244 assert(kn->kn_tq == &kq->kq_head);
2245 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2246 kn->kn_tq = inprocessp;
2247 TAILQ_INSERT_TAIL(inprocessp, kn, kn_tqe);
2248
2249 /*
2250 * Determine how to dispatch the knote for future event handling.
2251 * not-fired: just return (do not callout).
2252 * One-shot: If dispatch2, enter deferred-delete mode (unless this is
2253 * is the deferred delete event delivery itself). Otherwise,
2254 * deactivate and drop it.
2255 * Clear: deactivate and clear the state.
2256 * Dispatch: don't clear state, just deactivate it and mark it disabled.
2257 * All others: just leave where they are.
2258 */
2259
2260 if (result == 0) {
2261 return (EJUSTRETURN);
2262 } else if ((kn->kn_flags & EV_ONESHOT) != 0) {
2263 knote_deactivate(kn);
2264 if ((kn->kn_flags & (EV_DISPATCH2|EV_DELETE)) == EV_DISPATCH2) {
2265 /* defer dropping non-delete oneshot dispatch2 events */
2266 kn->kn_status |= (KN_DISABLED | KN_DEFERDROP);
2267 kqunlock(kq);
2268 } else if (kqlock2knotedrop(kq, kn)) {
2269 kn->kn_fop->f_detach(kn);
2270 knote_drop(kn, p);
2271 }
2272 } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
2273 if ((kn->kn_flags & EV_DISPATCH) != 0) {
2274 /* deactivate and disable all dispatch knotes */
2275 knote_deactivate(kn);
2276 kn->kn_status |= KN_DISABLED;
2277 } else if (!touch || kn->kn_fflags == 0) {
2278 /* only deactivate if nothing since the touch */
2279 knote_deactivate(kn);
2280 }
2281 if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
2282 /* manually clear non-touch knotes */
2283 kn->kn_data = 0;
2284 kn->kn_fflags = 0;
2285 }
2286 kqunlock(kq);
2287 } else {
2288 /*
2289 * leave on inprocess queue. We'll
2290 * move all the remaining ones back
2291 * the kq queue and wakeup any
2292 * waiters when we are done.
2293 */
2294 kqunlock(kq);
2295 }
2296
2297 /* callback to handle each event as we find it */
2298 error = (callback)(kq, &kev, data);
2299
2300 kqlock(kq);
2301 return (error);
2302 }
2303
2304 /*
2305 * Return 0 to indicate that processing should proceed,
2306 * -1 if there is nothing to process.
2307 *
2308 * Called with kqueue locked and returns the same way,
2309 * but may drop lock temporarily.
2310 */
2311 static int
2312 kqueue_begin_processing(struct kqueue *kq)
2313 {
2314 for (;;) {
2315 if (kq->kq_count == 0) {
2316 return (-1);
2317 }
2318
2319 /* if someone else is processing the queue, wait */
2320 if (kq->kq_nprocess != 0) {
2321 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
2322 CAST_EVENT64_T(&kq->kq_nprocess),
2323 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
2324 kq->kq_state |= KQ_PROCWAIT;
2325 kqunlock(kq);
2326 thread_block(THREAD_CONTINUE_NULL);
2327 kqlock(kq);
2328 } else {
2329 kq->kq_nprocess = 1;
2330 return (0);
2331 }
2332 }
2333 }
2334
2335 /*
2336 * Called with kqueue lock held.
2337 */
2338 static void
2339 kqueue_end_processing(struct kqueue *kq)
2340 {
2341 kq->kq_nprocess = 0;
2342 if (kq->kq_state & KQ_PROCWAIT) {
2343 kq->kq_state &= ~KQ_PROCWAIT;
2344 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
2345 CAST_EVENT64_T(&kq->kq_nprocess),
2346 THREAD_AWAKENED,
2347 WAITQ_ALL_PRIORITIES);
2348 }
2349 }
2350
2351 /*
2352 * kqueue_process - process the triggered events in a kqueue
2353 *
2354 * Walk the queued knotes and validate that they are
2355 * really still triggered events by calling the filter
2356 * routines (if necessary). Hold a use reference on
2357 * the knote to avoid it being detached. For each event
2358 * that is still considered triggered, invoke the
2359 * callback routine provided.
2360 *
2361 * caller holds a reference on the kqueue.
2362 * kqueue locked on entry and exit - but may be dropped
2363 * kqueue list locked (held for duration of call)
2364 */
2365
2366 static int
2367 kqueue_process(struct kqueue *kq,
2368 kevent_callback_t callback,
2369 void *data,
2370 int *countp,
2371 struct proc *p)
2372 {
2373 struct kqtailq inprocess;
2374 struct knote *kn;
2375 int nevents;
2376 int error;
2377
2378 TAILQ_INIT(&inprocess);
2379
2380 if (kqueue_begin_processing(kq) == -1) {
2381 *countp = 0;
2382 /* Nothing to process */
2383 return (0);
2384 }
2385
2386 /*
2387 * Clear any pre-posted status from previous runs, so we
2388 * only detect events that occur during this run.
2389 */
2390 waitq_set_clear_preposts(kq->kq_wqs);
2391
2392 /*
2393 * loop through the enqueued knotes, processing each one and
2394 * revalidating those that need it. As they are processed,
2395 * they get moved to the inprocess queue (so the loop can end).
2396 */
2397 error = 0;
2398 nevents = 0;
2399
2400 while (error == 0 &&
2401 (kn = TAILQ_FIRST(&kq->kq_head)) != NULL) {
2402 error = knote_process(kn, callback, data, &inprocess, p);
2403 if (error == EJUSTRETURN)
2404 error = 0;
2405 else
2406 nevents++;
2407 }
2408
2409 /*
2410 * With the kqueue still locked, move any knotes
2411 * remaining on the inprocess queue back to the
2412 * kq's queue and wake up any waiters.
2413 */
2414 while ((kn = TAILQ_FIRST(&inprocess)) != NULL) {
2415 assert(kn->kn_tq == &inprocess);
2416 TAILQ_REMOVE(&inprocess, kn, kn_tqe);
2417 kn->kn_tq = &kq->kq_head;
2418 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2419 }
2420
2421 kqueue_end_processing(kq);
2422
2423 *countp = nevents;
2424 return (error);
2425 }
2426
2427
2428 static void
2429 kqueue_scan_continue(void *data, wait_result_t wait_result)
2430 {
2431 thread_t self = current_thread();
2432 uthread_t ut = (uthread_t)get_bsdthread_info(self);
2433 struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
2434 struct kqueue *kq = (struct kqueue *)data;
2435 int error;
2436 int count;
2437
2438 /* convert the (previous) wait_result to a proper error */
2439 switch (wait_result) {
2440 case THREAD_AWAKENED:
2441 kqlock(kq);
2442 error = kqueue_process(kq, cont_args->call, cont_args, &count,
2443 current_proc());
2444 if (error == 0 && count == 0) {
2445 waitq_assert_wait64((struct waitq *)kq->kq_wqs,
2446 KQ_EVENT, THREAD_ABORTSAFE,
2447 cont_args->deadline);
2448 kq->kq_state |= KQ_SLEEP;
2449 kqunlock(kq);
2450 thread_block_parameter(kqueue_scan_continue, kq);
2451 /* NOTREACHED */
2452 }
2453 kqunlock(kq);
2454 break;
2455 case THREAD_TIMED_OUT:
2456 error = EWOULDBLOCK;
2457 break;
2458 case THREAD_INTERRUPTED:
2459 error = EINTR;
2460 break;
2461 default:
2462 panic("%s: - invalid wait_result (%d)", __func__,
2463 wait_result);
2464 error = 0;
2465 }
2466
2467 /* call the continuation with the results */
2468 assert(cont_args->cont != NULL);
2469 (cont_args->cont)(kq, cont_args->data, error);
2470 }
2471
2472
2473 /*
2474 * kqueue_scan - scan and wait for events in a kqueue
2475 *
2476 * Process the triggered events in a kqueue.
2477 *
2478 * If there are no events triggered arrange to
2479 * wait for them. If the caller provided a
2480 * continuation routine, then kevent_scan will
2481 * also.
2482 *
2483 * The callback routine must be valid.
2484 * The caller must hold a use-count reference on the kq.
2485 */
2486
2487 int
2488 kqueue_scan(struct kqueue *kq,
2489 kevent_callback_t callback,
2490 kqueue_continue_t continuation,
2491 void *data,
2492 struct timeval *atvp,
2493 struct proc *p)
2494 {
2495 thread_continue_t cont = THREAD_CONTINUE_NULL;
2496 uint64_t deadline;
2497 int error;
2498 int first;
2499
2500 assert(callback != NULL);
2501
2502 first = 1;
2503 for (;;) {
2504 wait_result_t wait_result;
2505 int count;
2506
2507 /*
2508 * Make a pass through the kq to find events already
2509 * triggered.
2510 */
2511 kqlock(kq);
2512 error = kqueue_process(kq, callback, data, &count, p);
2513 if (error || count)
2514 break; /* lock still held */
2515
2516 /* looks like we have to consider blocking */
2517 if (first) {
2518 first = 0;
2519 /* convert the timeout to a deadline once */
2520 if (atvp->tv_sec || atvp->tv_usec) {
2521 uint64_t now;
2522
2523 clock_get_uptime(&now);
2524 nanoseconds_to_absolutetime((uint64_t)atvp->tv_sec * NSEC_PER_SEC +
2525 atvp->tv_usec * (long)NSEC_PER_USEC,
2526 &deadline);
2527 if (now >= deadline) {
2528 /* non-blocking call */
2529 error = EWOULDBLOCK;
2530 break; /* lock still held */
2531 }
2532 deadline -= now;
2533 clock_absolutetime_interval_to_deadline(deadline, &deadline);
2534 } else {
2535 deadline = 0; /* block forever */
2536 }
2537
2538 if (continuation) {
2539 uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
2540 struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
2541
2542 cont_args->call = callback;
2543 cont_args->cont = continuation;
2544 cont_args->deadline = deadline;
2545 cont_args->data = data;
2546 cont = kqueue_scan_continue;
2547 }
2548 }
2549
2550 /* go ahead and wait */
2551 waitq_assert_wait64_leeway((struct waitq *)kq->kq_wqs,
2552 KQ_EVENT, THREAD_ABORTSAFE,
2553 TIMEOUT_URGENCY_USER_NORMAL,
2554 deadline, TIMEOUT_NO_LEEWAY);
2555 kq->kq_state |= KQ_SLEEP;
2556 kqunlock(kq);
2557 wait_result = thread_block_parameter(cont, kq);
2558 /* NOTREACHED if (continuation != NULL) */
2559
2560 switch (wait_result) {
2561 case THREAD_AWAKENED:
2562 continue;
2563 case THREAD_TIMED_OUT:
2564 return (EWOULDBLOCK);
2565 case THREAD_INTERRUPTED:
2566 return (EINTR);
2567 default:
2568 panic("%s: - bad wait_result (%d)", __func__,
2569 wait_result);
2570 error = 0;
2571 }
2572 }
2573 kqunlock(kq);
2574 return (error);
2575 }
2576
2577
2578 /*
2579 * XXX
2580 * This could be expanded to call kqueue_scan, if desired.
2581 */
2582 /*ARGSUSED*/
2583 static int
2584 kqueue_read(__unused struct fileproc *fp,
2585 __unused struct uio *uio,
2586 __unused int flags,
2587 __unused vfs_context_t ctx)
2588 {
2589 return (ENXIO);
2590 }
2591
2592 /*ARGSUSED*/
2593 static int
2594 kqueue_write(__unused struct fileproc *fp,
2595 __unused struct uio *uio,
2596 __unused int flags,
2597 __unused vfs_context_t ctx)
2598 {
2599 return (ENXIO);
2600 }
2601
2602 /*ARGSUSED*/
2603 static int
2604 kqueue_ioctl(__unused struct fileproc *fp,
2605 __unused u_long com,
2606 __unused caddr_t data,
2607 __unused vfs_context_t ctx)
2608 {
2609 return (ENOTTY);
2610 }
2611
2612 /*ARGSUSED*/
2613 static int
2614 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
2615 __unused vfs_context_t ctx)
2616 {
2617 struct kqueue *kq = (struct kqueue *)fp->f_data;
2618 struct knote *kn;
2619 struct kqtailq inprocessq;
2620 int retnum = 0;
2621
2622 if (which != FREAD)
2623 return (0);
2624
2625 TAILQ_INIT(&inprocessq);
2626
2627 kqlock(kq);
2628 /*
2629 * If this is the first pass, link the wait queue associated with the
2630 * the kqueue onto the wait queue set for the select(). Normally we
2631 * use selrecord() for this, but it uses the wait queue within the
2632 * selinfo structure and we need to use the main one for the kqueue to
2633 * catch events from KN_STAYQUEUED sources. So we do the linkage manually.
2634 * (The select() call will unlink them when it ends).
2635 */
2636 if (wq_link_id != NULL) {
2637 thread_t cur_act = current_thread();
2638 struct uthread * ut = get_bsdthread_info(cur_act);
2639
2640 kq->kq_state |= KQ_SEL;
2641 waitq_link((struct waitq *)kq->kq_wqs, ut->uu_wqset,
2642 WAITQ_SHOULD_LOCK, (uint64_t *)wq_link_id);
2643
2644 /* always consume the reserved link object */
2645 waitq_link_release(*(uint64_t *)wq_link_id);
2646 *(uint64_t *)wq_link_id = 0;
2647
2648 /*
2649 * selprocess() is expecting that we send it back the waitq
2650 * that was just added to the thread's waitq set. In order
2651 * to not change the selrecord() API (which is exported to
2652 * kexts), we pass this value back through the
2653 * void *wq_link_id pointer we were passed. We need to use
2654 * memcpy here because the pointer may not be properly aligned
2655 * on 32-bit systems.
2656 */
2657 memcpy(wq_link_id, (void *)&(kq->kq_wqs), sizeof(void *));
2658 }
2659
2660 if (kqueue_begin_processing(kq) == -1) {
2661 kqunlock(kq);
2662 return (0);
2663 }
2664
2665 if (kq->kq_count != 0) {
2666 /*
2667 * there is something queued - but it might be a
2668 * KN_STAYQUEUED knote, which may or may not have
2669 * any events pending. So, we have to walk the
2670 * list of knotes to see, and peek at the stay-
2671 * queued ones to be really sure.
2672 */
2673 while ((kn = (struct knote *)TAILQ_FIRST(&kq->kq_head)) != NULL) {
2674 if ((kn->kn_status & KN_STAYQUEUED) == 0) {
2675 retnum = 1;
2676 goto out;
2677 }
2678
2679 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2680 TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);
2681
2682 if (kqlock2knoteuse(kq, kn)) {
2683 unsigned peek;
2684
2685 peek = kn->kn_fop->f_peek(kn);
2686 if (knoteuse2kqlock(kq, kn)) {
2687 if (peek > 0) {
2688 retnum = 1;
2689 goto out;
2690 }
2691 } else {
2692 retnum = 0;
2693 }
2694 }
2695 }
2696 }
2697
2698 out:
2699 /* Return knotes to active queue */
2700 while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
2701 TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
2702 kn->kn_tq = &kq->kq_head;
2703 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2704 }
2705
2706 kqueue_end_processing(kq);
2707 kqunlock(kq);
2708 return (retnum);
2709 }
2710
2711 /*
2712 * kqueue_close -
2713 */
2714 /*ARGSUSED*/
2715 static int
2716 kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
2717 {
2718 struct kqueue *kq = (struct kqueue *)fg->fg_data;
2719
2720 kqueue_dealloc(kq);
2721 fg->fg_data = NULL;
2722 return (0);
2723 }
2724
2725 /*ARGSUSED*/
2726 /*
2727 * The callers has taken a use-count reference on this kqueue and will donate it
2728 * to the kqueue we are being added to. This keeps the kqueue from closing until
2729 * that relationship is torn down.
2730 */
2731 static int
2732 kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
2733 {
2734 struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
2735 struct kqueue *parentkq = kn->kn_kq;
2736
2737 if (parentkq == kq ||
2738 kn->kn_filter != EVFILT_READ)
2739 return (1);
2740
2741 /*
2742 * We have to avoid creating a cycle when nesting kqueues
2743 * inside another. Rather than trying to walk the whole
2744 * potential DAG of nested kqueues, we just use a simple
2745 * ceiling protocol. When a kqueue is inserted into another,
2746 * we check that the (future) parent is not already nested
2747 * into another kqueue at a lower level than the potenial
2748 * child (because it could indicate a cycle). If that test
2749 * passes, we just mark the nesting levels accordingly.
2750 */
2751
2752 kqlock(parentkq);
2753 if (parentkq->kq_level > 0 &&
2754 parentkq->kq_level < kq->kq_level)
2755 {
2756 kqunlock(parentkq);
2757 return (1);
2758 } else {
2759 /* set parent level appropriately */
2760 if (parentkq->kq_level == 0)
2761 parentkq->kq_level = 2;
2762 if (parentkq->kq_level < kq->kq_level + 1)
2763 parentkq->kq_level = kq->kq_level + 1;
2764 kqunlock(parentkq);
2765
2766 kn->kn_fop = &kqread_filtops;
2767 kqlock(kq);
2768 KNOTE_ATTACH(&kq->kq_sel.si_note, kn);
2769 /* indicate nesting in child, if needed */
2770 if (kq->kq_level == 0)
2771 kq->kq_level = 1;
2772 kqunlock(kq);
2773 return (0);
2774 }
2775 }
2776
2777 /*
2778 * kqueue_drain - called when kq is closed
2779 */
2780 /*ARGSUSED*/
2781 static int
2782 kqueue_drain(struct fileproc *fp, __unused vfs_context_t ctx)
2783 {
2784 struct kqueue *kq = (struct kqueue *)fp->f_fglob->fg_data;
2785 kqlock(kq);
2786 kqueue_wakeup(kq, 1);
2787 kqunlock(kq);
2788 return (0);
2789 }
2790
2791 /*ARGSUSED*/
2792 int
2793 kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
2794 {
2795 kqlock(kq);
2796 if (isstat64 != 0) {
2797 struct stat64 *sb64 = (struct stat64 *)ub;
2798
2799 bzero((void *)sb64, sizeof(*sb64));
2800 sb64->st_size = kq->kq_count;
2801 if (kq->kq_state & KQ_KEV_QOS)
2802 sb64->st_blksize = sizeof(struct kevent_qos_s);
2803 else if (kq->kq_state & KQ_KEV64)
2804 sb64->st_blksize = sizeof(struct kevent64_s);
2805 else if (IS_64BIT_PROCESS(p))
2806 sb64->st_blksize = sizeof(struct user64_kevent);
2807 else
2808 sb64->st_blksize = sizeof(struct user32_kevent);
2809 sb64->st_mode = S_IFIFO;
2810 } else {
2811 struct stat *sb = (struct stat *)ub;
2812
2813 bzero((void *)sb, sizeof(*sb));
2814 sb->st_size = kq->kq_count;
2815 if (kq->kq_state & KQ_KEV_QOS)
2816 sb->st_blksize = sizeof(struct kevent_qos_s);
2817 else if (kq->kq_state & KQ_KEV64)
2818 sb->st_blksize = sizeof(struct kevent64_s);
2819 else if (IS_64BIT_PROCESS(p))
2820 sb->st_blksize = sizeof(struct user64_kevent);
2821 else
2822 sb->st_blksize = sizeof(struct user32_kevent);
2823 sb->st_mode = S_IFIFO;
2824 }
2825 kqunlock(kq);
2826 return (0);
2827 }
2828
2829 /*
2830 * Called with the kqueue locked
2831 */
2832 static void
2833 kqueue_wakeup(struct kqueue *kq, int closed)
2834 {
2835 wait_result_t res = THREAD_NOT_WAITING;
2836
2837 if ((kq->kq_state & (KQ_SLEEP | KQ_SEL)) != 0 || kq->kq_nprocess > 0) {
2838 kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
2839 res = waitq_wakeup64_all((struct waitq *)kq->kq_wqs, KQ_EVENT,
2840 (closed) ? THREAD_INTERRUPTED : THREAD_AWAKENED,
2841 WAITQ_ALL_PRIORITIES);
2842 }
2843
2844 /* request additional workq threads if appropriate */
2845 if (res == THREAD_NOT_WAITING && (kq->kq_state & KQ_WORKQ) &&
2846 pthread_functions != NULL && pthread_functions->workq_reqthreads != NULL) {
2847 /*
2848 * The special workq kq should be accumulating the counts of
2849 * queued sources on a pthread_priority_t basis and we should
2850 * be providing that here. For now, just hard-code a single
2851 * entry request at a fixed (default) QOS.
2852 */
2853 struct workq_reqthreads_req_s request = {
2854 .priority = 0x020004ff, /* legacy event manager */
2855 .count = kq->kq_count };
2856 thread_t wqthread;
2857
2858 wqthread = (*pthread_functions->workq_reqthreads)(kq->kq_p, 1, &request);
2859 assert(wqthread == THREAD_NULL);
2860 }
2861 }
2862
2863 void
2864 klist_init(struct klist *list)
2865 {
2866 SLIST_INIT(list);
2867 }
2868
2869
2870 /*
2871 * Query/Post each knote in the object's list
2872 *
2873 * The object lock protects the list. It is assumed
2874 * that the filter/event routine for the object can
2875 * determine that the object is already locked (via
2876 * the hint) and not deadlock itself.
2877 *
2878 * The object lock should also hold off pending
2879 * detach/drop operations. But we'll prevent it here
2880 * too - just in case.
2881 */
2882 void
2883 knote(struct klist *list, long hint)
2884 {
2885 struct knote *kn;
2886
2887 SLIST_FOREACH(kn, list, kn_selnext) {
2888 struct kqueue *kq = kn->kn_kq;
2889
2890 kqlock(kq);
2891 if (kqlock2knoteuse(kq, kn)) {
2892 int result;
2893
2894 /* call the event with only a use count */
2895 result = kn->kn_fop->f_event(kn, hint);
2896
2897 /* if its not going away and triggered */
2898 if (knoteuse2kqlock(kq, kn) && result)
2899 knote_activate(kn, 0);
2900 /* lock held again */
2901 }
2902 kqunlock(kq);
2903 }
2904 }
2905
2906 /*
2907 * attach a knote to the specified list. Return true if this is the first entry.
2908 * The list is protected by whatever lock the object it is associated with uses.
2909 */
2910 int
2911 knote_attach(struct klist *list, struct knote *kn)
2912 {
2913 int ret = SLIST_EMPTY(list);
2914 SLIST_INSERT_HEAD(list, kn, kn_selnext);
2915 return (ret);
2916 }
2917
2918 /*
2919 * detach a knote from the specified list. Return true if that was the last entry.
2920 * The list is protected by whatever lock the object it is associated with uses.
2921 */
2922 int
2923 knote_detach(struct klist *list, struct knote *kn)
2924 {
2925 SLIST_REMOVE(list, kn, knote, kn_selnext);
2926 return (SLIST_EMPTY(list));
2927 }
2928
2929 /*
2930 * For a given knote, link a provided wait queue directly with the kqueue.
2931 * Wakeups will happen via recursive wait queue support. But nothing will move
2932 * the knote to the active list at wakeup (nothing calls knote()). Instead,
2933 * we permanently enqueue them here.
2934 *
2935 * kqueue and knote references are held by caller.
2936 *
2937 * caller provides the wait queue link structure.
2938 */
2939 int
2940 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
2941 {
2942 struct kqueue *kq = kn->kn_kq;
2943 kern_return_t kr;
2944
2945 kr = waitq_link(wq, kq->kq_wqs, WAITQ_SHOULD_LOCK, reserved_link);
2946 if (kr == KERN_SUCCESS) {
2947 knote_markstayqueued(kn);
2948 return (0);
2949 } else {
2950 return (EINVAL);
2951 }
2952 }
2953
2954 /*
2955 * Unlink the provided wait queue from the kqueue associated with a knote.
2956 * Also remove it from the magic list of directly attached knotes.
2957 *
2958 * Note that the unlink may have already happened from the other side, so
2959 * ignore any failures to unlink and just remove it from the kqueue list.
2960 *
2961 * On success, caller is responsible for the link structure
2962 */
2963 int
2964 knote_unlink_waitq(struct knote *kn, struct waitq *wq)
2965 {
2966 struct kqueue *kq = kn->kn_kq;
2967 kern_return_t kr;
2968
2969 kr = waitq_unlink(wq, kq->kq_wqs);
2970 knote_clearstayqueued(kn);
2971 return ((kr != KERN_SUCCESS) ? EINVAL : 0);
2972 }
2973
2974 /*
2975 * remove all knotes referencing a specified fd
2976 *
2977 * Essentially an inlined knote_remove & knote_drop
2978 * when we know for sure that the thing is a file
2979 *
2980 * Entered with the proc_fd lock already held.
2981 * It returns the same way, but may drop it temporarily.
2982 */
2983 void
2984 knote_fdclose(struct proc *p, int fd)
2985 {
2986 struct filedesc *fdp = p->p_fd;
2987 struct klist *list;
2988 struct knote *kn;
2989
2990 list = &fdp->fd_knlist[fd];
2991 while ((kn = SLIST_FIRST(list)) != NULL) {
2992 struct kqueue *kq = kn->kn_kq;
2993
2994 if (kq->kq_p != p)
2995 panic("%s: proc mismatch (kq->kq_p=%p != p=%p)",
2996 __func__, kq->kq_p, p);
2997
2998 kqlock(kq);
2999 proc_fdunlock(p);
3000
3001 /*
3002 * Convert the lock to a drop ref.
3003 * If we get it, go ahead and drop it.
3004 * Otherwise, we waited for it to
3005 * be dropped by the other guy, so
3006 * it is safe to move on in the list.
3007 */
3008 if (kqlock2knotedrop(kq, kn)) {
3009 kn->kn_fop->f_detach(kn);
3010 knote_drop(kn, p);
3011 }
3012
3013 proc_fdlock(p);
3014
3015 /* the fd tables may have changed - start over */
3016 list = &fdp->fd_knlist[fd];
3017 }
3018 }
3019
3020 /* proc_fdlock held on entry (and exit) */
3021 static int
3022 knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p)
3023 {
3024 struct klist *list = NULL;
3025
3026 if (! kn->kn_fop->f_isfd) {
3027 if (fdp->fd_knhashmask == 0)
3028 fdp->fd_knhash = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
3029 &fdp->fd_knhashmask);
3030 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
3031 } else {
3032 if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
3033 u_int size = 0;
3034
3035 if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
3036 || kn->kn_id >= (uint64_t)maxfiles)
3037 return (EINVAL);
3038
3039 /* have to grow the fd_knlist */
3040 size = fdp->fd_knlistsize;
3041 while (size <= kn->kn_id)
3042 size += KQEXTENT;
3043
3044 if (size >= (UINT_MAX/sizeof(struct klist *)))
3045 return (EINVAL);
3046
3047 MALLOC(list, struct klist *,
3048 size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
3049 if (list == NULL)
3050 return (ENOMEM);
3051
3052 bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list,
3053 fdp->fd_knlistsize * sizeof(struct klist *));
3054 bzero((caddr_t)list +
3055 fdp->fd_knlistsize * sizeof(struct klist *),
3056 (size - fdp->fd_knlistsize) * sizeof(struct klist *));
3057 FREE(fdp->fd_knlist, M_KQUEUE);
3058 fdp->fd_knlist = list;
3059 fdp->fd_knlistsize = size;
3060 }
3061 list = &fdp->fd_knlist[kn->kn_id];
3062 }
3063 SLIST_INSERT_HEAD(list, kn, kn_link);
3064 return (0);
3065 }
3066
3067
3068
3069 /*
3070 * should be called at spl == 0, since we don't want to hold spl
3071 * while calling fdrop and free.
3072 */
3073 static void
3074 knote_drop(struct knote *kn, __unused struct proc *ctxp)
3075 {
3076 struct kqueue *kq = kn->kn_kq;
3077 struct proc *p = kq->kq_p;
3078 struct filedesc *fdp = p->p_fd;
3079 struct klist *list;
3080 int needswakeup;
3081
3082 proc_fdlock(p);
3083 if (kn->kn_fop->f_isfd)
3084 list = &fdp->fd_knlist[kn->kn_id];
3085 else
3086 list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
3087
3088 SLIST_REMOVE(list, kn, knote, kn_link);
3089 kqlock(kq);
3090 knote_dequeue(kn);
3091 needswakeup = (kn->kn_status & KN_USEWAIT);
3092 kqunlock(kq);
3093 proc_fdunlock(p);
3094
3095 if (needswakeup)
3096 waitq_wakeup64_all((struct waitq *)kq->kq_wqs,
3097 CAST_EVENT64_T(&kn->kn_status),
3098 THREAD_AWAKENED,
3099 WAITQ_ALL_PRIORITIES);
3100
3101 if (kn->kn_fop->f_isfd)
3102 fp_drop(p, kn->kn_id, kn->kn_fp, 0);
3103
3104 knote_free(kn);
3105 }
3106
3107 /* called with kqueue lock held */
3108 static void
3109 knote_activate(struct knote *kn, int force)
3110 {
3111 struct kqueue *kq = kn->kn_kq;
3112
3113 if (!force && (kn->kn_status & KN_ACTIVE))
3114 return;
3115
3116 kn->kn_status |= KN_ACTIVE;
3117 knote_enqueue(kn);
3118 kqueue_wakeup(kq, 0);
3119
3120 /* wake up the parent kq, too */
3121 KNOTE(&kq->kq_sel.si_note, 0);
3122 }
3123
3124 /* called with kqueue lock held */
3125 static void
3126 knote_deactivate(struct knote *kn)
3127 {
3128 kn->kn_status &= ~KN_ACTIVE;
3129 knote_dequeue(kn);
3130 }
3131
3132 /* called with kqueue lock held */
3133 static void
3134 knote_enqueue(struct knote *kn)
3135 {
3136 if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_STAYQUEUED ||
3137 (kn->kn_status & (KN_QUEUED | KN_STAYQUEUED | KN_DISABLED)) == 0) {
3138 struct kqtailq *tq = kn->kn_tq;
3139 struct kqueue *kq = kn->kn_kq;
3140
3141 TAILQ_INSERT_TAIL(tq, kn, kn_tqe);
3142 kn->kn_status |= KN_QUEUED;
3143 kq->kq_count++;
3144 }
3145 }
3146
3147 /* called with kqueue lock held */
3148 static void
3149 knote_dequeue(struct knote *kn)
3150 {
3151 struct kqueue *kq = kn->kn_kq;
3152
3153 if ((kn->kn_status & (KN_QUEUED | KN_STAYQUEUED)) == KN_QUEUED) {
3154 struct kqtailq *tq = kn->kn_tq;
3155
3156 TAILQ_REMOVE(tq, kn, kn_tqe);
3157 kn->kn_tq = &kq->kq_head;
3158 kn->kn_status &= ~KN_QUEUED;
3159 kq->kq_count--;
3160 }
3161 }
3162
3163 void
3164 knote_init(void)
3165 {
3166 knote_zone = zinit(sizeof(struct knote), 8192*sizeof(struct knote),
3167 8192, "knote zone");
3168
3169 /* allocate kq lock group attribute and group */
3170 kq_lck_grp_attr = lck_grp_attr_alloc_init();
3171
3172 kq_lck_grp = lck_grp_alloc_init("kqueue", kq_lck_grp_attr);
3173
3174 /* Allocate kq lock attribute */
3175 kq_lck_attr = lck_attr_alloc_init();
3176
3177 /* Initialize the timer filter lock */
3178 lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
3179
3180 #if VM_PRESSURE_EVENTS
3181 /* Initialize the vm pressure list lock */
3182 vm_pressure_init(kq_lck_grp, kq_lck_attr);
3183 #endif
3184
3185 #if CONFIG_MEMORYSTATUS
3186 /* Initialize the memorystatus list lock */
3187 memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
3188 #endif
3189 }
3190 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
3191
3192 static struct knote *
3193 knote_alloc(void)
3194 {
3195 return ((struct knote *)zalloc(knote_zone));
3196 }
3197
3198 static void
3199 knote_free(struct knote *kn)
3200 {
3201 zfree(knote_zone, kn);
3202 }
3203
3204 #if SOCKETS
3205 #include <sys/param.h>
3206 #include <sys/socket.h>
3207 #include <sys/protosw.h>
3208 #include <sys/domain.h>
3209 #include <sys/mbuf.h>
3210 #include <sys/kern_event.h>
3211 #include <sys/malloc.h>
3212 #include <sys/sys_domain.h>
3213 #include <sys/syslog.h>
3214
3215 #ifndef ROUNDUP64
3216 #define ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
3217 #endif
3218
3219 #ifndef ADVANCE64
3220 #define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n))
3221 #endif
3222
3223 static lck_grp_attr_t *kev_lck_grp_attr;
3224 static lck_attr_t *kev_lck_attr;
3225 static lck_grp_t *kev_lck_grp;
3226 static decl_lck_rw_data(,kev_lck_data);
3227 static lck_rw_t *kev_rwlock = &kev_lck_data;
3228
3229 static int kev_attach(struct socket *so, int proto, struct proc *p);
3230 static int kev_detach(struct socket *so);
3231 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
3232 struct ifnet *ifp, struct proc *p);
3233 static lck_mtx_t * event_getlock(struct socket *, int);
3234 static int event_lock(struct socket *, int, void *);
3235 static int event_unlock(struct socket *, int, void *);
3236
3237 static int event_sofreelastref(struct socket *);
3238 static void kev_delete(struct kern_event_pcb *);
3239
3240 static struct pr_usrreqs event_usrreqs = {
3241 .pru_attach = kev_attach,
3242 .pru_control = kev_control,
3243 .pru_detach = kev_detach,
3244 .pru_soreceive = soreceive,
3245 };
3246
3247 static struct protosw eventsw[] = {
3248 {
3249 .pr_type = SOCK_RAW,
3250 .pr_protocol = SYSPROTO_EVENT,
3251 .pr_flags = PR_ATOMIC,
3252 .pr_usrreqs = &event_usrreqs,
3253 .pr_lock = event_lock,
3254 .pr_unlock = event_unlock,
3255 .pr_getlock = event_getlock,
3256 }
3257 };
3258
3259 __private_extern__ int kevt_getstat SYSCTL_HANDLER_ARGS;
3260 __private_extern__ int kevt_pcblist SYSCTL_HANDLER_ARGS;
3261
3262 SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
3263 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Kernel event family");
3264
3265 struct kevtstat kevtstat;
3266 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
3267 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
3268 kevt_getstat, "S,kevtstat", "");
3269
3270 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
3271 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
3272 kevt_pcblist, "S,xkevtpcb", "");
3273
3274 static lck_mtx_t *
3275 event_getlock(struct socket *so, int locktype)
3276 {
3277 #pragma unused(locktype)
3278 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
3279
3280 if (so->so_pcb != NULL) {
3281 if (so->so_usecount < 0)
3282 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
3283 so, so->so_usecount, solockhistory_nr(so));
3284 /* NOTREACHED */
3285 } else {
3286 panic("%s: so=%p NULL NO so_pcb %s\n", __func__,
3287 so, solockhistory_nr(so));
3288 /* NOTREACHED */
3289 }
3290 return (&ev_pcb->evp_mtx);
3291 }
3292
3293 static int
3294 event_lock(struct socket *so, int refcount, void *lr)
3295 {
3296 void *lr_saved;
3297
3298 if (lr == NULL)
3299 lr_saved = __builtin_return_address(0);
3300 else
3301 lr_saved = lr;
3302
3303 if (so->so_pcb != NULL) {
3304 lck_mtx_lock(&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
3305 } else {
3306 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3307 so, lr_saved, solockhistory_nr(so));
3308 /* NOTREACHED */
3309 }
3310
3311 if (so->so_usecount < 0) {
3312 panic("%s: so=%p so_pcb=%p lr=%p ref=%d lrh= %s\n", __func__,
3313 so, so->so_pcb, lr_saved, so->so_usecount,
3314 solockhistory_nr(so));
3315 /* NOTREACHED */
3316 }
3317
3318 if (refcount)
3319 so->so_usecount++;
3320
3321 so->lock_lr[so->next_lock_lr] = lr_saved;
3322 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3323 return (0);
3324 }
3325
3326 static int
3327 event_unlock(struct socket *so, int refcount, void *lr)
3328 {
3329 void *lr_saved;
3330 lck_mtx_t *mutex_held;
3331
3332 if (lr == NULL)
3333 lr_saved = __builtin_return_address(0);
3334 else
3335 lr_saved = lr;
3336
3337 if (refcount)
3338 so->so_usecount--;
3339
3340 if (so->so_usecount < 0) {
3341 panic("%s: so=%p usecount=%d lrh= %s\n", __func__,
3342 so, so->so_usecount, solockhistory_nr(so));
3343 /* NOTREACHED */
3344 }
3345 if (so->so_pcb == NULL) {
3346 panic("%s: so=%p NO PCB usecount=%d lr=%p lrh= %s\n", __func__,
3347 so, so->so_usecount, (void *)lr_saved,
3348 solockhistory_nr(so));
3349 /* NOTREACHED */
3350 }
3351 mutex_held = (&((struct kern_event_pcb *)so->so_pcb)->evp_mtx);
3352
3353 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3354 so->unlock_lr[so->next_unlock_lr] = lr_saved;
3355 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3356
3357 if (so->so_usecount == 0) {
3358 VERIFY(so->so_flags & SOF_PCBCLEARING);
3359 event_sofreelastref(so);
3360 } else {
3361 lck_mtx_unlock(mutex_held);
3362 }
3363
3364 return (0);
3365 }
3366
3367 static int
3368 event_sofreelastref(struct socket *so)
3369 {
3370 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *)so->so_pcb;
3371
3372 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_OWNED);
3373
3374 so->so_pcb = NULL;
3375
3376 /*
3377 * Disable upcall in the event another thread is in kev_post_msg()
3378 * appending record to the receive socket buffer, since sbwakeup()
3379 * may release the socket lock otherwise.
3380 */
3381 so->so_rcv.sb_flags &= ~SB_UPCALL;
3382 so->so_snd.sb_flags &= ~SB_UPCALL;
3383 so->so_event = sonullevent;
3384 lck_mtx_unlock(&(ev_pcb->evp_mtx));
3385
3386 lck_mtx_assert(&(ev_pcb->evp_mtx), LCK_MTX_ASSERT_NOTOWNED);
3387 lck_rw_lock_exclusive(kev_rwlock);
3388 LIST_REMOVE(ev_pcb, evp_link);
3389 kevtstat.kes_pcbcount--;
3390 kevtstat.kes_gencnt++;
3391 lck_rw_done(kev_rwlock);
3392 kev_delete(ev_pcb);
3393
3394 sofreelastref(so, 1);
3395 return (0);
3396 }
3397
3398 static int event_proto_count = (sizeof (eventsw) / sizeof (struct protosw));
3399
3400 static
3401 struct kern_event_head kern_event_head;
3402
3403 static u_int32_t static_event_id = 0;
3404
3405 #define EVPCB_ZONE_MAX 65536
3406 #define EVPCB_ZONE_NAME "kerneventpcb"
3407 static struct zone *ev_pcb_zone;
3408
3409 /*
3410 * Install the protosw's for the NKE manager. Invoked at extension load time
3411 */
3412 void
3413 kern_event_init(struct domain *dp)
3414 {
3415 struct protosw *pr;
3416 int i;
3417
3418 VERIFY(!(dp->dom_flags & DOM_INITIALIZED));
3419 VERIFY(dp == systemdomain);
3420
3421 kev_lck_grp_attr = lck_grp_attr_alloc_init();
3422 if (kev_lck_grp_attr == NULL) {
3423 panic("%s: lck_grp_attr_alloc_init failed\n", __func__);
3424 /* NOTREACHED */
3425 }
3426
3427 kev_lck_grp = lck_grp_alloc_init("Kernel Event Protocol",
3428 kev_lck_grp_attr);
3429 if (kev_lck_grp == NULL) {
3430 panic("%s: lck_grp_alloc_init failed\n", __func__);
3431 /* NOTREACHED */
3432 }
3433
3434 kev_lck_attr = lck_attr_alloc_init();
3435 if (kev_lck_attr == NULL) {
3436 panic("%s: lck_attr_alloc_init failed\n", __func__);
3437 /* NOTREACHED */
3438 }
3439
3440 lck_rw_init(kev_rwlock, kev_lck_grp, kev_lck_attr);
3441 if (kev_rwlock == NULL) {
3442 panic("%s: lck_mtx_alloc_init failed\n", __func__);
3443 /* NOTREACHED */
3444 }
3445
3446 for (i = 0, pr = &eventsw[0]; i < event_proto_count; i++, pr++)
3447 net_add_proto(pr, dp, 1);
3448
3449 ev_pcb_zone = zinit(sizeof(struct kern_event_pcb),
3450 EVPCB_ZONE_MAX * sizeof(struct kern_event_pcb), 0, EVPCB_ZONE_NAME);
3451 if (ev_pcb_zone == NULL) {
3452 panic("%s: failed allocating ev_pcb_zone", __func__);
3453 /* NOTREACHED */
3454 }
3455 zone_change(ev_pcb_zone, Z_EXPAND, TRUE);
3456 zone_change(ev_pcb_zone, Z_CALLERACCT, TRUE);
3457 }
3458
3459 static int
3460 kev_attach(struct socket *so, __unused int proto, __unused struct proc *p)
3461 {
3462 int error = 0;
3463 struct kern_event_pcb *ev_pcb;
3464
3465 error = soreserve(so, KEV_SNDSPACE, KEV_RECVSPACE);
3466 if (error != 0)
3467 return (error);
3468
3469 if ((ev_pcb = (struct kern_event_pcb *)zalloc(ev_pcb_zone)) == NULL) {
3470 return (ENOBUFS);
3471 }
3472 bzero(ev_pcb, sizeof(struct kern_event_pcb));
3473 lck_mtx_init(&ev_pcb->evp_mtx, kev_lck_grp, kev_lck_attr);
3474
3475 ev_pcb->evp_socket = so;
3476 ev_pcb->evp_vendor_code_filter = 0xffffffff;
3477
3478 so->so_pcb = (caddr_t) ev_pcb;
3479 lck_rw_lock_exclusive(kev_rwlock);
3480 LIST_INSERT_HEAD(&kern_event_head, ev_pcb, evp_link);
3481 kevtstat.kes_pcbcount++;
3482 kevtstat.kes_gencnt++;
3483 lck_rw_done(kev_rwlock);
3484
3485 return (error);
3486 }
3487
3488 static void
3489 kev_delete(struct kern_event_pcb *ev_pcb)
3490 {
3491 VERIFY(ev_pcb != NULL);
3492 lck_mtx_destroy(&ev_pcb->evp_mtx, kev_lck_grp);
3493 zfree(ev_pcb_zone, ev_pcb);
3494 }
3495
3496 static int
3497 kev_detach(struct socket *so)
3498 {
3499 struct kern_event_pcb *ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3500
3501 if (ev_pcb != NULL) {
3502 soisdisconnected(so);
3503 so->so_flags |= SOF_PCBCLEARING;
3504 }
3505
3506 return (0);
3507 }
3508
3509 /*
3510 * For now, kev_vendor_code and mbuf_tags use the same
3511 * mechanism.
3512 */
3513 errno_t kev_vendor_code_find(
3514 const char *string,
3515 u_int32_t *out_vendor_code)
3516 {
3517 if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
3518 return (EINVAL);
3519 }
3520 return (net_str_id_find_internal(string, out_vendor_code,
3521 NSI_VENDOR_CODE, 1));
3522 }
3523
3524 errno_t
3525 kev_msg_post(struct kev_msg *event_msg)
3526 {
3527 mbuf_tag_id_t min_vendor, max_vendor;
3528
3529 net_str_id_first_last(&min_vendor, &max_vendor, NSI_VENDOR_CODE);
3530
3531 if (event_msg == NULL)
3532 return (EINVAL);
3533
3534 /*
3535 * Limit third parties to posting events for registered vendor codes
3536 * only
3537 */
3538 if (event_msg->vendor_code < min_vendor ||
3539 event_msg->vendor_code > max_vendor) {
3540 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_badvendor);
3541 return (EINVAL);
3542 }
3543 return (kev_post_msg(event_msg));
3544 }
3545
3546 int
3547 kev_post_msg(struct kev_msg *event_msg)
3548 {
3549 struct mbuf *m, *m2;
3550 struct kern_event_pcb *ev_pcb;
3551 struct kern_event_msg *ev;
3552 char *tmp;
3553 u_int32_t total_size;
3554 int i;
3555
3556 /* Verify the message is small enough to fit in one mbuf w/o cluster */
3557 total_size = KEV_MSG_HEADER_SIZE;
3558
3559 for (i = 0; i < 5; i++) {
3560 if (event_msg->dv[i].data_length == 0)
3561 break;
3562 total_size += event_msg->dv[i].data_length;
3563 }
3564
3565 if (total_size > MLEN) {
3566 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_toobig);
3567 return (EMSGSIZE);
3568 }
3569
3570 m = m_get(M_DONTWAIT, MT_DATA);
3571 if (m == 0) {
3572 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3573 return (ENOMEM);
3574 }
3575 ev = mtod(m, struct kern_event_msg *);
3576 total_size = KEV_MSG_HEADER_SIZE;
3577
3578 tmp = (char *) &ev->event_data[0];
3579 for (i = 0; i < 5; i++) {
3580 if (event_msg->dv[i].data_length == 0)
3581 break;
3582
3583 total_size += event_msg->dv[i].data_length;
3584 bcopy(event_msg->dv[i].data_ptr, tmp,
3585 event_msg->dv[i].data_length);
3586 tmp += event_msg->dv[i].data_length;
3587 }
3588
3589 ev->id = ++static_event_id;
3590 ev->total_size = total_size;
3591 ev->vendor_code = event_msg->vendor_code;
3592 ev->kev_class = event_msg->kev_class;
3593 ev->kev_subclass = event_msg->kev_subclass;
3594 ev->event_code = event_msg->event_code;
3595
3596 m->m_len = total_size;
3597 lck_rw_lock_shared(kev_rwlock);
3598 for (ev_pcb = LIST_FIRST(&kern_event_head);
3599 ev_pcb;
3600 ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3601 lck_mtx_lock(&ev_pcb->evp_mtx);
3602 if (ev_pcb->evp_socket->so_pcb == NULL) {
3603 lck_mtx_unlock(&ev_pcb->evp_mtx);
3604 continue;
3605 }
3606 if (ev_pcb->evp_vendor_code_filter != KEV_ANY_VENDOR) {
3607 if (ev_pcb->evp_vendor_code_filter != ev->vendor_code) {
3608 lck_mtx_unlock(&ev_pcb->evp_mtx);
3609 continue;
3610 }
3611
3612 if (ev_pcb->evp_class_filter != KEV_ANY_CLASS) {
3613 if (ev_pcb->evp_class_filter != ev->kev_class) {
3614 lck_mtx_unlock(&ev_pcb->evp_mtx);
3615 continue;
3616 }
3617
3618 if ((ev_pcb->evp_subclass_filter !=
3619 KEV_ANY_SUBCLASS) &&
3620 (ev_pcb->evp_subclass_filter !=
3621 ev->kev_subclass)) {
3622 lck_mtx_unlock(&ev_pcb->evp_mtx);
3623 continue;
3624 }
3625 }
3626 }
3627
3628 m2 = m_copym(m, 0, m->m_len, M_NOWAIT);
3629 if (m2 == 0) {
3630 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_nomem);
3631 m_free(m);
3632 lck_mtx_unlock(&ev_pcb->evp_mtx);
3633 lck_rw_done(kev_rwlock);
3634 return (ENOMEM);
3635 }
3636 if (sbappendrecord(&ev_pcb->evp_socket->so_rcv, m2)) {
3637 /*
3638 * We use "m" for the socket stats as it would be
3639 * unsafe to use "m2"
3640 */
3641 so_inc_recv_data_stat(ev_pcb->evp_socket,
3642 1, m->m_len, SO_TC_BE);
3643
3644 sorwakeup(ev_pcb->evp_socket);
3645 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_posted);
3646 } else {
3647 OSIncrementAtomic64((SInt64 *)&kevtstat.kes_fullsock);
3648 }
3649 lck_mtx_unlock(&ev_pcb->evp_mtx);
3650 }
3651 m_free(m);
3652 lck_rw_done(kev_rwlock);
3653
3654 return (0);
3655 }
3656
3657 static int
3658 kev_control(struct socket *so,
3659 u_long cmd,
3660 caddr_t data,
3661 __unused struct ifnet *ifp,
3662 __unused struct proc *p)
3663 {
3664 struct kev_request *kev_req = (struct kev_request *) data;
3665 struct kern_event_pcb *ev_pcb;
3666 struct kev_vendor_code *kev_vendor;
3667 u_int32_t *id_value = (u_int32_t *) data;
3668
3669 switch (cmd) {
3670 case SIOCGKEVID:
3671 *id_value = static_event_id;
3672 break;
3673 case SIOCSKEVFILT:
3674 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3675 ev_pcb->evp_vendor_code_filter = kev_req->vendor_code;
3676 ev_pcb->evp_class_filter = kev_req->kev_class;
3677 ev_pcb->evp_subclass_filter = kev_req->kev_subclass;
3678 break;
3679 case SIOCGKEVFILT:
3680 ev_pcb = (struct kern_event_pcb *) so->so_pcb;
3681 kev_req->vendor_code = ev_pcb->evp_vendor_code_filter;
3682 kev_req->kev_class = ev_pcb->evp_class_filter;
3683 kev_req->kev_subclass = ev_pcb->evp_subclass_filter;
3684 break;
3685 case SIOCGKEVVENDOR:
3686 kev_vendor = (struct kev_vendor_code *)data;
3687 /* Make sure string is NULL terminated */
3688 kev_vendor->vendor_string[KEV_VENDOR_CODE_MAX_STR_LEN-1] = 0;
3689 return (net_str_id_find_internal(kev_vendor->vendor_string,
3690 &kev_vendor->vendor_code, NSI_VENDOR_CODE, 0));
3691 default:
3692 return (ENOTSUP);
3693 }
3694
3695 return (0);
3696 }
3697
3698 int
3699 kevt_getstat SYSCTL_HANDLER_ARGS
3700 {
3701 #pragma unused(oidp, arg1, arg2)
3702 int error = 0;
3703
3704 lck_rw_lock_shared(kev_rwlock);
3705
3706 if (req->newptr != USER_ADDR_NULL) {
3707 error = EPERM;
3708 goto done;
3709 }
3710 if (req->oldptr == USER_ADDR_NULL) {
3711 req->oldidx = sizeof(struct kevtstat);
3712 goto done;
3713 }
3714
3715 error = SYSCTL_OUT(req, &kevtstat,
3716 MIN(sizeof(struct kevtstat), req->oldlen));
3717 done:
3718 lck_rw_done(kev_rwlock);
3719
3720 return (error);
3721 }
3722
3723 __private_extern__ int
3724 kevt_pcblist SYSCTL_HANDLER_ARGS
3725 {
3726 #pragma unused(oidp, arg1, arg2)
3727 int error = 0;
3728 int n, i;
3729 struct xsystmgen xsg;
3730 void *buf = NULL;
3731 size_t item_size = ROUNDUP64(sizeof (struct xkevtpcb)) +
3732 ROUNDUP64(sizeof (struct xsocket_n)) +
3733 2 * ROUNDUP64(sizeof (struct xsockbuf_n)) +
3734 ROUNDUP64(sizeof (struct xsockstat_n));
3735 struct kern_event_pcb *ev_pcb;
3736
3737 buf = _MALLOC(item_size, M_TEMP, M_WAITOK | M_ZERO);
3738 if (buf == NULL)
3739 return (ENOMEM);
3740
3741 lck_rw_lock_shared(kev_rwlock);
3742
3743 n = kevtstat.kes_pcbcount;
3744
3745 if (req->oldptr == USER_ADDR_NULL) {
3746 req->oldidx = (n + n/8) * item_size;
3747 goto done;
3748 }
3749 if (req->newptr != USER_ADDR_NULL) {
3750 error = EPERM;
3751 goto done;
3752 }
3753 bzero(&xsg, sizeof (xsg));
3754 xsg.xg_len = sizeof (xsg);
3755 xsg.xg_count = n;
3756 xsg.xg_gen = kevtstat.kes_gencnt;
3757 xsg.xg_sogen = so_gencnt;
3758 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3759 if (error) {
3760 goto done;
3761 }
3762 /*
3763 * We are done if there is no pcb
3764 */
3765 if (n == 0) {
3766 goto done;
3767 }
3768
3769 i = 0;
3770 for (i = 0, ev_pcb = LIST_FIRST(&kern_event_head);
3771 i < n && ev_pcb != NULL;
3772 i++, ev_pcb = LIST_NEXT(ev_pcb, evp_link)) {
3773 struct xkevtpcb *xk = (struct xkevtpcb *)buf;
3774 struct xsocket_n *xso = (struct xsocket_n *)
3775 ADVANCE64(xk, sizeof (*xk));
3776 struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)
3777 ADVANCE64(xso, sizeof (*xso));
3778 struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)
3779 ADVANCE64(xsbrcv, sizeof (*xsbrcv));
3780 struct xsockstat_n *xsostats = (struct xsockstat_n *)
3781 ADVANCE64(xsbsnd, sizeof (*xsbsnd));
3782
3783 bzero(buf, item_size);
3784
3785 lck_mtx_lock(&ev_pcb->evp_mtx);
3786
3787 xk->kep_len = sizeof(struct xkevtpcb);
3788 xk->kep_kind = XSO_EVT;
3789 xk->kep_evtpcb = (uint64_t)VM_KERNEL_ADDRPERM(ev_pcb);
3790 xk->kep_vendor_code_filter = ev_pcb->evp_vendor_code_filter;
3791 xk->kep_class_filter = ev_pcb->evp_class_filter;
3792 xk->kep_subclass_filter = ev_pcb->evp_subclass_filter;
3793
3794 sotoxsocket_n(ev_pcb->evp_socket, xso);
3795 sbtoxsockbuf_n(ev_pcb->evp_socket ?
3796 &ev_pcb->evp_socket->so_rcv : NULL, xsbrcv);
3797 sbtoxsockbuf_n(ev_pcb->evp_socket ?
3798 &ev_pcb->evp_socket->so_snd : NULL, xsbsnd);
3799 sbtoxsockstat_n(ev_pcb->evp_socket, xsostats);
3800
3801 lck_mtx_unlock(&ev_pcb->evp_mtx);
3802
3803 error = SYSCTL_OUT(req, buf, item_size);
3804 }
3805
3806 if (error == 0) {
3807 /*
3808 * Give the user an updated idea of our state.
3809 * If the generation differs from what we told
3810 * her before, she knows that something happened
3811 * while we were processing this request, and it
3812 * might be necessary to retry.
3813 */
3814 bzero(&xsg, sizeof (xsg));
3815 xsg.xg_len = sizeof (xsg);
3816 xsg.xg_count = n;
3817 xsg.xg_gen = kevtstat.kes_gencnt;
3818 xsg.xg_sogen = so_gencnt;
3819 error = SYSCTL_OUT(req, &xsg, sizeof (xsg));
3820 if (error) {
3821 goto done;
3822 }
3823 }
3824
3825 done:
3826 lck_rw_done(kev_rwlock);
3827
3828 return (error);
3829 }
3830
3831 #endif /* SOCKETS */
3832
3833
3834 int
3835 fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo)
3836 {
3837 struct vinfo_stat * st;
3838
3839 st = &kinfo->kq_stat;
3840
3841 st->vst_size = kq->kq_count;
3842 if (kq->kq_state & KQ_KEV_QOS)
3843 st->vst_blksize = sizeof(struct kevent_qos_s);
3844 else if (kq->kq_state & KQ_KEV64)
3845 st->vst_blksize = sizeof(struct kevent64_s);
3846 else
3847 st->vst_blksize = sizeof(struct kevent);
3848 st->vst_mode = S_IFIFO;
3849
3850 /* flags exported to libproc as PROC_KQUEUE_* (sys/proc_info.h) */
3851 #define PROC_KQUEUE_MASK (KQ_SEL|KQ_SLEEP|KQ_KEV32|KQ_KEV64|KQ_KEV_QOS)
3852 kinfo->kq_state = kq->kq_state & PROC_KQUEUE_MASK;
3853
3854 return (0);
3855 }
3856
3857
3858 void
3859 knote_markstayqueued(struct knote *kn)
3860 {
3861 kqlock(kn->kn_kq);
3862 kn->kn_status |= KN_STAYQUEUED;
3863 knote_enqueue(kn);
3864 kqunlock(kn->kn_kq);
3865 }
3866
3867 void
3868 knote_clearstayqueued(struct knote *kn)
3869 {
3870 kqlock(kn->kn_kq);
3871 kn->kn_status &= ~KN_STAYQUEUED;
3872 knote_dequeue(kn);
3873 kqunlock(kn->kn_kq);
3874 }
3875
3876 static unsigned long
3877 kevent_extinfo_emit(struct kqueue *kq, struct knote *kn, struct kevent_extinfo *buf,
3878 unsigned long buflen, unsigned long nknotes)
3879 {
3880 struct kevent_qos_s kevqos;
3881 struct kevent_internal_s *kevp;
3882 for (; kn; kn = SLIST_NEXT(kn, kn_link)) {
3883 if (kq == kn->kn_kq) {
3884 if (nknotes < buflen) {
3885 struct kevent_extinfo *info = &buf[nknotes];
3886
3887 kqlock(kq);
3888 bzero(&kevqos, sizeof(kevqos));
3889 kevp = &(kn->kn_kevent);
3890
3891 kevqos.ident = kevp->ident;
3892 kevqos.filter = kevp->filter;
3893 kevqos.flags = kevp->flags;
3894 kevqos.fflags = kevp->fflags;
3895 kevqos.data = (int64_t) kevp->data;
3896 kevqos.udata = kevp->udata;
3897 kevqos.ext[0] = kevp->ext[0];
3898 kevqos.ext[1] = kevp->ext[1];
3899
3900 memcpy(&info->kqext_kev, &kevqos, sizeof(info->kqext_kev));
3901 info->kqext_sdata = kn->kn_sdata;
3902
3903 /* status flags exported to userspace/libproc */
3904 #define KQEXT_STATUS_MASK (KN_ACTIVE|KN_QUEUED|KN_DISABLED|KN_STAYQUEUED)
3905 info->kqext_status = kn->kn_status & KQEXT_STATUS_MASK;
3906 info->kqext_sfflags = kn->kn_sfflags;
3907
3908 kqunlock(kq);
3909 }
3910
3911 /* we return total number of knotes, which may be more than requested */
3912 nknotes++;
3913 }
3914 }
3915
3916 return nknotes;
3917 }
3918
3919 int
3920 pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
3921 uint32_t bufsize, int32_t *retval)
3922 {
3923 struct knote *kn;
3924 int i;
3925 int err = 0;
3926 struct filedesc *fdp = p->p_fd;
3927 unsigned long nknotes = 0;
3928 unsigned long buflen = bufsize / sizeof(struct kevent_extinfo);
3929 struct kevent_extinfo *kqext = NULL;
3930
3931 kqext = kalloc(buflen * sizeof(struct kevent_extinfo));
3932 if (kqext == NULL) {
3933 err = ENOMEM;
3934 goto out;
3935 }
3936 bzero(kqext, buflen * sizeof(struct kevent_extinfo));
3937
3938 proc_fdlock(p);
3939
3940 for (i = 0; i < fdp->fd_knlistsize; i++) {
3941 kn = SLIST_FIRST(&fdp->fd_knlist[i]);
3942 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
3943 }
3944
3945 if (fdp->fd_knhashmask != 0) {
3946 for (i = 0; i < (int)fdp->fd_knhashmask + 1; i++) {
3947 kn = SLIST_FIRST(&fdp->fd_knhash[i]);
3948 nknotes = kevent_extinfo_emit(kq, kn, kqext, buflen, nknotes);
3949 }
3950 }
3951
3952 proc_fdunlock(p);
3953
3954 assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
3955 err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
3956
3957 out:
3958 if (kqext) {
3959 kfree(kqext, buflen * sizeof(struct kevent_extinfo));
3960 kqext = NULL;
3961 }
3962
3963 if (!err)
3964 *retval = nknotes;
3965 return err;
3966 }