/*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
*
* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
#include <libkern/libkern.h>
#include "net/net_str_id.h"
+#include <mach/task.h>
+
+#if VM_PRESSURE_EVENTS
+#include <kern/vm_pressure.h>
+#endif
+
MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
#define KQ_EVENT NULL
static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
static int kqueue_process(struct kqueue *kq, kevent_callback_t callback,
void *data, int *countp, struct proc *p);
+static int kqueue_begin_processing(struct kqueue *kq);
+static void kqueue_end_processing(struct kqueue *kq);
static int knote_process(struct knote *kn, kevent_callback_t callback,
void *data, struct kqtailq *inprocessp, struct proc *p);
static void knote_put(struct knote *kn);
.f_event = filt_proc,
};
+#if VM_PRESSURE_EVENTS
+static int filt_vmattach(struct knote *kn);
+static void filt_vmdetach(struct knote *kn);
+static int filt_vm(struct knote *kn, long hint);
+static struct filterops vm_filtops = {
+ .f_attach = filt_vmattach,
+ .f_detach = filt_vmdetach,
+ .f_event = filt_vm,
+};
+#endif /* VM_PRESSURE_EVENTS */
+
extern struct filterops fs_filtops;
extern struct filterops sig_filtops;
.f_touch = filt_usertouch,
};
-#if CONFIG_AUDIT
-/* Audit session filter */
-extern struct filterops audit_session_filtops;
-#endif
-
/*
* Table for all system-defined filters.
*/
&machport_filtops, /* EVFILT_MACHPORT */
&fs_filtops, /* EVFILT_FS */
&user_filtops, /* EVFILT_USER */
-#if CONFIG_AUDIT
- &audit_session_filtops, /* EVFILT_SESSION */
+ &bad_filtops, /* unused */
+#if VM_PRESSURE_EVENTS
+ &vm_filtops, /* EVFILT_VM */
#else
- &bad_filtops,
+ &bad_filtops, /* EVFILT_VM */
#endif
+ &file_filtops, /* EVFILT_SOCK */
};
/*
return (ESRCH);
}
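+	/*
+	 * Only the target's parent (or, while the target is being traced,
+	 * its original parent-in-waiting) may ask for NOTE_EXITSTATUS,
+	 * since the exit status carries wait4(2)-style information;
+	 * anyone else gets EACCES below.
+	 */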
+ const int NoteExitStatusBits = NOTE_EXIT | NOTE_EXITSTATUS;
+
+ if ((kn->kn_sfflags & NoteExitStatusBits) == NoteExitStatusBits)
+ do {
+ pid_t selfpid = proc_selfpid();
+
+ if (p->p_ppid == selfpid)
+ break; /* parent => ok */
+
+ if ((p->p_lflag & P_LTRACED) != 0 &&
+ (p->p_oppid == selfpid))
+ break; /* parent-in-waiting => ok */
+
+ proc_rele(p);
+ return (EACCES);
+ } while (0);
+
proc_klist_lock();
kn->kn_flags |= EV_CLEAR; /* automatically set */
*/
event = (u_int)hint & NOTE_PCTRLMASK;
+ /*
+ * termination lifecycle events can happen while a debugger
+ * has reparented a process, in which case notifications
+ * should be quashed except to the tracing parent. When
+ * the debugger reaps the child (either via wait4(2) or
+ * process exit), the child will be reparented to the original
+ * parent and these knotes re-fired.
+ */
+ if (event & NOTE_EXIT) {
+ if ((kn->kn_ptr.p_proc->p_oppid != 0)
+ && (kn->kn_kq->kq_p->p_pid != kn->kn_ptr.p_proc->p_ppid)) {
+ /*
+ * This knote is not for the current ptrace(2) parent, ignore.
+ */
+ return 0;
+ }
+ }
+
/*
* if the user is interested in this event, record it.
*/
if (event == NOTE_REAP || (event == NOTE_EXIT && !(kn->kn_sfflags & NOTE_REAP))) {
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
}
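+	/*
+	 * For the exit-status and resource-end cases below, the
+	 * event-specific payload arrives in the low NOTE_PDATAMASK bits
+	 * of the hint and is passed through to userspace in kn_data.
+	 */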
+ if ((event == NOTE_EXIT) && ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0)) {
+ kn->kn_fflags |= NOTE_EXITSTATUS;
+ kn->kn_data = (hint & NOTE_PDATAMASK);
+ }
+ if ((event == NOTE_RESOURCEEND) && ((kn->kn_sfflags & NOTE_RESOURCEEND) != 0)) {
+ kn->kn_fflags |= NOTE_RESOURCEEND;
+ kn->kn_data = (hint & NOTE_PDATAMASK);
+ }
+#if CONFIG_EMBEDDED
+	/* If the event is one of the APPSTATE events, remove the rest */
+ if (((event & NOTE_APPALLSTATES) != 0) && ((kn->kn_sfflags & NOTE_APPALLSTATES) != 0)) {
+ /* only one state at a time */
+ kn->kn_fflags &= ~NOTE_APPALLSTATES;
+ kn->kn_fflags |= event;
+ }
+#endif /* CONFIG_EMBEDDED */
}
/* atomic check, no locking need when called from above */
return (kn->kn_fflags != 0);
}
+#if VM_PRESSURE_EVENTS
+/*
+ * Virtual memory kevents
+ *
+ * author: Matt Jacobson [matthew_jacobson@apple.com]
+ */
+
+static int
+filt_vmattach(struct knote *kn)
+{
+ /*
+ * The note will be cleared once the information has been flushed to the client.
+ * If there is still pressure, we will be re-alerted.
+ */
+ kn->kn_flags |= EV_CLEAR;
+
+ return vm_knote_register(kn);
+}
+
+static void
+filt_vmdetach(struct knote *kn)
+{
+ vm_knote_unregister(kn);
+}
+
+static int
+filt_vm(struct knote *kn, long hint)
+{
+	/* hint == 0 means this is just an "is it alive?" check (always true) */
+ if (hint != 0) {
+ const pid_t pid = (pid_t)hint;
+ if ((kn->kn_sfflags & NOTE_VM_PRESSURE) && (kn->kn_kq->kq_p->p_pid == pid)) {
+ kn->kn_fflags |= NOTE_VM_PRESSURE;
+ }
+ }
+
+ return (kn->kn_fflags != 0);
+}
+#endif /* VM_PRESSURE_EVENTS */
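+
+/*
+ * Usage sketch (illustrative only): how a client might arm this filter,
+ * assuming EVFILT_VM and NOTE_VM_PRESSURE are visible to it through
+ * <sys/event.h> in this configuration.  The ident is not consulted by the
+ * code above.
+ *
+ *	int kq = kqueue();
+ *	struct kevent ev;
+ *	EV_SET(&ev, 0, EVFILT_VM, EV_ADD | EV_ENABLE, NOTE_VM_PRESSURE, 0, NULL);
+ *	kevent(kq, &ev, 1, NULL, 0, NULL);
+ *	kevent(kq, NULL, 0, &ev, 1, NULL);	(blocks; returns with
+ *	NOTE_VM_PRESSURE set in ev.fflags once pressure is signalled)
+ */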
/*
* filt_timervalidate - process data from user
{
/* EVFILT_USER knotes are not attached to anything in the kernel */
kn->kn_hook = NULL;
- if (kn->kn_fflags & NOTE_TRIGGER || kn->kn_flags & EV_TRIGGER) {
+ if (kn->kn_fflags & NOTE_TRIGGER) {
kn->kn_hookid = 1;
} else {
kn->kn_hookid = 0;
static void
filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type)
{
- int ffctrl;
+ uint32_t ffctrl;
switch (type) {
case EVENT_REGISTER:
- if (kev->fflags & NOTE_TRIGGER || kev->flags & EV_TRIGGER) {
+ if (kev->fflags & NOTE_TRIGGER) {
kn->kn_hookid = 1;
}
error = fops->f_attach(kn);
- /*
- * Anyone trying to drop this knote will yield to
- * us, since KN_ATTACHING is set.
- */
kqlock(kq);
- if (error != 0 || (kn->kn_status & KN_DROPPING)) {
- if (error == 0) {
- kn->kn_fop->f_detach(kn);
- }
+
+ if (error != 0) {
+ /*
+ * Failed to attach correctly, so drop.
+ * All other possible users/droppers
+ * have deferred to us.
+ */
kn->kn_status |= KN_DROPPING;
kqunlock(kq);
knote_drop(kn, p);
goto done;
+ } else if (kn->kn_status & KN_DROPPING) {
+ /*
+ * Attach succeeded, but someone else
+ * deferred their drop - now we have
+ * to do it for them (after detaching).
+ */
+ kqunlock(kq);
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, p);
+ goto done;
}
kn->kn_status &= ~KN_ATTACHING;
kqunlock(kq);
knote_enqueue(kn);
}
+ /*
+ * The user may change some filter values after the
+ * initial EV_ADD, but doing so will not reset any
+	 * filters which have already been triggered.
+ */
+ kn->kn_kevent.udata = kev->udata;
+ if (fops->f_isfd || fops->f_touch == NULL) {
+ kn->kn_sfflags = kev->fflags;
+ kn->kn_sdata = kev->data;
+ }
+
/*
* If somebody is in the middle of dropping this
* knote - go find/insert a new one. But we have
}
/*
- * The user may change some filter values after the
- * initial EV_ADD, but doing so will not reset any
- * filter which have already been triggered.
+ * Call touch routine to notify filter of changes
+ * in filter values.
*/
- kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL)
fops->f_touch(kn, kev, EVENT_REGISTER);
- else {
- kn->kn_sfflags = kev->fflags;
- kn->kn_sdata = kev->data;
- }
-
- /* We may need to push some info down to a networked filesystem */
- if (kn->kn_filter == EVFILT_VNODE) {
- vnode_knoteupdate(kn);
- }
}
/* still have use ref on knote */
}
/* capture the kevent data - using touch if specified */
- if (result) {
- if (touch) {
- kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS);
- } else {
- kev = kn->kn_kevent;
- }
+ if (result && touch) {
+ kn->kn_fop->f_touch(kn, &kev, EVENT_PROCESS);
}
+
/* convert back to a kqlock - bail if the knote went away */
if (!knoteuse2kqlock(kq, kn)) {
return EJUSTRETURN;
if (!(kn->kn_status & KN_ACTIVE)) {
knote_activate(kn, 0);
}
+
+ /* capture all events that occurred during filter */
+ if (!touch) {
+ kev = kn->kn_kevent;
+ }
+
} else if ((kn->kn_status & KN_STAYQUEUED) == 0) {
/* was already dequeued, so just bail on this one */
return EJUSTRETURN;
if (result == 0) {
return EJUSTRETURN;
- } else if (kn->kn_flags & EV_ONESHOT) {
+ } else if ((kn->kn_flags & EV_ONESHOT) != 0) {
knote_deactivate(kn);
if (kqlock2knotedrop(kq, kn)) {
kn->kn_fop->f_detach(kn);
knote_drop(kn, p);
}
- } else if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
- knote_deactivate(kn);
- /* manually clear knotes who weren't 'touch'ed */
- if ((touch == 0) && (kn->kn_flags & EV_CLEAR)) {
+ } else if ((kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) != 0) {
+ if ((kn->kn_flags & EV_DISPATCH) != 0) {
+ /* deactivate and disable all dispatch knotes */
+ knote_deactivate(kn);
+ kn->kn_status |= KN_DISABLED;
+ } else if (!touch || kn->kn_fflags == 0) {
+ /* only deactivate if nothing since the touch */
+ knote_deactivate(kn);
+ }
+ if (!touch && (kn->kn_flags & EV_CLEAR) != 0) {
+ /* manually clear non-touch knotes */
kn->kn_data = 0;
kn->kn_fflags = 0;
}
- if (kn->kn_flags & EV_DISPATCH)
- kn->kn_status |= KN_DISABLED;
kqunlock(kq);
} else {
/*
return error;
}
+/*
+ * Return 0 to indicate that processing should proceed,
+ * -1 if there is nothing to process.
+ *
+ * Called with kqueue locked and returns the same way,
+ * but may drop the lock temporarily.
+ */
+static int
+kqueue_begin_processing(struct kqueue *kq)
+{
+ for (;;) {
+ if (kq->kq_count == 0) {
+ return -1;
+ }
+
+ /* if someone else is processing the queue, wait */
+ if (kq->kq_nprocess != 0) {
+ wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0);
+ kq->kq_state |= KQ_PROCWAIT;
+ kqunlock(kq);
+ thread_block(THREAD_CONTINUE_NULL);
+ kqlock(kq);
+ } else {
+ kq->kq_nprocess = 1;
+ return 0;
+ }
+ }
+}
+
+/*
+ * Called with kqueue lock held.
+ */
+static void
+kqueue_end_processing(struct kqueue *kq)
+{
+ kq->kq_nprocess = 0;
+ if (kq->kq_state & KQ_PROCWAIT) {
+ kq->kq_state &= ~KQ_PROCWAIT;
+ wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED);
+ }
+}
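+
+/*
+ * The two routines above bracket a scan of the active queue; a typical
+ * caller (see kqueue_select() below) does:
+ *
+ *	kqlock(kq);
+ *	if (kqueue_begin_processing(kq) == -1) {
+ *		kqunlock(kq);
+ *		return 0;
+ *	}
+ *	... dequeue and examine knotes, possibly dropping the kq lock ...
+ *	kqueue_end_processing(kq);
+ *	kqunlock(kq);
+ */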
/*
* kqueue_process - process the triggered events in a kqueue
int error;
TAILQ_INIT(&inprocess);
- restart:
- if (kq->kq_count == 0) {
+
+ if (kqueue_begin_processing(kq) == -1) {
*countp = 0;
+ /* Nothing to process */
return 0;
}
- /* if someone else is processing the queue, wait */
- if (hw_atomic_add(&kq->kq_nprocess, 1) != 1) {
- hw_atomic_sub(&kq->kq_nprocess, 1);
- wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0);
- kq->kq_state |= KQ_PROCWAIT;
- kqunlock(kq);
- thread_block(THREAD_CONTINUE_NULL);
- kqlock(kq);
- goto restart;
- }
-
/*
* Clear any pre-posted status from previous runs, so we only
* detect events that occur during this run.
kn->kn_tq = &kq->kq_head;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
}
- hw_atomic_sub(&kq->kq_nprocess, 1);
- if (kq->kq_state & KQ_PROCWAIT) {
- kq->kq_state &= ~KQ_PROCWAIT;
- wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED);
- }
+
+ kqueue_end_processing(kq);
*countp = nevents;
return error;
kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx)
{
struct kqueue *kq = (struct kqueue *)fp->f_data;
- int again;
-
+ struct knote *kn;
+ struct kqtailq inprocessq;
+ int retnum = 0;
+
if (which != FREAD)
return 0;
+ TAILQ_INIT(&inprocessq);
+
kqlock(kq);
/*
* If this is the first pass, link the wait queue associated with the
(wait_queue_link_t)wql);
}
- retry:
- again = 0;
- if (kq->kq_count != 0) {
- struct knote *kn;
+ if (kqueue_begin_processing(kq) == -1) {
+ kqunlock(kq);
+ return 0;
+ }
+ if (kq->kq_count != 0) {
/*
* there is something queued - but it might be a
* KN_STAYQUEUED knote, which may or may not have
* list of knotes to see, and peek at the stay-
* queued ones to be really sure.
*/
- TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
- int retnum = 0;
- if ((kn->kn_status & KN_STAYQUEUED) == 0 ||
- (retnum = kn->kn_fop->f_peek(kn)) > 0) {
- kqunlock(kq);
- return 1;
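+		/*
+		 * Move each stay-queued knote onto a local in-process queue
+		 * so the kq lock can be dropped while its f_peek routine
+		 * runs; the knotes are restored to the active queue at
+		 * "out:" below.
+		 */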
+ while ((kn = (struct knote*)TAILQ_FIRST(&kq->kq_head)) != NULL) {
+ if ((kn->kn_status & KN_STAYQUEUED) == 0) {
+ retnum = 1;
+ goto out;
}
- if (retnum < 0)
- again++;
+
+ TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
+ TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe);
+
+ if (kqlock2knoteuse(kq, kn)) {
+ unsigned peek;
+
+ peek = kn->kn_fop->f_peek(kn);
+ if (knoteuse2kqlock(kq, kn)) {
+ if (peek > 0) {
+ retnum = 1;
+ goto out;
+ }
+ } else {
+ retnum = 0;
+ }
+ }
}
}
- /*
- * If we stumbled across a knote that couldn't be peeked at,
- * we have to drop the kq lock and try again.
- */
- if (again > 0) {
- kqunlock(kq);
- mutex_pause(0);
- kqlock(kq);
- goto retry;
+out:
+ /* Return knotes to active queue */
+ while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) {
+ TAILQ_REMOVE(&inprocessq, kn, kn_tqe);
+ kn->kn_tq = &kq->kq_head;
+ TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
}
+ kqueue_end_processing(kq);
kqunlock(kq);
- return 0;
+ return retnum;
}
/*
* we permanently enqueue them here.
*
* kqueue and knote references are held by caller.
+ *
+ * caller provides the wait queue link structure.
*/
int
-knote_link_wait_queue(struct knote *kn, struct wait_queue *wq)
+knote_link_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t wql)
{
struct kqueue *kq = kn->kn_kq;
kern_return_t kr;
- kr = wait_queue_link(wq, kq->kq_wqs);
+ kr = wait_queue_link_noalloc(wq, kq->kq_wqs, wql);
if (kr == KERN_SUCCESS) {
- kqlock(kq);
- kn->kn_status |= KN_STAYQUEUED;
- knote_enqueue(kn);
- kqunlock(kq);
+ knote_markstayqueued(kn);
return 0;
} else {
- return ENOMEM;
+ return EINVAL;
}
}
*
* Note that the unlink may have already happened from the other side, so
* ignore any failures to unlink and just remove it from the kqueue list.
+ *
+ * On success, the caller is responsible for the link structure.
*/
-void
-knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq)
+int
+knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq, wait_queue_link_t *wqlp)
{
struct kqueue *kq = kn->kn_kq;
+ kern_return_t kr;
- (void) wait_queue_unlink(wq, kq->kq_wqs);
+ kr = wait_queue_unlink_nofree(wq, kq->kq_wqs, wqlp);
kqlock(kq);
kn->kn_status &= ~KN_STAYQUEUED;
knote_dequeue(kn);
kqunlock(kq);
+ return (kr != KERN_SUCCESS) ? EINVAL : 0;
}
/*
/* proc_fdlock held on entry (and exit) */
static int
-knote_fdpattach(struct knote *kn, struct filedesc *fdp, __unused struct proc *p)
+knote_fdpattach(struct knote *kn, struct filedesc *fdp, struct proc *p)
{
struct klist *list = NULL;
if ((u_int)fdp->fd_knlistsize <= kn->kn_id) {
u_int size = 0;
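+
+		/*
+		 * Reject identifiers beyond the per-process and global file
+		 * descriptor limits so a bogus kn_id cannot force a huge
+		 * fd_knlist allocation below.
+		 */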
+ if (kn->kn_id >= (uint64_t)p->p_rlimit[RLIMIT_NOFILE].rlim_cur
+ || kn->kn_id >= (uint64_t)maxfiles)
+ return (EINVAL);
+
/* have to grow the fd_knlist */
size = fdp->fd_knlistsize;
while (size <= kn->kn_id)
size += KQEXTENT;
+
+ if (size >= (UINT_MAX/sizeof(struct klist *)))
+ return (EINVAL);
+
MALLOC(list, struct klist *,
size * sizeof(struct klist *), M_KQUEUE, M_WAITOK);
if (list == NULL)
/* Initialize the timer filter lock */
lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
+
+#if VM_PRESSURE_EVENTS
+ /* Initialize the vm pressure list lock */
+ vm_pressure_init(kq_lck_grp, kq_lck_attr);
+#endif
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
return(0);
}
+
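+/*
+ * knote_markstayqueued - mark a knote as permanently enqueued
+ *
+ * Set KN_STAYQUEUED and enqueue the knote under the kq lock.  Stay-queued
+ * knotes remain on the active queue so every scan re-evaluates them;
+ * knote_link_wait_queue() above uses this after wiring the knote to an
+ * external wait queue.
+ */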
+void
+knote_markstayqueued(struct knote *kn)
+{
+ kqlock(kn->kn_kq);
+ kn->kn_status |= KN_STAYQUEUED;
+ knote_enqueue(kn);
+ kqunlock(kn->kn_kq);
+}