#include <sys/sysproto.h>
#include <sys/proc_info.h>
-#include <bsm/audit_kernel.h>
+#include <security/audit/audit.h>
#include <sys/kdebug.h>
#endif
-
/*
* interfaces to the outside world
*/
static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
vfs_context_t ctx);
+static int pipe_drain(struct fileproc *fp,vfs_context_t ctx);
+
+/*
+ * The final fileops slot (the drain hook) was previously NULL; it now
+ * points at pipe_drain so a closing thread can kick sleepers out of a
+ * pipe before teardown (see pipe_drain below).
+ */
struct fileops pipeops =
{ pipe_read,
pipe_select,
pipe_close,
pipe_kqfilter,
- NULL };
+ pipe_drain };
static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);
-static struct filterops pipe_rfiltops =
- { 1, NULL, filt_pipedetach, filt_piperead };
-static struct filterops pipe_wfiltops =
- { 1, NULL, filt_pipedetach, filt_pipewrite };
+/*
+ * Kevent filter ops for the read and write ends of a pipe, converted
+ * from positional to designated initializers so each field is named
+ * explicitly; only the fields used are set, the rest default to
+ * zero/NULL (as in the old positional form).
+ */
+static struct filterops pipe_rfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_piperead,
+};
+static struct filterops pipe_wfiltops = {
+ .f_isfd = 1,
+ .f_detach = filt_pipedetach,
+ .f_event = filt_pipewrite,
+};
/*
* Default pipe buffer size(s), this can be kind-of large now because pipe
#if PIPE_SYSCTLS
SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
&maxpipekva, 0, "Pipe KVA limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED,
&maxpipekvawired, 0, "Pipe KVA wired limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED,
&amountpipes, 0, "Current # of pipes");
-SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED,
&nbigpipe, 0, "Current # of big pipes");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
&amountpipekva, 0, "Pipe KVA usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED,
&amountpipekvawired, 0, "Pipe wired KVA usage");
#endif
static zone_t pipe_zone;
+/*
+ * Deferred reclamation for pipes that were closed while a write-select
+ * might still reference them (see pipe_garbage_collect below for the
+ * full rationale).
+ */
+#define PIPE_GARBAGE_AGE_LIMIT 5000 /* In milliseconds */
+#define PIPE_GARBAGE_QUEUE_LIMIT 32000
+
+/* One FIFO entry per dead-but-not-yet-freed pipe. pg_timestamp is the
+ * mach_absolute_time() value captured when the entry was enqueued. */
+struct pipe_garbage {
+ struct pipe *pg_pipe;
+ struct pipe_garbage *pg_next;
+ uint64_t pg_timestamp;
+};
+
+/* Queue state; head/tail/count are guarded by pipe_garbage_lock. */
+static zone_t pipe_garbage_zone;
+static struct pipe_garbage *pipe_garbage_head = NULL;
+static struct pipe_garbage *pipe_garbage_tail = NULL;
+static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
+static int pipe_garbage_count = 0;
+static lck_mtx_t *pipe_garbage_lock;
+
+
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
+/*
+ * pipeinit: one-time initialization (run via SYSINIT above) of the
+ * pipe zone, the pipe lock machinery, and -- new here -- the zone and
+ * mutex used by the dead-pipe garbage collector.
+ */
void
pipeinit(void)
{
- pipe_zone = (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone");
+ vm_size_t zone_size;
+
+ zone_size = 8192 * sizeof(struct pipe);
+ pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
/*
* allocate lock group attribute and group for pipe mutexes
* allocate the lock attribute for pipe mutexes
*/
pipe_mtx_attr = lck_attr_alloc_init();
+
+ /*
+ * Set up garbage collection for dead pipes. The zone is sized for
+ * the queue hard limit plus a little slack, since exceeding
+ * PIPE_GARBAGE_QUEUE_LIMIT panics (see pipe_garbage_collect).
+ */
+ zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
+ sizeof(struct pipe_garbage);
+ pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
+ zone_size, 4096, "pipe garbage zone");
+ pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
}
/* Bitmap for things to touch in pipe_touch() */
/* ARGSUSED */
int
-pipe(proc_t p, __unused struct pipe_args *uap, register_t *retval)
+pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
{
struct fileproc *rf, *wf;
struct pipe *rpipe, *wpipe;
* address of this pipe's struct pipe. This number may be recycled
* relatively quickly.
*/
- sb64->st_ino = (ino64_t)((uint32_t)cpipe);
+ sb64->st_ino = (ino64_t)((uintptr_t)cpipe);
} else {
sb = (struct stat *)ub;
* address of this pipe's struct pipe. This number may be recycled
* relatively quickly.
*/
- sb->st_ino = (ino_t)cpipe;
+ sb->st_ino = (ino_t)(uintptr_t)cpipe;
}
PIPE_UNLOCK(cpipe);
cpipe->pipe_buffer.out = 0;
cpipe->pipe_buffer.cnt = 0;
- OSAddAtomic(1, (SInt32 *)&amountpipes);
- OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva);
+ OSAddAtomic(1, &amountpipes);
+ OSAddAtomic(cpipe->pipe_buffer.size, &amountpipekva);
return (0);
}
* detect EOF condition
* read returns 0 on EOF, no need to set error
*/
- if (rpipe->pipe_state & PIPE_EOF)
+ if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
break;
+ }
/*
* If the "write-side" has been blocked, wake it up now.
PRIBIO | PCATCH, "pipdww", 0);
if (error)
goto error1;
- if (wpipe->pipe_state & PIPE_EOF) {
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
error = EPIPE;
goto error1;
}
PRIBIO | PCATCH, "pipdwc", 0);
if (error)
goto error1;
- if (wpipe->pipe_state & PIPE_EOF) {
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
error = EPIPE;
goto error1;
}
error = 0;
while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
- if (wpipe->pipe_state & PIPE_EOF) {
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
pipelock(wpipe, 0);
PIPE_UNLOCK(wpipe);
pipe_destroy_write_buffer(wpipe);
/*
* detect loss of pipe read side, issue SIGPIPE if lost.
*/
- if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) {
+ if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
PIPE_UNLOCK(rpipe);
return (EPIPE);
}
if ((error = pipelock(wpipe, 1)) == 0) {
PIPE_UNLOCK(wpipe);
if (pipespace(wpipe, pipe_size) == 0)
- OSAddAtomic(1, (SInt32 *)&nbigpipe);
+ OSAddAtomic(1, &nbigpipe);
PIPE_LOCK(wpipe);
pipeunlock(wpipe);
*/
if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
(fp->f_flag & FNONBLOCK) == 0 &&
- amountpipekvawired + uio->uio_resid < maxpipekvawired) {
+ amountpipekvawired + uio_resid(uio) < maxpipekvawired) {
error = pipe_direct_write(wpipe, uio);
if (error)
break;
}
error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0);
- if (wpipe->pipe_state & PIPE_EOF)
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))
break;
if (error)
break;
int size; /* Transfer size */
int segsize; /* first segment to transfer */
- if (wpipe->pipe_state & PIPE_EOF) {
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
pipeunlock(wpipe);
error = EPIPE;
break;
error = EAGAIN;
break;
}
+
+ /*
+ * If read side wants to go away, we just issue a signal
+ * to ourselves.
+ */
+ if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
+ error = EPIPE;
+ break;
+ }
+
/*
* We have no more space and have something to offer,
* wake up select/poll.
if (error != 0)
break;
- /*
- * If read side wants to go away, we just issue a signal
- * to ourselves.
- */
- if (wpipe->pipe_state & PIPE_EOF) {
- error = EPIPE;
- break;
- }
}
}
--wpipe->pipe_busy;
case FREAD:
if ((rpipe->pipe_state & PIPE_DIRECTW) ||
(rpipe->pipe_buffer.cnt > 0) ||
- (rpipe->pipe_state & PIPE_EOF)) {
+ (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
retnum = 1;
} else {
break;
case FWRITE:
- if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
+ if (wpipe)
+ wpipe->pipe_state |= PIPE_WSELECT;
+ if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
(((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
if (cpipe->pipe_buffer.buffer != NULL) {
if (cpipe->pipe_buffer.size > PIPE_SIZE)
- OSAddAtomic(-1, (SInt32 *)&nbigpipe);
- OSAddAtomic(-(cpipe->pipe_buffer.size), (SInt32 *)&amountpipekva);
- OSAddAtomic(-1, (SInt32 *)&amountpipes);
+ OSAddAtomic(-1, &nbigpipe);
+ OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva);
+ OSAddAtomic(-1, &amountpipes);
kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer,
cpipe->pipe_buffer.size);
#endif
}
+/*
+ * When a thread sets a write-select on a pipe, it creates an implicit,
+ * untracked dependency between that thread and the peer of the pipe
+ * on which the select is set. If the peer pipe is closed and freed
+ * before the select()ing thread wakes up, the system will panic as
+ * it attempts to unwind the dangling select(). To avoid that panic,
+ * we notice whenever a dangerous select() is set on a pipe, and
+ * defer the final deletion of the pipe until that select()s are all
+ * resolved. Since we can't currently detect exactly when that
+ * resolution happens, we use a simple garbage collection queue to
+ * reap the at-risk pipes 'later'.
+ *
+ * Passing cpipe == NULL performs only the reaping pass: the caller
+ * freed its pipe directly and merely wants aged entries cleaned out.
+ */
+static void
+pipe_garbage_collect(struct pipe *cpipe)
+{
+ uint64_t old, now;
+ struct pipe_garbage *pgp;
+
+ /* Convert msecs to nsecs and then to abstime */
+ old = pipe_garbage_age_limit * 1000000;
+ nanoseconds_to_absolutetime(old, &old);
+
+ /* All queue head/tail/count manipulation happens under this lock. */
+ lck_mtx_lock(pipe_garbage_lock);
+
+ /* Free anything that's been on the queue for <mumble> seconds */
+ now = mach_absolute_time();
+ old = now - old;
+ while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
+ pipe_garbage_head = pgp->pg_next;
+ if (pipe_garbage_head == NULL)
+ pipe_garbage_tail = NULL;
+ pipe_garbage_count--;
+ zfree(pipe_zone, pgp->pg_pipe);
+ zfree(pipe_garbage_zone, pgp);
+ }
+
+ /* Add the new pipe (if any) to the tail of the garbage queue */
+ if (cpipe) {
+ /* NOTE(review): plain assignment (not |=) wipes any other
+ * state bits, leaving only PIPE_DEAD set. */
+ cpipe->pipe_state = PIPE_DEAD;
+ pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
+ if (pgp == NULL) {
+ /*
+ * We're too low on memory to garbage collect the
+ * pipe. Freeing it runs the risk of panicing the
+ * system. All we can do is leak it and leave
+ * a breadcrumb behind. The good news, such as it
+ * is, is that this will probably never happen.
+ * We will probably hit the panic below first.
+ */
+ printf("Leaking pipe %p - no room left in the queue",
+ cpipe);
+ lck_mtx_unlock(pipe_garbage_lock);
+ return;
+ }
+
+ pgp->pg_pipe = cpipe;
+ pgp->pg_timestamp = now;
+ pgp->pg_next = NULL;
+
+ if (pipe_garbage_tail)
+ pipe_garbage_tail->pg_next = pgp;
+ pipe_garbage_tail = pgp;
+ if (pipe_garbage_head == NULL)
+ pipe_garbage_head = pipe_garbage_tail;
+
+ if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT)
+ panic("Length of pipe garbage queue exceeded %d",
+ PIPE_GARBAGE_QUEUE_LIMIT);
+ }
+ lck_mtx_unlock(pipe_garbage_lock);
+}
+
+
/*
* shutdown the pipe
*/
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
+ cpipe->pipe_state &= ~PIPE_DRAIN;
cpipe->pipe_state |= PIPE_EOF;
pipeselwakeup(cpipe, cpipe);
*/
if ((ppipe = cpipe->pipe_peer) != NULL) {
+ ppipe->pipe_state &= ~(PIPE_DRAIN);
ppipe->pipe_state |= PIPE_EOF;
pipeselwakeup(ppipe, ppipe);
}
}
pipe_free_kmem(cpipe);
-
- zfree(pipe_zone, cpipe);
+ if (cpipe->pipe_state & PIPE_WSELECT) {
+ pipe_garbage_collect(cpipe);
+ } else {
+ zfree(pipe_zone, cpipe);
+ pipe_garbage_collect(NULL);
+ }
}
/*ARGSUSED*/
if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
kn->kn_data = rpipe->pipe_map.cnt;
#endif
- if ((rpipe->pipe_state & PIPE_EOF) ||
- (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+ (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
kn->kn_flags |= EV_EOF;
retval = 1;
} else {
- retval = (kn->kn_sfflags & NOTE_LOWAT) ?
- (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0);
+ int64_t lowwat = 1;
+ if (kn->kn_sfflags & NOTE_LOWAT) {
+ if (rpipe->pipe_buffer.size && kn->kn_sdata > rpipe->pipe_buffer.size)
+ lowwat = rpipe->pipe_buffer.size;
+ else if (kn->kn_sdata > lowwat)
+ lowwat = kn->kn_sdata;
+ }
+ retval = kn->kn_data >= lowwat;
}
if (hint == 0)
wpipe = rpipe->pipe_peer;
- if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
kn->kn_data = 0;
kn->kn_flags |= EV_EOF;
}
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
if (!kn->kn_data && wpipe->pipe_buffer.size == 0)
- kn->kn_data = 1; /* unwritten pipe is ready for write */
+ kn->kn_data = PIPE_BUF; /* unwritten pipe is ready for write */
#ifndef PIPE_NODIRECT
if (wpipe->pipe_state & PIPE_DIRECTW)
kn->kn_data = 0;
#endif
+ int64_t lowwat = PIPE_BUF;
+ if (kn->kn_sfflags & NOTE_LOWAT) {
+ if (wpipe->pipe_buffer.size && kn->kn_sdata > wpipe->pipe_buffer.size)
+ lowwat = wpipe->pipe_buffer.size;
+ else if (kn->kn_sdata > lowwat)
+ lowwat = kn->kn_sdata;
+ }
+
if (hint == 0)
PIPE_UNLOCK(rpipe);
- return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
- kn->kn_sdata : PIPE_BUF));
+ return (kn->kn_data >= lowwat);
}
int
return (0);
}
+
+
+/*
+ * pipe_drain: fileops drain handler -- force any threads sleeping on
+ * this pipe (or its peer) to wake up so the file can be torn down.
+ *
+ * Sets PIPE_DRAIN and clears PIPE_WANTR/PIPE_WANTW on both ends, then
+ * issues wakeups. Sleepers test PIPE_DRAIN together with PIPE_EOF
+ * throughout this file, so they abort their waits (writers see EPIPE,
+ * readers see end-of-file).
+ *
+ * Returns 0 on success, or 1 if the fileglob carries no pipe.
+ */
+static int
+pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
+{
+
+ /* Note: fdlock already held */
+ struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
+
+ if (cpipe) {
+ PIPE_LOCK(cpipe);
+ cpipe->pipe_state |= PIPE_DRAIN;
+ cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+ wakeup(cpipe);
+
+ /* Must wake up peer: a writer sleeps on the read side */
+ if ((ppipe = cpipe->pipe_peer)) {
+ ppipe->pipe_state |= PIPE_DRAIN;
+ ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+ wakeup(ppipe);
+ }
+
+ PIPE_UNLOCK(cpipe);
+ return 0;
+ }
+
+ return 1;
+}
+
+
+
+
+