]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/kern/sys_pipe.c
xnu-1699.32.7.tar.gz
[apple/xnu.git] / bsd / kern / sys_pipe.c
index 3190f00f1c688b1422841d1f3bc9d6db70b97f7e..27f2461b4854a284c3ba5352afe6a789d5311c84 100644 (file)
 #include <sys/sysproto.h>
 #include <sys/proc_info.h>
 
-#include <bsm/audit_kernel.h>
+#include <security/audit/audit.h>
 
 #include <sys/kdebug.h>
 
 
 #endif
 
-
 /*
  * interfaces to the outside world
  */
@@ -179,6 +178,8 @@ static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
 static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
                vfs_context_t ctx);
 
+static int pipe_drain(struct fileproc *fp,vfs_context_t ctx);
+
 
 struct  fileops pipeops =
   { pipe_read,
@@ -187,17 +188,23 @@ struct  fileops pipeops =
     pipe_select,
     pipe_close,
     pipe_kqfilter,
-    NULL };
+    pipe_drain };
 
 
 static void    filt_pipedetach(struct knote *kn);
 static int     filt_piperead(struct knote *kn, long hint);
 static int     filt_pipewrite(struct knote *kn, long hint);
 
-static struct filterops pipe_rfiltops =
-       { 1, NULL, filt_pipedetach, filt_piperead };
-static struct filterops pipe_wfiltops =
-       { 1, NULL, filt_pipedetach, filt_pipewrite };
+static struct filterops pipe_rfiltops = {
+        .f_isfd = 1,
+        .f_detach = filt_pipedetach,
+        .f_event = filt_piperead,
+};
+static struct filterops pipe_wfiltops = {
+        .f_isfd = 1,
+        .f_detach = filt_pipedetach,
+        .f_event = filt_pipewrite,
+};
 
 /*
  * Default pipe buffer size(s), this can be kind-of large now because pipe
@@ -224,17 +231,17 @@ int maxpipekva = 1024 * 1024 * 16;
 #if PIPE_SYSCTLS
 SYSCTL_DECL(_kern_ipc);
 
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
           &maxpipekva, 0, "Pipe KVA limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
+SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED,
           &maxpipekvawired, 0, "Pipe KVA wired limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED,
           &amountpipes, 0, "Current # of pipes");
-SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED,
           &nbigpipe, 0, "Current # of big pipes");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
           &amountpipekva, 0, "Pipe KVA usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
+SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED,
           &amountpipekvawired, 0, "Pipe wired KVA usage");
 #endif
 
@@ -264,12 +271,31 @@ static lck_grp_attr_t     *pipe_mtx_grp_attr;
 
 static zone_t pipe_zone;
 
+#define        PIPE_GARBAGE_AGE_LIMIT          5000    /* In milliseconds */
+#define PIPE_GARBAGE_QUEUE_LIMIT       32000
+
+struct pipe_garbage {
+       struct pipe             *pg_pipe;
+       struct pipe_garbage     *pg_next;
+       uint64_t                pg_timestamp;
+};
+
+static zone_t pipe_garbage_zone;
+static struct pipe_garbage *pipe_garbage_head = NULL;
+static struct pipe_garbage *pipe_garbage_tail = NULL;
+static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
+static int pipe_garbage_count = 0;
+static lck_mtx_t *pipe_garbage_lock;
+
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 
 void
 pipeinit(void)
 {
-        pipe_zone = (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone");
+       vm_size_t zone_size;
+
+       zone_size = 8192 * sizeof(struct pipe);
+        pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
 
        /*
         * allocate lock group attribute and group for pipe mutexes
@@ -281,6 +307,15 @@ pipeinit(void)
         * allocate the lock attribute for pipe mutexes
         */
        pipe_mtx_attr = lck_attr_alloc_init();
+
+       /*
+        * Set up garbage collection for dead pipes
+        */
+       zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
+           sizeof(struct pipe_garbage);
+        pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
+           zone_size, 4096, "pipe garbage zone");
+       pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
 }
 
 /* Bitmap for things to touch in pipe_touch() */
@@ -319,7 +354,7 @@ pipe_touch(struct pipe *tpipe, int touch)
 
 /* ARGSUSED */
 int
-pipe(proc_t p, __unused struct pipe_args *uap, register_t *retval)
+pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
 {
        struct fileproc *rf, *wf;
        struct pipe *rpipe, *wpipe;
@@ -490,7 +525,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
                * address of this pipe's struct pipe.  This number may be recycled
                * relatively quickly.
                */
-               sb64->st_ino = (ino64_t)((uint32_t)cpipe);
+               sb64->st_ino = (ino64_t)((uintptr_t)cpipe);
        } else {
                sb = (struct stat *)ub; 
 
@@ -517,7 +552,7 @@ pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
                * address of this pipe's struct pipe.  This number may be recycled
                * relatively quickly.
                */
-               sb->st_ino = (ino_t)cpipe;
+               sb->st_ino = (ino_t)(uintptr_t)cpipe;
        }
        PIPE_UNLOCK(cpipe);
 
@@ -557,8 +592,8 @@ pipespace(struct pipe *cpipe, int size)
        cpipe->pipe_buffer.out = 0;
        cpipe->pipe_buffer.cnt = 0;
 
-       OSAddAtomic(1, (SInt32 *)&amountpipes);
-       OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva);
+       OSAddAtomic(1, &amountpipes);
+       OSAddAtomic(cpipe->pipe_buffer.size, &amountpipekva);
 
        return (0);
 }
@@ -734,8 +769,9 @@ pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
                         * detect EOF condition
                         * read returns 0 on EOF, no need to set error
                         */
-                       if (rpipe->pipe_state & PIPE_EOF)
+                       if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                                break;
+                       }
 
                        /*
                         * If the "write-side" has been blocked, wake it up now.
@@ -975,7 +1011,7 @@ retry:
                    PRIBIO | PCATCH, "pipdww", 0);
                if (error)
                        goto error1;
-               if (wpipe->pipe_state & PIPE_EOF) {
+               if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                        error = EPIPE;
                        goto error1;
                }
@@ -992,7 +1028,7 @@ retry:
                    PRIBIO | PCATCH, "pipdwc", 0);
                if (error)
                        goto error1;
-               if (wpipe->pipe_state & PIPE_EOF) {
+               if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                        error = EPIPE;
                        goto error1;
                }
@@ -1013,7 +1049,7 @@ retry:
 
        error = 0;
        while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
-               if (wpipe->pipe_state & PIPE_EOF) {
+               if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                        pipelock(wpipe, 0);
                        PIPE_UNLOCK(wpipe);
                        pipe_destroy_write_buffer(wpipe);
@@ -1072,7 +1108,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
        /*
         * detect loss of pipe read side, issue SIGPIPE if lost.
         */
-       if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) {
+       if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
                PIPE_UNLOCK(rpipe);
                return (EPIPE);
        }
@@ -1125,7 +1161,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
                if ((error = pipelock(wpipe, 1)) == 0) {
                        PIPE_UNLOCK(wpipe);
                        if (pipespace(wpipe, pipe_size) == 0)
-                               OSAddAtomic(1, (SInt32 *)&nbigpipe);
+                               OSAddAtomic(1, &nbigpipe);
                        PIPE_LOCK(wpipe);
                        pipeunlock(wpipe);
 
@@ -1169,7 +1205,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
                 */
                if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
                    (fp->f_flag & FNONBLOCK) == 0 &&
-                   amountpipekvawired + uio->uio_resid < maxpipekvawired) { 
+                   amountpipekvawired + uio_resid(uio) < maxpipekvawired) { 
                        error = pipe_direct_write(wpipe, uio);
                        if (error)
                                break;
@@ -1191,7 +1227,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
                        }
                        error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0);
 
-                       if (wpipe->pipe_state & PIPE_EOF)
+                       if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))
                                break;
                        if (error)
                                break;
@@ -1213,7 +1249,7 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
                                int size;       /* Transfer size */
                                int segsize;    /* first segment to transfer */
 
-                               if (wpipe->pipe_state & PIPE_EOF) {
+                               if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                                        pipeunlock(wpipe);
                                        error = EPIPE;
                                        break;
@@ -1324,6 +1360,16 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
                                error = EAGAIN;
                                break;
                        }
+
+                       /*
+                        * If read side wants to go away, we just issue a signal
+                        * to ourselves.
+                        */
+                       if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
+                               error = EPIPE;
+                               break;
+                       }       
+
                        /*
                         * We have no more space and have something to offer,
                         * wake up select/poll.
@@ -1336,14 +1382,6 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
 
                        if (error != 0)
                                break;
-                       /*
-                        * If read side wants to go away, we just issue a signal
-                        * to ourselves.
-                        */
-                       if (wpipe->pipe_state & PIPE_EOF) {
-                               error = EPIPE;
-                               break;
-                       }       
                }
        }
        --wpipe->pipe_busy;
@@ -1472,7 +1510,7 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
         case FREAD:
                if ((rpipe->pipe_state & PIPE_DIRECTW) ||
                    (rpipe->pipe_buffer.cnt > 0) ||
-                   (rpipe->pipe_state & PIPE_EOF)) {
+                   (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
 
                        retnum = 1;
                } else {
@@ -1482,7 +1520,9 @@ pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
                break;
 
         case FWRITE:
-               if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
+               if (wpipe)
+                       wpipe->pipe_state |= PIPE_WSELECT;
+               if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
                    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
                     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
 
@@ -1526,9 +1566,9 @@ pipe_free_kmem(struct pipe *cpipe)
 
        if (cpipe->pipe_buffer.buffer != NULL) {
                if (cpipe->pipe_buffer.size > PIPE_SIZE)
-                       OSAddAtomic(-1, (SInt32 *)&nbigpipe);
-               OSAddAtomic(-(cpipe->pipe_buffer.size), (SInt32 *)&amountpipekva);
-               OSAddAtomic(-1, (SInt32 *)&amountpipes);
+                       OSAddAtomic(-1, &nbigpipe);
+               OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva);
+               OSAddAtomic(-1, &amountpipes);
 
                kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer,
                          cpipe->pipe_buffer.size);
@@ -1549,6 +1589,78 @@ pipe_free_kmem(struct pipe *cpipe)
 #endif
 }
 
+/*
+ * When a thread sets a write-select on a pipe, it creates an implicit,
+ * untracked dependency between that thread and the peer of the pipe
+ * on which the select is set.  If the peer pipe is closed and freed
+ * before the select()ing thread wakes up, the system will panic as
+ * it attempts to unwind the dangling select().  To avoid that panic,
+ * we notice whenever a dangerous select() is set on a pipe, and
+ * defer the final deletion of the pipe until that select()s are all
+ * resolved.  Since we can't currently detect exactly when that
+ * resolution happens, we use a simple garbage collection queue to 
+ * reap the at-risk pipes 'later'.
+ */
+static void
+pipe_garbage_collect(struct pipe *cpipe)
+{
+       uint64_t old, now;
+       struct pipe_garbage *pgp;
+
+       /* Convert msecs to nsecs and then to abstime */
+       old = pipe_garbage_age_limit * 1000000;
+       nanoseconds_to_absolutetime(old, &old);
+
+       lck_mtx_lock(pipe_garbage_lock);
+
+       /* Free anything that's been on the queue for <mumble> seconds */
+       now = mach_absolute_time();
+       old = now - old;
+       while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
+               pipe_garbage_head = pgp->pg_next;
+               if (pipe_garbage_head == NULL)
+                       pipe_garbage_tail = NULL;
+               pipe_garbage_count--;
+               zfree(pipe_zone, pgp->pg_pipe);
+               zfree(pipe_garbage_zone, pgp);
+       }
+
+       /* Add the new pipe (if any) to the tail of the garbage queue */
+       if (cpipe) {
+               cpipe->pipe_state = PIPE_DEAD;
+               pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
+               if (pgp == NULL) {
+                       /*
+                        * We're too low on memory to garbage collect the
+                        * pipe.  Freeing it runs the risk of panicing the
+                        * system.  All we can do is leak it and leave
+                        * a breadcrumb behind.  The good news, such as it
+                        * is, is that this will probably never happen.
+                        * We will probably hit the panic below first.
+                        */
+                       printf("Leaking pipe %p - no room left in the queue",
+                           cpipe);
+                       lck_mtx_unlock(pipe_garbage_lock);
+                       return;
+               }
+
+               pgp->pg_pipe = cpipe;
+               pgp->pg_timestamp = now;
+               pgp->pg_next = NULL;
+
+               if (pipe_garbage_tail)
+                       pipe_garbage_tail->pg_next = pgp;
+               pipe_garbage_tail = pgp;
+               if (pipe_garbage_head == NULL)
+                       pipe_garbage_head = pipe_garbage_tail;
+
+               if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT)
+                       panic("Length of pipe garbage queue exceeded %d",
+                           PIPE_GARBAGE_QUEUE_LIMIT);
+       }
+       lck_mtx_unlock(pipe_garbage_lock);
+}
+
 /*
  * shutdown the pipe
  */
@@ -1569,6 +1681,7 @@ pipeclose(struct pipe *cpipe)
         * If the other side is blocked, wake it up saying that
         * we want to close it down.
         */
+       cpipe->pipe_state &= ~PIPE_DRAIN;
        cpipe->pipe_state |= PIPE_EOF;
        pipeselwakeup(cpipe, cpipe);
        
@@ -1592,6 +1705,7 @@ pipeclose(struct pipe *cpipe)
         */
        if ((ppipe = cpipe->pipe_peer) != NULL) {
 
+               ppipe->pipe_state &= ~(PIPE_DRAIN);
                ppipe->pipe_state |= PIPE_EOF;
 
                pipeselwakeup(ppipe, ppipe);
@@ -1625,8 +1739,12 @@ pipeclose(struct pipe *cpipe)
                }
        }
        pipe_free_kmem(cpipe);
-
-       zfree(pipe_zone, cpipe);
+       if (cpipe->pipe_state & PIPE_WSELECT) {
+               pipe_garbage_collect(cpipe);
+       } else {
+               zfree(pipe_zone, cpipe);
+               pipe_garbage_collect(NULL);
+       }
 }
 
 /*ARGSUSED*/
@@ -1725,13 +1843,19 @@ filt_piperead(struct knote *kn, long hint)
        if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
                kn->kn_data = rpipe->pipe_map.cnt;
 #endif
-       if ((rpipe->pipe_state & PIPE_EOF) ||
-           (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+       if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
+           (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
                kn->kn_flags |= EV_EOF;
                retval = 1;
        } else {
-               retval = (kn->kn_sfflags & NOTE_LOWAT) ?
-                        (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0);
+               int64_t lowwat = 1;
+               if (kn->kn_sfflags & NOTE_LOWAT) {
+                       if (rpipe->pipe_buffer.size && kn->kn_sdata > rpipe->pipe_buffer.size)
+                               lowwat = rpipe->pipe_buffer.size;
+                       else if (kn->kn_sdata > lowwat)
+                               lowwat = kn->kn_sdata;
+               }
+               retval = kn->kn_data >= lowwat;
        }
 
        if (hint == 0)
@@ -1758,7 +1882,7 @@ filt_pipewrite(struct knote *kn, long hint)
 
        wpipe = rpipe->pipe_peer;
 
-       if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+       if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
                kn->kn_data = 0;
                kn->kn_flags |= EV_EOF; 
 
@@ -1768,17 +1892,24 @@ filt_pipewrite(struct knote *kn, long hint)
        }
        kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
        if (!kn->kn_data && wpipe->pipe_buffer.size == 0)
-               kn->kn_data = 1; /* unwritten pipe is ready for write */
+               kn->kn_data = PIPE_BUF; /* unwritten pipe is ready for write */
 
 #ifndef PIPE_NODIRECT
        if (wpipe->pipe_state & PIPE_DIRECTW)
                kn->kn_data = 0;
 #endif
+       int64_t lowwat = PIPE_BUF;
+       if (kn->kn_sfflags & NOTE_LOWAT) {
+               if (wpipe->pipe_buffer.size && kn->kn_sdata > wpipe->pipe_buffer.size)
+                       lowwat = wpipe->pipe_buffer.size;
+               else if (kn->kn_sdata > lowwat)
+                       lowwat = kn->kn_sdata;
+       }
+       
        if (hint == 0)
                PIPE_UNLOCK(rpipe);
 
-       return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
-                                kn->kn_sdata : PIPE_BUF));
+       return (kn->kn_data >= lowwat);
 }
 
 int
@@ -1863,3 +1994,36 @@ fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
 
        return (0);
 }
+
+
+static int 
+pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
+{
+
+       /* Note: fdlock already held */
+       struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
+
+       if (cpipe) {
+               PIPE_LOCK(cpipe);
+               cpipe->pipe_state |= PIPE_DRAIN; 
+               cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+               wakeup(cpipe);
+               
+               /* Must wake up peer: a writer sleeps on the read side */
+               if ((ppipe = cpipe->pipe_peer)) {
+                       ppipe->pipe_state |= PIPE_DRAIN;
+                       ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
+                       wakeup(ppipe);
+               }
+               
+               PIPE_UNLOCK(cpipe);
+               return 0;
+       }
+
+       return 1;
+}
+
+
+
+
+