bsd/kern/sys_pipe.c

   1 /*
   2  * Copyright (c) 1996 John S. Dyson
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice immediately at the beginning of the file, without modification,
  10  *    this list of conditions, and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. Absolutely no warranty of function or purpose is made by the author
  15  *    John S. Dyson.
  16  * 4. Modifications may be freely made to this file if the above conditions
  17  *    are met.
  18  */
  19 /*
  20  * Copyright (c) 2003-2014 Apple Inc. All rights reserved.
  21  *
  22  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  23  *
  24  * This file contains Original Code and/or Modifications of Original Code
  25  * as defined in and that are subject to the Apple Public Source License
  26  * Version 2.0 (the 'License'). You may not use this file except in
  27  * compliance with the License. The rights granted to you under the License
  28  * may not be used to create, or enable the creation or redistribution of,
  29  * unlawful or unlicensed copies of an Apple operating system, or to
  30  * circumvent, violate, or enable the circumvention or violation of, any
  31  * terms of an Apple operating system software license agreement.
  32  *
  33  * Please obtain a copy of the License at
  34  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  35  *
  36  * The Original Code and all software distributed under the License are
  37  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  38  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  39  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  40  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  41  * Please see the License for the specific language governing rights and
  42  * limitations under the License.
  43  *
  44  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  45  */
  46 /*
  47  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  48  * support for mandatory and extensible security protections.  This notice
  49  * is included in support of clause 2.2 (b) of the Apple Public License,
  50  * Version 2.0.
  51  */
  52
  53 /*
  54  * This file contains a high-performance replacement for the socket-based
  55  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
  56  * all features of sockets, but does do everything that pipes normally
  57  * do.
  58  *
  59  * Pipes are implemented as circular buffers. Following are the valid states in pipes operations
  60  *
  61  *      _________________________________
  62  * 1.  |_________________________________| r=w, c=0
  63  *
  64  *      _________________________________
  65  * 2.  |__r:::::wc_______________________| r <= w , c > 0
  66  *
  67  *      _________________________________
  68  * 3.  |::::wc_____r:::::::::::::::::::::| r>w , c > 0
  69  *
  70  *      _________________________________
  71  * 4.  |:::::::wrc:::::::::::::::::::::::| w=r, c = Max size
  72  *
  73  *
  74  *  Nomenclature:-
  75  *  a-z define the steps in a program flow
  76  *  1-4 are the states as defined above
  77  *  Action: is what file operation is done on the pipe
  78  *
  79  *  Current:None  Action: initialize with size M=200
  80  *  a. State 1 ( r=0, w=0, c=0)
  81  *
  82  *  Current: a    Action: write(100) (w < M)
  83  *  b. State 2 (r=0, w=100, c=100)
  84  *
  85  *  Current: b    Action: write(100) (w = M-w)
  86  *  c. State 4 (r=0,w=0,c=200)
  87  *
  88  *  Current: b    Action: read(70)  ( r < c )
  89  *  d. State 2(r=70,w=100,c=30)
  90  *
  91  *  Current: d    Action: write(75) ( w < (m-w))
  92  *  e. State 2 (r=70,w=175,c=105)
  93  *
  94  *  Current: d    Action: write(110) ( w > (m-w))
  95  *  f. State 3 (r=70,w=10,c=140)
  96  *
  97  *  Current: d    Action: read(30) (r >= c )
  98  *  g. State 1 (r=100,w=100,c=0)
  99  *
 100  */
 101
 102 /*
 103  * This code creates half duplex pipe buffers for facilitating file like
 104  * operations on pipes. The initial buffer is very small, but this can
 105  * dynamically change to larger sizes based on usage. The buffer size is never
 106  * reduced. The total amount of kernel memory used is governed by maxpipekva.
 107  * If the dynamic expansion limit is reached, the output thread is blocked
 108  * until the pipe buffer empties enough to continue.
 109  *
 110  * In order to limit the resource use of pipes, the following sysctl exists:
 111  *
 112  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 113  * address space available to us in pipe_map.
 114  *
 115  * Memory usage may be monitored through the sysctls
 116  * kern.ipc.pipes, kern.ipc.pipekva.
 117  *
 118  */
 119
 120 #include <sys/param.h>
 121 #include <sys/systm.h>
 122 #include <sys/filedesc.h>
 123 #include <sys/kernel.h>
 124 #include <sys/vnode.h>
 125 #include <sys/proc_internal.h>
 126 #include <sys/kauth.h>
 127 #include <sys/file_internal.h>
 128 #include <sys/stat.h>
 129 #include <sys/ioctl.h>
 130 #include <sys/fcntl.h>
 131 #include <sys/malloc.h>
 132 #include <sys/syslog.h>
 133 #include <sys/unistd.h>
 134 #include <sys/resourcevar.h>
 135 #include <sys/aio_kern.h>
 136 #include <sys/signalvar.h>
 137 #include <sys/pipe.h>
 138 #include <sys/sysproto.h>
 139 #include <sys/proc_info.h>
 140
 141 #include <security/audit/audit.h>
 142
 143 #include <sys/kdebug.h>
 144
 145 #include <kern/zalloc.h>
 146 #include <kern/kalloc.h>
 147 #include <vm/vm_kern.h>
 148 #include <libkern/OSAtomic.h>
 149
 150 #define f_flag f_fglob->fg_flag
 151 #define f_msgcount f_fglob->fg_msgcount
 152 #define f_cred f_fglob->fg_cred
 153 #define f_ops f_fglob->fg_ops
 154 #define f_offset f_fglob->fg_offset
 155 #define f_data f_fglob->fg_data
 156
 157 /*
 158  * interfaces to the outside world exported through file operations
 159  */
 160 static int pipe_read(struct fileproc *fp, struct uio *uio,
 161                 int flags, vfs_context_t ctx);
 162 static int pipe_write(struct fileproc *fp, struct uio *uio,
 163                 int flags, vfs_context_t ctx);
 164 static int pipe_close(struct fileglob *fg, vfs_context_t ctx);
 165 static int pipe_select(struct fileproc *fp, int which, void * wql,
 166                 vfs_context_t ctx);
 167 static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
 168                 vfs_context_t ctx);
 169 static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
 170                 vfs_context_t ctx);
 171 static int pipe_drain(struct fileproc *fp,vfs_context_t ctx);
 172
 173 static const struct fileops pipeops = {
 174         DTYPE_PIPE,
 175         pipe_read,
 176         pipe_write,
 177         pipe_ioctl,
 178         pipe_select,
 179         pipe_close,
 180         pipe_kqfilter,
 181         pipe_drain
 182 };
 183
 184 static void     filt_pipedetach(struct knote *kn);
 185 static int      filt_piperead(struct knote *kn, long hint);
 186 static int      filt_pipewrite(struct knote *kn, long hint);
 187
 188 static struct filterops pipe_rfiltops = {
 189         .f_isfd = 1,
 190         .f_detach = filt_pipedetach,
 191         .f_event = filt_piperead,
 192 };
 193
 194 static struct filterops pipe_wfiltops = {
 195         .f_isfd = 1,
 196         .f_detach = filt_pipedetach,
 197         .f_event = filt_pipewrite,
 198 };
 199
 200 static int nbigpipe;      /* for compatibility sake. no longer used */
 201 static int amountpipes;   /* total number of pipes in system */
 202 static int amountpipekva; /* total memory used by pipes */
 203
 204 int maxpipekva __attribute__((used)) = PIPE_KVAMAX;  /* allowing 16MB max. */
 205
 206 #if PIPE_SYSCTLS
 207 SYSCTL_DECL(_kern_ipc);
 208
 209 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
 210            &maxpipekva, 0, "Pipe KVA limit");
 211 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED,
 212            &maxpipekvawired, 0, "Pipe KVA wired limit");
 213 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED,
 214            &amountpipes, 0, "Current # of pipes");
 215 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED,
 216            &nbigpipe, 0, "Current # of big pipes");
 217 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED,
 218            &amountpipekva, 0, "Pipe KVA usage");
 219 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED,
 220            &amountpipekvawired, 0, "Pipe wired KVA usage");
 221 #endif
 222
 223 static void pipeclose(struct pipe *cpipe);
 224 static void pipe_free_kmem(struct pipe *cpipe);
 225 static int pipe_create(struct pipe **cpipep);
 226 static int pipespace(struct pipe *cpipe, int size);
 227 static int choose_pipespace(unsigned long current, unsigned long expected);
 228 static int expand_pipespace(struct pipe *p, int target_size);
 229 static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe);
 230 static __inline int pipeio_lock(struct pipe *cpipe, int catch);
 231 static __inline void pipeio_unlock(struct pipe *cpipe);
 232
 233 extern int postpipeevent(struct pipe *, int);
 234 extern void evpipefree(struct pipe *cpipe);
 235
 236 static lck_grp_t        *pipe_mtx_grp;
 237 static lck_attr_t       *pipe_mtx_attr;
 238 static lck_grp_attr_t   *pipe_mtx_grp_attr;
 239
 240 static zone_t pipe_zone;
 241
 242 #define MAX_PIPESIZE(pipe)              ( MAX(PIPE_SIZE, (pipe)->pipe_buffer.size) )
 243
 244 #define PIPE_GARBAGE_AGE_LIMIT          5000    /* In milliseconds */
 245 #define PIPE_GARBAGE_QUEUE_LIMIT        32000
 246
 247 struct pipe_garbage {
 248         struct pipe             *pg_pipe;
 249         struct pipe_garbage     *pg_next;
 250         uint64_t                pg_timestamp;
 251 };
 252
 253 static zone_t pipe_garbage_zone;
 254 static struct pipe_garbage *pipe_garbage_head = NULL;
 255 static struct pipe_garbage *pipe_garbage_tail = NULL;
 256 static uint64_t pipe_garbage_age_limit = PIPE_GARBAGE_AGE_LIMIT;
 257 static int pipe_garbage_count = 0;
 258 static lck_mtx_t *pipe_garbage_lock;
 259 static void pipe_garbage_collect(struct pipe *cpipe);
 260
 261 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 262
 263 /* initial setup done at time of sysinit */
 264 void
 265 pipeinit(void)
 266 {
 267         nbigpipe=0;
 268         vm_size_t zone_size;
 269
 270         zone_size = 8192 * sizeof(struct pipe);
 271         pipe_zone = zinit(sizeof(struct pipe), zone_size, 4096, "pipe zone");
 272
 273
 274         /* allocate lock group attribute and group for pipe mutexes */
 275         pipe_mtx_grp_attr = lck_grp_attr_alloc_init();
 276         pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr);
 277
 278         /* allocate the lock attribute for pipe mutexes */
 279         pipe_mtx_attr = lck_attr_alloc_init();
 280
 281         /*
 282          * Set up garbage collection for dead pipes
 283          */
 284         zone_size = (PIPE_GARBAGE_QUEUE_LIMIT + 20) *
 285             sizeof(struct pipe_garbage);
 286         pipe_garbage_zone = (zone_t)zinit(sizeof(struct pipe_garbage),
 287             zone_size, 4096, "pipe garbage zone");
 288         pipe_garbage_lock = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr);
 289
 290 }
 291
 292 /* Bitmap for things to touch in pipe_touch() */
 293 #define PIPE_ATIME      0x00000001      /* time of last access */
 294 #define PIPE_MTIME      0x00000002      /* time of last modification */
 295 #define PIPE_CTIME      0x00000004      /* time of last status change */
 296
 297 static void
 298 pipe_touch(struct pipe *tpipe, int touch)
 299 {
 300         struct timeval now;
 301
 302         microtime(&now);
 303
 304         if (touch & PIPE_ATIME) {
 305                 tpipe->st_atimespec.tv_sec  = now.tv_sec;
 306                 tpipe->st_atimespec.tv_nsec = now.tv_usec * 1000;
 307         }
 308
 309         if (touch & PIPE_MTIME) {
 310                 tpipe->st_mtimespec.tv_sec  = now.tv_sec;
 311                 tpipe->st_mtimespec.tv_nsec = now.tv_usec * 1000;
 312         }
 313
 314         if (touch & PIPE_CTIME) {
 315                 tpipe->st_ctimespec.tv_sec  = now.tv_sec;
 316                 tpipe->st_ctimespec.tv_nsec = now.tv_usec * 1000;
 317         }
 318 }
 319
 320 static const unsigned int pipesize_blocks[] = {512,1024,2048,4096, 4096 * 2, PIPE_SIZE , PIPE_SIZE * 4 };
 321
 322 /*
 323  * finds the right size from possible sizes in pipesize_blocks
 324  * returns the size which matches max(current,expected)
 325  */
 326 static int
 327 choose_pipespace(unsigned long current, unsigned long expected)
 328 {
 329         int i = sizeof(pipesize_blocks)/sizeof(unsigned int) -1;
 330         unsigned long target;
 331
 332         /*
 333          * assert that we always get an atomic transaction sized pipe buffer,
 334          * even if the system pipe buffer high-water mark has been crossed.
 335          */
 336         assert(PIPE_BUF == pipesize_blocks[0]);
 337
 338         if (expected > current)
 339                 target = expected;
 340         else
 341                 target = current;
 342
 343         while ( i >0 && pipesize_blocks[i-1] > target) {
 344                 i=i-1;
 345
 346         }
 347
 348         return pipesize_blocks[i];
 349 }
 350
 351
 352 /*
 353  * expand the size of pipe while there is data to be read,
 354  * and then free the old buffer once the current buffered
 355  * data has been transferred to new storage.
 356  * Required: PIPE_LOCK and io lock to be held by caller.
 357  * returns 0 on success or no expansion possible
 358  */
 359 static int
 360 expand_pipespace(struct pipe *p, int target_size)
 361 {
 362         struct pipe tmp, oldpipe;
 363         int error;
 364         tmp.pipe_buffer.buffer = 0;
 365
 366         if (p->pipe_buffer.size >= (unsigned) target_size) {
 367                 return 0; /* the existing buffer is max size possible */
 368         }
 369
 370         /* create enough space in the target */
 371         error = pipespace(&tmp, target_size);
 372         if (error != 0)
 373                 return (error);
 374
 375         oldpipe.pipe_buffer.buffer = p->pipe_buffer.buffer;
 376         oldpipe.pipe_buffer.size = p->pipe_buffer.size;
 377
 378         memcpy(tmp.pipe_buffer.buffer, p->pipe_buffer.buffer, p->pipe_buffer.size);
 379         if (p->pipe_buffer.cnt > 0 && p->pipe_buffer.in <= p->pipe_buffer.out ){
 380                 /* we are in State 3 and need extra copying for read to be consistent */
 381                 memcpy(&tmp.pipe_buffer.buffer[p->pipe_buffer.size], p->pipe_buffer.buffer, p->pipe_buffer.size);
 382                 p->pipe_buffer.in += p->pipe_buffer.size;
 383         }
 384
 385         p->pipe_buffer.buffer = tmp.pipe_buffer.buffer;
 386         p->pipe_buffer.size = tmp.pipe_buffer.size;
 387
 388
 389         pipe_free_kmem(&oldpipe);
 390         return 0;
 391 }
 392
 393 /*
 394  * The pipe system call for the DTYPE_PIPE type of pipes
 395  *
 396  * returns:
 397  *  FREAD  | fd0 | -->[struct rpipe] --> |~~buffer~~| \
 398  *                                                    (pipe_mutex)
 399  *  FWRITE | fd1 | -->[struct wpipe] --X              /
 400  */
 401
 402 /* ARGSUSED */
 403 int
 404 pipe(proc_t p, __unused struct pipe_args *uap, int32_t *retval)
 405 {
 406         struct fileproc *rf, *wf;
 407         struct pipe *rpipe, *wpipe;
 408         lck_mtx_t   *pmtx;
 409         int fd, error;
 410
 411         if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL)
 412                 return (ENOMEM);
 413
 414         rpipe = wpipe = NULL;
 415         if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
 416                 error = ENFILE;
 417                 goto freepipes;
 418         }
 419         /*
 420          * allocate the space for the normal I/O direction up
 421          * front... we'll delay the allocation for the other
 422          * direction until a write actually occurs (most likely it won't)...
 423          */
 424         error = pipespace(rpipe, choose_pipespace(rpipe->pipe_buffer.size, 0));
 425         if (error)
 426                 goto freepipes;
 427
 428         TAILQ_INIT(&rpipe->pipe_evlist);
 429         TAILQ_INIT(&wpipe->pipe_evlist);
 430
 431         error = falloc(p, &rf, &fd, vfs_context_current());
 432         if (error) {
 433                 goto freepipes;
 434         }
 435         retval[0] = fd;
 436
 437         /*
 438          * for now we'll create half-duplex pipes(refer returns section above).
 439          * this is what we've always supported..
 440          */
 441         rf->f_flag = FREAD;
 442         rf->f_data = (caddr_t)rpipe;
 443         rf->f_ops = &pipeops;
 444
 445         error = falloc(p, &wf, &fd, vfs_context_current());
 446         if (error) {
 447                 fp_free(p, retval[0], rf);
 448                 goto freepipes;
 449         }
 450         wf->f_flag = FWRITE;
 451         wf->f_data = (caddr_t)wpipe;
 452         wf->f_ops = &pipeops;
 453
 454         rpipe->pipe_peer = wpipe;
 455         wpipe->pipe_peer = rpipe;
 456         /* both structures share the same mutex */
 457         rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
 458
 459         retval[1] = fd;
 460 #if CONFIG_MACF
 461         /*
 462          * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
 463          *
 464          * struct pipe represents a pipe endpoint.  The MAC label is shared
 465          * between the connected endpoints.  As a result mac_pipe_label_init() and
 466          * mac_pipe_label_associate() should only be called on one of the endpoints
 467          * after they have been connected.
 468          */
 469         mac_pipe_label_init(rpipe);
 470         mac_pipe_label_associate(kauth_cred_get(), rpipe);
 471         wpipe->pipe_label = rpipe->pipe_label;
 472 #endif
 473         proc_fdlock_spin(p);
 474         procfdtbl_releasefd(p, retval[0], NULL);
 475         procfdtbl_releasefd(p, retval[1], NULL);
 476         fp_drop(p, retval[0], rf, 1);
 477         fp_drop(p, retval[1], wf, 1);
 478         proc_fdunlock(p);
 479
 480
 481         return (0);
 482
 483 freepipes:
 484         pipeclose(rpipe);
 485         pipeclose(wpipe);
 486         lck_mtx_free(pmtx, pipe_mtx_grp);
 487
 488         return (error);
 489 }
 490
 491 int
 492 pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
 493 {
 494 #if CONFIG_MACF
 495         int error;
 496 #endif
 497         int     pipe_size = 0;
 498         int     pipe_count;
 499         struct stat *sb = (struct stat *)0;     /* warning avoidance ; protected by isstat64 */
 500         struct stat64 * sb64 = (struct stat64 *)0;  /* warning avoidance ; protected by isstat64 */
 501
 502         if (cpipe == NULL)
 503                 return (EBADF);
 504         PIPE_LOCK(cpipe);
 505
 506 #if CONFIG_MACF
 507         error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
 508         if (error) {
 509                 PIPE_UNLOCK(cpipe);
 510                 return (error);
 511         }
 512 #endif
 513         if (cpipe->pipe_buffer.buffer == 0) {
 514                 /* must be stat'ing the write fd */
 515                 if (cpipe->pipe_peer) {
 516                         /* the peer still exists, use it's info */
 517                         pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
 518                         pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
 519                 } else {
 520                         pipe_count = 0;
 521                 }
 522         } else {
 523                 pipe_size  = MAX_PIPESIZE(cpipe);
 524                 pipe_count = cpipe->pipe_buffer.cnt;
 525         }
 526         /*
 527          * since peer's buffer is setup ouside of lock
 528          * we might catch it in transient state
 529          */
 530         if (pipe_size == 0)
 531                 pipe_size  = MAX(PIPE_SIZE, pipesize_blocks[0]);
 532
 533         if (isstat64 != 0) {
 534                 sb64 = (struct stat64 *)ub;
 535
 536                 bzero(sb64, sizeof(*sb64));
 537                 sb64->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 538                 sb64->st_blksize = pipe_size;
 539                 sb64->st_size = pipe_count;
 540                 sb64->st_blocks = (sb64->st_size + sb64->st_blksize - 1) / sb64->st_blksize;
 541
 542                 sb64->st_uid = kauth_getuid();
 543                 sb64->st_gid = kauth_getgid();
 544
 545                 sb64->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
 546                 sb64->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
 547
 548                 sb64->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
 549                 sb64->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
 550
 551                 sb64->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
 552                 sb64->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
 553
 554                 /*
 555                 * Return a relatively unique inode number based on the current
 556                 * address of this pipe's struct pipe.  This number may be recycled
 557                 * relatively quickly.
 558                 */
 559                 sb64->st_ino = (ino64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
 560         } else {
 561                 sb = (struct stat *)ub;
 562
 563                 bzero(sb, sizeof(*sb));
 564                 sb->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 565                 sb->st_blksize = pipe_size;
 566                 sb->st_size = pipe_count;
 567                 sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
 568
 569                 sb->st_uid = kauth_getuid();
 570                 sb->st_gid = kauth_getgid();
 571
 572                 sb->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
 573                 sb->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
 574
 575                 sb->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
 576                 sb->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
 577
 578                 sb->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
 579                 sb->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
 580
 581                 /*
 582                 * Return a relatively unique inode number based on the current
 583                 * address of this pipe's struct pipe.  This number may be recycled
 584                 * relatively quickly.
 585                 */
 586                 sb->st_ino = (ino_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
 587         }
 588         PIPE_UNLOCK(cpipe);
 589
 590         /*
 591          * POSIX: Left as 0: st_dev, st_nlink, st_rdev, st_flags, st_gen,
 592          * st_uid, st_gid.
 593          *
 594          * XXX (st_dev) should be unique, but there is no device driver that
 595          * XXX is associated with pipes, since they are implemented via a
 596          * XXX struct fileops indirection rather than as FS objects.
 597          */
 598         return (0);
 599 }
 600
 601
 602 /*
 603  * Allocate kva for pipe circular buffer, the space is pageable
 604  * This routine will 'realloc' the size of a pipe safely, if it fails
 605  * it will retain the old buffer.
 606  * If it fails it will return ENOMEM.
 607  */
 608 static int
 609 pipespace(struct pipe *cpipe, int size)
 610 {
 611         vm_offset_t buffer;
 612
 613         if (size <= 0)
 614                 return(EINVAL);
 615
 616         if ((buffer = (vm_offset_t)kalloc(size)) == 0 )
 617                 return(ENOMEM);
 618
 619         /* free old resources if we're resizing */
 620         pipe_free_kmem(cpipe);
 621         cpipe->pipe_buffer.buffer = (caddr_t)buffer;
 622         cpipe->pipe_buffer.size = size;
 623         cpipe->pipe_buffer.in = 0;
 624         cpipe->pipe_buffer.out = 0;
 625         cpipe->pipe_buffer.cnt = 0;
 626
 627         OSAddAtomic(1, &amountpipes);
 628         OSAddAtomic(cpipe->pipe_buffer.size, &amountpipekva);
 629
 630         return (0);
 631 }
 632
 633 /*
 634  * initialize and allocate VM and memory for pipe
 635  */
 636 static int
 637 pipe_create(struct pipe **cpipep)
 638 {
 639         struct pipe *cpipe;
 640         cpipe = (struct pipe *)zalloc(pipe_zone);
 641
 642         if ((*cpipep = cpipe) == NULL)
 643                 return (ENOMEM);
 644
 645         /*
 646          * protect so pipespace or pipeclose don't follow a junk pointer
 647          * if pipespace() fails.
 648          */
 649         bzero(cpipe, sizeof *cpipe);
 650
 651         /* Initial times are all the time of creation of the pipe */
 652         pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
 653         return (0);
 654 }
 655
 656
 657 /*
 658  * lock a pipe for I/O, blocking other access
 659  */
 660 static inline int
 661 pipeio_lock(struct pipe *cpipe, int catch)
 662 {
 663         int error;
 664         while (cpipe->pipe_state & PIPE_LOCKFL) {
 665                 cpipe->pipe_state |= PIPE_LWANT;
 666                 error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO,
 667                                "pipelk", 0);
 668                 if (error != 0)
 669                         return (error);
 670         }
 671         cpipe->pipe_state |= PIPE_LOCKFL;
 672         return (0);
 673 }
 674
 675 /*
 676  * unlock a pipe I/O lock
 677  */
 678 static inline void
 679 pipeio_unlock(struct pipe *cpipe)
 680 {
 681         cpipe->pipe_state &= ~PIPE_LOCKFL;
 682         if (cpipe->pipe_state & PIPE_LWANT) {
 683                 cpipe->pipe_state &= ~PIPE_LWANT;
 684                 wakeup(cpipe);
 685         }
 686 }
 687
 688 /*
 689  * wakeup anyone whos blocked in select
 690  */
 691 static void
 692 pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
 693 {
 694         if (cpipe->pipe_state & PIPE_SEL) {
 695                 cpipe->pipe_state &= ~PIPE_SEL;
 696                 selwakeup(&cpipe->pipe_sel);
 697         }
 698         if (cpipe->pipe_state & PIPE_KNOTE)
 699                KNOTE(&cpipe->pipe_sel.si_note, 1);
 700
 701         postpipeevent(cpipe, EV_RWBYTES);
 702
 703         if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) {
 704                 if (spipe->pipe_pgid < 0)
 705                         gsignal(-spipe->pipe_pgid, SIGIO);
 706                 else
 707                         proc_signal(spipe->pipe_pgid, SIGIO);
 708         }
 709 }
 710
 711 /*
 712  * Read n bytes from the buffer. Semantics are similar to file read.
 713  * returns: number of bytes read from the buffer
 714  */
 715 /* ARGSUSED */
 716 static int
 717 pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
 718         __unused vfs_context_t ctx)
 719 {
 720         struct pipe *rpipe = (struct pipe *)fp->f_data;
 721         int error;
 722         int nread = 0;
 723         u_int size;
 724
 725         PIPE_LOCK(rpipe);
 726         ++rpipe->pipe_busy;
 727
 728         error = pipeio_lock(rpipe, 1);
 729         if (error)
 730                 goto unlocked_error;
 731
 732 #if CONFIG_MACF
 733         error = mac_pipe_check_read(kauth_cred_get(), rpipe);
 734         if (error)
 735                 goto locked_error;
 736 #endif
 737
 738
 739         while (uio_resid(uio)) {
 740                 /*
 741                  * normal pipe buffer receive
 742                  */
 743                 if (rpipe->pipe_buffer.cnt > 0) {
 744                         /*
 745                          * # bytes to read is min( bytes from read pointer until end of buffer,
 746                          *                         total unread bytes,
 747                          *                         user requested byte count)
 748                          */
 749                         size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
 750                         if (size > rpipe->pipe_buffer.cnt)
 751                                 size = rpipe->pipe_buffer.cnt;
 752                         // LP64todo - fix this!
 753                         if (size > (u_int) uio_resid(uio))
 754                                 size = (u_int) uio_resid(uio);
 755
 756                         PIPE_UNLOCK(rpipe); /* we still hold io lock.*/
 757                         error = uiomove(
 758                             &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
 759                             size, uio);
 760                         PIPE_LOCK(rpipe);
 761                         if (error)
 762                                 break;
 763
 764                         rpipe->pipe_buffer.out += size;
 765                         if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
 766                                 rpipe->pipe_buffer.out = 0;
 767
 768                         rpipe->pipe_buffer.cnt -= size;
 769
 770                         /*
 771                          * If there is no more to read in the pipe, reset
 772                          * its pointers to the beginning.  This improves
 773                          * cache hit stats.
 774                          */
 775                         if (rpipe->pipe_buffer.cnt == 0) {
 776                                 rpipe->pipe_buffer.in = 0;
 777                                 rpipe->pipe_buffer.out = 0;
 778                         }
 779                         nread += size;
 780                 } else {
 781                         /*
 782                          * detect EOF condition
 783                          * read returns 0 on EOF, no need to set error
 784                          */
 785                         if (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
 786                                 break;
 787                         }
 788
 789                         /*
 790                          * If the "write-side" has been blocked, wake it up now.
 791                          */
 792                         if (rpipe->pipe_state & PIPE_WANTW) {
 793                                 rpipe->pipe_state &= ~PIPE_WANTW;
 794                                 wakeup(rpipe);
 795                         }
 796
 797                         /*
 798                          * Break if some data was read in previous iteration.
 799                          */
 800                         if (nread > 0)
 801                                 break;
 802
 803                         /*
 804                          * Unlock the pipe buffer for our remaining processing.
 805                          * We will either break out with an error or we will
 806                          * sleep and relock to loop.
 807                          */
 808                         pipeio_unlock(rpipe);
 809
 810                         /*
 811                          * Handle non-blocking mode operation or
 812                          * wait for more data.
 813                          */
 814                         if (fp->f_flag & FNONBLOCK) {
 815                                 error = EAGAIN;
 816                         } else {
 817                                 rpipe->pipe_state |= PIPE_WANTR;
 818                                 error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0);
 819                                 if (error == 0)
 820                                         error = pipeio_lock(rpipe, 1);
 821                         }
 822                         if (error)
 823                                 goto unlocked_error;
 824                 }
 825         }
 826 #if CONFIG_MACF
 827 locked_error:
 828 #endif
 829         pipeio_unlock(rpipe);
 830
 831 unlocked_error:
 832         --rpipe->pipe_busy;
 833
 834         /*
 835          * PIPE_WANT processing only makes sense if pipe_busy is 0.
 836          */
 837         if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
 838                 rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
 839                 wakeup(rpipe);
 840         } else if (rpipe->pipe_buffer.cnt < rpipe->pipe_buffer.size) {
 841                 /*
 842                  * Handle write blocking hysteresis.
 843                  */
 844                 if (rpipe->pipe_state & PIPE_WANTW) {
 845                         rpipe->pipe_state &= ~PIPE_WANTW;
 846                         wakeup(rpipe);
 847                 }
 848         }
 849
 850         if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) > 0)
 851                 pipeselwakeup(rpipe, rpipe->pipe_peer);
 852
 853         /* update last read time */
 854         pipe_touch(rpipe, PIPE_ATIME);
 855
 856         PIPE_UNLOCK(rpipe);
 857
 858         return (error);
 859 }
 860
/*
 * pipe_write: write the bytes described by 'uio' into a pipe.
 *
 * Naming note: 'rpipe' is the pipe end attached to the descriptor being
 * written (fp->f_data); 'wpipe' is its peer (rpipe->pipe_peer).  The data
 * is stored in the *peer's* buffer, i.e. the buffer the other side reads
 * from.  Since pipes are unidirectional a write is meant to be read by the
 * other side only.
 *
 * Locking: the pipe mutex is taken via PIPE_LOCK(rpipe) for the whole call,
 * except that it is dropped around each uiomove() and while sleeping in
 * msleep().  The peer's buffer is additionally guarded by
 * pipeio_lock(wpipe)/pipeio_unlock(wpipe) while it is (re)allocated or
 * filled.  wpipe->pipe_busy is raised for the duration of the call and the
 * PIPE_WANT waiter is woken when it drops back to zero.
 *
 * Returns 0 on success, otherwise an errno value:
 *   EPIPE   - the peer is gone or is marked PIPE_DRAIN/PIPE_EOF
 *   ENOMEM  - the peer still has no buffer after the allocation attempt
 *   EAGAIN  - FNONBLOCK is set and there is no usable space
 *   other   - whatever mac_pipe_check_write(), pipeio_lock(), pipespace(),
 *             expand_pipespace(), uiomove() or msleep() reported
 */
static int
pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
        __unused vfs_context_t ctx)
{
        int error = 0;
        int orig_resid;
        int pipe_size;
        struct pipe *wpipe, *rpipe;
        // LP64todo - fix this!  (the residual count is narrowed to an int)
        orig_resid = uio_resid(uio);
        int space;

        rpipe = (struct pipe *)fp->f_data;

        PIPE_LOCK(rpipe);
        wpipe = rpipe->pipe_peer;

        /*
         * detect loss of pipe read side, issue SIGPIPE if lost.
         * (This function only returns EPIPE; any signal delivery is left
         * to the caller.)
         */
        if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
                PIPE_UNLOCK(rpipe);
                return (EPIPE);
        }
#if CONFIG_MACF
        /* Give the MAC policy a chance to veto the write. */
        error = mac_pipe_check_write(kauth_cred_get(), wpipe);
        if (error) {
                PIPE_UNLOCK(rpipe);
                return (error);
        }
#endif
        /*
         * Mark the peer busy; pipeclose() waits for pipe_busy to reach
         * zero before tearing a pipe down.
         */
        ++wpipe->pipe_busy;

        /* pipe_size != 0 below means "allocate or grow the buffer". */
        pipe_size = 0;

        /*
         * need to allocate some storage... we delay the allocation
         * until the first write on fd[0] to avoid allocating storage for both
         * 'pipe ends'... most pipes are half-duplex with the writes targeting
         * fd[1], so allocating space for both ends is a waste...
         *
         * A new size is also chosen when this write does not fit in the
         * free space and the global pipe memory budget (amountpipekva vs.
         * maxpipekva) has not been exhausted.
         */

        if ( wpipe->pipe_buffer.buffer == 0 || (
                (unsigned)orig_resid > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt &&
                amountpipekva < maxpipekva ) ) {

                pipe_size = choose_pipespace(wpipe->pipe_buffer.size, wpipe->pipe_buffer.cnt + orig_resid);
        }
        if (pipe_size) {
                /*
                 * need to do initial allocation or resizing of pipe
                 * holding both structure and io locks.
                 */
                if ((error = pipeio_lock(wpipe, 1)) == 0) {
                        /*
                         * An empty buffer is simply (re)allocated; a
                         * buffer holding data goes through
                         * expand_pipespace() instead.
                         */
                        if (wpipe->pipe_buffer.cnt == 0)
                                error = pipespace(wpipe, pipe_size);
                        else
                                error = expand_pipespace(wpipe, pipe_size);

                        pipeio_unlock(wpipe);

                        /*
                         * allocation failed: without any buffer at all the
                         * write cannot proceed, so this overrides whatever
                         * the allocator returned.
                         */
                        if (wpipe->pipe_buffer.buffer == 0)
                                error = ENOMEM;
                }
                if (error) {
                        /*
                         * If an error occurred unbusy and return, waking up any pending
                         * readers.
                         */
                        --wpipe->pipe_busy;
                        if ((wpipe->pipe_busy == 0) &&
                            (wpipe->pipe_state & PIPE_WANT)) {
                                wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
                                wakeup(wpipe);
                        }
                        PIPE_UNLOCK(rpipe);
                        return(error);
                }
        }

        /* Main transfer loop: runs until the uio is drained or an error hits. */
        while (uio_resid(uio)) {

        retrywrite:
                /* Free bytes currently available in the peer's buffer. */
                space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

                /*
                 * Writes of size <= PIPE_BUF must be atomic.
                 * If such a write does not fit in one piece, pretend there
                 * is no space so we take the wait/EAGAIN path below
                 * instead of writing a partial chunk.
                 */
                if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF))
                        space = 0;

                if (space > 0) {

                        if ((error = pipeio_lock(wpipe,1)) == 0) {
                                int size;       /* Transfer size */
                                int segsize;    /* first segment to transfer */

                                /*
                                 * Re-check for reader loss now that we
                                 * hold the io lock.
                                 */
                                if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                                        pipeio_unlock(wpipe);
                                        error = EPIPE;
                                        break;
                                }
                                /*
                                 * If a process blocked in pipeio_lock, our
                                 * value for space might be bad... the mutex
                                 * is dropped while we're blocked
                                 */
                                if (space > (int)(wpipe->pipe_buffer.size -
                                    wpipe->pipe_buffer.cnt)) {
                                        pipeio_unlock(wpipe);
                                        goto retrywrite;
                                }

                                /*
                                 * Transfer size is minimum of uio transfer
                                 * and free space in pipe buffer.
                                 */
                                // LP64todo - fix this!
                                if (space > uio_resid(uio))
                                        size = uio_resid(uio);
                                else
                                        size = space;
                                /*
                                 * First segment to transfer is minimum of
                                 * transfer size and contiguous space in
                                 * pipe buffer.  If first segment to transfer
                                 * is less than the transfer size, we've got
                                 * a wraparound in the buffer.
                                 */
                                segsize = wpipe->pipe_buffer.size -
                                        wpipe->pipe_buffer.in;
                                if (segsize > size)
                                        segsize = size;

                                /*
                                 * Transfer first segment.  The pipe mutex
                                 * is released around the copy; the io lock
                                 * taken above stays held across it.
                                 */

                                PIPE_UNLOCK(rpipe);
                                error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
                                                segsize, uio);
                                PIPE_LOCK(rpipe);

                                if (error == 0 && segsize < size) {
                                        /*
                                         * Transfer remaining part now, to
                                         * support atomic writes.  Wraparound
                                         * happened. (State 3)
                                         */
                                        if (wpipe->pipe_buffer.in + segsize !=
                                            wpipe->pipe_buffer.size)
                                                panic("Expected pipe buffer "
                                                    "wraparound disappeared");

                                        /* Second copy lands at the start of the buffer. */
                                        PIPE_UNLOCK(rpipe);
                                        error = uiomove(
                                            &wpipe->pipe_buffer.buffer[0],
                                            size - segsize, uio);
                                        PIPE_LOCK(rpipe);
                                }
                                /*
                                 * readers never know to read until count is updated.
                                 * The 'in' index and 'cnt' are therefore only
                                 * advanced once both copies have succeeded.
                                 */
                                if (error == 0) {
                                        wpipe->pipe_buffer.in += size;
                                        if (wpipe->pipe_buffer.in >
                                            wpipe->pipe_buffer.size) {
                                                /*
                                                 * 'in' ran past the end: it must equal the
                                                 * wrapped amount plus the buffer size, and is
                                                 * folded back to the wrapped amount.
                                                 */
                                                if (wpipe->pipe_buffer.in !=
                                                    size - segsize +
                                                    wpipe->pipe_buffer.size)
                                                        panic("Expected "
                                                            "wraparound bad");
                                                wpipe->pipe_buffer.in = size -
                                                    segsize;
                                        }

                                        wpipe->pipe_buffer.cnt += size;
                                        if (wpipe->pipe_buffer.cnt >
                                            wpipe->pipe_buffer.size)
                                                panic("Pipe buffer overflow");

                                }
                                pipeio_unlock(wpipe);
                        }
                        if (error)
                                break;

                } else {
                        /*
                         * If the "read-side" has been blocked, wake it up now.
                         */
                        if (wpipe->pipe_state & PIPE_WANTR) {
                                wpipe->pipe_state &= ~PIPE_WANTR;
                                wakeup(wpipe);
                        }
                        /*
                         * don't block on non-blocking I/O
                         * we'll do the pipeselwakeup on the way out
                         */
                        if (fp->f_flag & FNONBLOCK) {
                                error = EAGAIN;
                                break;
                        }

                        /*
                         * If read side wants to go away, we just issue a signal
                         * to ourselves.
                         */
                        if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) {
                                error = EPIPE;
                                break;
                        }

                        /*
                         * We have no more space and have something to offer,
                         * wake up select/poll.
                         */
                        pipeselwakeup(wpipe, wpipe);

                        /* Record that a writer is waiting, then sleep on the peer. */
                        wpipe->pipe_state |= PIPE_WANTW;

                        error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0);

                        /* A non-zero msleep() result ends the write with that error. */
                        if (error != 0)
                                break;
                }
        }
        /* Common exit: drop our busy reference on the peer. */
        --wpipe->pipe_busy;

        /* Last busy user gone: release anyone waiting on PIPE_WANT. */
        if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
                wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
                wakeup(wpipe);
        }
        if (wpipe->pipe_buffer.cnt > 0) {
                /*
                 * If there are any characters in the buffer, we wake up
                 * the reader if it was blocked waiting for data.
                 */
                if (wpipe->pipe_state & PIPE_WANTR) {
                        wpipe->pipe_state &= ~PIPE_WANTR;
                        wakeup(wpipe);
                }
                /*
                 * wake up thread blocked in select/poll or post the notification
                 */
                pipeselwakeup(wpipe, wpipe);
        }

        /*
         * Update modification, status change (# of bytes in pipe) times.
         * Both ends are touched, on error exits from the loop as well.
         */
        pipe_touch(rpipe, PIPE_MTIME | PIPE_CTIME);
        pipe_touch(wpipe, PIPE_MTIME | PIPE_CTIME);
        PIPE_UNLOCK(rpipe);

        return (error);
}
1117
1118 /*
1119  * we implement a very minimal set of ioctls for compatibility with sockets.
1120  */
1121 /* ARGSUSED 3 */
1122 static int
1123 pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
1124         __unused vfs_context_t ctx)
1125 {
1126         struct pipe *mpipe = (struct pipe *)fp->f_data;
1127 #if CONFIG_MACF
1128         int error;
1129 #endif
1130
1131         PIPE_LOCK(mpipe);
1132
1133 #if CONFIG_MACF
1134         error = mac_pipe_check_ioctl(kauth_cred_get(), mpipe, cmd);
1135         if (error) {
1136                 PIPE_UNLOCK(mpipe);
1137
1138                 return (error);
1139         }
1140 #endif
1141
1142         switch (cmd) {
1143
1144         case FIONBIO:
1145                 PIPE_UNLOCK(mpipe);
1146                 return (0);
1147
1148         case FIOASYNC:
1149                 if (*(int *)data) {
1150                         mpipe->pipe_state |= PIPE_ASYNC;
1151                 } else {
1152                         mpipe->pipe_state &= ~PIPE_ASYNC;
1153                 }
1154                 PIPE_UNLOCK(mpipe);
1155                 return (0);
1156
1157         case FIONREAD:
1158                 *(int *)data = mpipe->pipe_buffer.cnt;
1159                 PIPE_UNLOCK(mpipe);
1160                 return (0);
1161
1162         case TIOCSPGRP:
1163                 mpipe->pipe_pgid = *(int *)data;
1164
1165                 PIPE_UNLOCK(mpipe);
1166                 return (0);
1167
1168         case TIOCGPGRP:
1169                 *(int *)data = mpipe->pipe_pgid;
1170
1171                 PIPE_UNLOCK(mpipe);
1172                 return (0);
1173
1174         }
1175         PIPE_UNLOCK(mpipe);
1176         return (ENOTTY);
1177 }
1178
1179
1180 static int
1181 pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
1182 {
1183         struct pipe *rpipe = (struct pipe *)fp->f_data;
1184         struct pipe *wpipe;
1185         int    retnum = 0;
1186
1187         if (rpipe == NULL || rpipe == (struct pipe *)-1)
1188                 return (retnum);
1189
1190         PIPE_LOCK(rpipe);
1191
1192         wpipe = rpipe->pipe_peer;
1193
1194
1195 #if CONFIG_MACF
1196         /*
1197          * XXX We should use a per thread credential here; minimally, the
1198          * XXX process credential should have a persistent reference on it
1199          * XXX before being passed in here.
1200          */
1201         if (mac_pipe_check_select(vfs_context_ucred(ctx), rpipe, which)) {
1202                 PIPE_UNLOCK(rpipe);
1203                 return (0);
1204         }
1205 #endif
1206         switch (which) {
1207
1208         case FREAD:
1209                 if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1210                     (rpipe->pipe_buffer.cnt > 0) ||
1211                     (rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1212
1213                         retnum = 1;
1214                 } else {
1215                         rpipe->pipe_state |= PIPE_SEL;
1216                         selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1217                 }
1218                 break;
1219
1220         case FWRITE:
1221                 if (wpipe)
1222                         wpipe->pipe_state |= PIPE_WSELECT;
1223                 if (wpipe == NULL || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1224                     (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1225                      (MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
1226
1227                         retnum = 1;
1228                 } else {
1229                         wpipe->pipe_state |= PIPE_SEL;
1230                         selrecord(vfs_context_proc(ctx), &wpipe->pipe_sel, wql);
1231                 }
1232                 break;
1233         case 0:
1234                 rpipe->pipe_state |= PIPE_SEL;
1235                 selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1236                 break;
1237         }
1238         PIPE_UNLOCK(rpipe);
1239
1240         return (retnum);
1241 }
1242
1243
1244 /* ARGSUSED 1 */
1245 static int
1246 pipe_close(struct fileglob *fg, __unused vfs_context_t ctx)
1247 {
1248         struct pipe *cpipe;
1249
1250         proc_fdlock_spin(vfs_context_proc(ctx));
1251         cpipe = (struct pipe *)fg->fg_data;
1252         fg->fg_data = NULL;
1253         proc_fdunlock(vfs_context_proc(ctx));
1254         if (cpipe)
1255                 pipeclose(cpipe);
1256
1257         return (0);
1258 }
1259
1260 static void
1261 pipe_free_kmem(struct pipe *cpipe)
1262 {
1263         if (cpipe->pipe_buffer.buffer != NULL) {
1264                 OSAddAtomic(-(cpipe->pipe_buffer.size), &amountpipekva);
1265                 OSAddAtomic(-1, &amountpipes);
1266                 kfree((void *)cpipe->pipe_buffer.buffer,
1267                           cpipe->pipe_buffer.size);
1268                 cpipe->pipe_buffer.buffer = NULL;
1269                 cpipe->pipe_buffer.size = 0;
1270         }
1271 }
1272
/*
 * shutdown the pipe
 *
 * Tears down one end ('cpipe') of a pipe pair:
 *   1. marks it PIPE_EOF and wakes select/poll waiters;
 *   2. waits until no reader/writer is still inside it (pipe_busy == 0);
 *   3. marks the peer (if any) PIPE_EOF, wakes it, and clears the peer's
 *      back pointer to us;
 *   4. releases the mutex (freeing it when no peer remains), frees the
 *      data buffer, and then frees the pipe structure itself, or queues
 *      it for deferred freeing when a write-select was ever placed on it.
 *
 * A NULL 'cpipe' is ignored.
 */
static void
pipeclose(struct pipe *cpipe)
{
        struct pipe *ppipe;

        if (cpipe == NULL)
                return;
        /* partially created pipes won't have a valid mutex. */
        if (PIPE_MTX(cpipe) != NULL)
                PIPE_LOCK(cpipe);


        /*
         * If the other side is blocked, wake it up saying that
         * we want to close it down.
         */
        cpipe->pipe_state &= ~PIPE_DRAIN;
        cpipe->pipe_state |= PIPE_EOF;
        pipeselwakeup(cpipe, cpipe);

        /*
         * Wait for in-flight users to leave.  PIPE_WANT asks the last
         * busy user to wake us when pipe_busy drops to zero.
         */
        while (cpipe->pipe_busy) {
                cpipe->pipe_state |= PIPE_WANT;

                wakeup(cpipe);
                msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
        }

#if CONFIG_MACF
        /*
         * Free the shared pipe label only after the two ends are disconnected.
         * (pipe_peer == NULL here means the other end has already gone
         * through the disconnect step below.)
         */
        if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
                mac_pipe_label_destroy(cpipe);
#endif

        /*
         * Disconnect from peer
         */
        if ((ppipe = cpipe->pipe_peer) != NULL) {

                /* The surviving end now sees EOF instead of a drain request. */
                ppipe->pipe_state &= ~(PIPE_DRAIN);
                ppipe->pipe_state |= PIPE_EOF;

                pipeselwakeup(ppipe, ppipe);
                wakeup(ppipe);

                /*
                 * NOTE(review): this tests PIPE_KNOTE on cpipe but posts to
                 * ppipe's note list -- confirm this asymmetry is intended.
                 */
                if (cpipe->pipe_state & PIPE_KNOTE)
                        KNOTE(&ppipe->pipe_sel.si_note, 1);

                postpipeevent(ppipe, EV_RCLOSED);

                /* The peer must no longer reference the end being freed. */
                ppipe->pipe_peer = NULL;
        }
        evpipefree(cpipe);

        /*
         * free resources
         */
        if (PIPE_MTX(cpipe) != NULL) {
                if (ppipe != NULL) {
                        /*
                         * since the mutex is shared and the peer is still
                         * alive, we need to release the mutex, not free it
                         */
                        PIPE_UNLOCK(cpipe);
                } else {
                        /*
                         * peer is gone, so we're the sole party left with
                         * interest in this mutex... unlock and free it
                         */
                        PIPE_UNLOCK(cpipe);
                        lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
                }
        }
        pipe_free_kmem(cpipe);
        if (cpipe->pipe_state & PIPE_WSELECT) {
                /*
                 * A write-select was placed on this pipe at some point:
                 * hand the structure to the garbage queue instead of
                 * freeing it right away.
                 */
                pipe_garbage_collect(cpipe);
        } else {
                /*
                 * Safe to free immediately; the NULL call only reaps
                 * entries already aged out of the garbage queue.
                 */
                zfree(pipe_zone, cpipe);
                pipe_garbage_collect(NULL);
        }

}
1359
1360 /*ARGSUSED*/
1361 static int
1362 pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
1363 {
1364         struct pipe *cpipe;
1365
1366         cpipe = (struct pipe *)kn->kn_fp->f_data;
1367
1368         PIPE_LOCK(cpipe);
1369 #if CONFIG_MACF
1370         /*
1371          * XXX We should use a per thread credential here; minimally, the
1372          * XXX process credential should have a persistent reference on it
1373          * XXX before being passed in here.
1374          */
1375         if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
1376                 PIPE_UNLOCK(cpipe);
1377                 return (1);
1378         }
1379 #endif
1380
1381         switch (kn->kn_filter) {
1382         case EVFILT_READ:
1383                 kn->kn_fop = &pipe_rfiltops;
1384
1385                 break;
1386         case EVFILT_WRITE:
1387                 kn->kn_fop = &pipe_wfiltops;
1388
1389                 if (cpipe->pipe_peer == NULL) {
1390                         /*
1391                          * other end of pipe has been closed
1392                          */
1393                         PIPE_UNLOCK(cpipe);
1394                         return (EPIPE);
1395                 }
1396                 if (cpipe->pipe_peer)
1397                 cpipe = cpipe->pipe_peer;
1398                 break;
1399         default:
1400                 PIPE_UNLOCK(cpipe);
1401                 return (1);
1402         }
1403
1404         if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn))
1405                 cpipe->pipe_state |= PIPE_KNOTE;
1406
1407         PIPE_UNLOCK(cpipe);
1408         return (0);
1409 }
1410
1411 static void
1412 filt_pipedetach(struct knote *kn)
1413 {
1414         struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1415
1416         PIPE_LOCK(cpipe);
1417
1418         if (kn->kn_filter == EVFILT_WRITE) {
1419                 if (cpipe->pipe_peer == NULL) {
1420                         PIPE_UNLOCK(cpipe);
1421                         return;
1422                 }
1423                 cpipe = cpipe->pipe_peer;
1424         }
1425         if (cpipe->pipe_state & PIPE_KNOTE) {
1426                 if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn))
1427                         cpipe->pipe_state &= ~PIPE_KNOTE;
1428         }
1429         PIPE_UNLOCK(cpipe);
1430 }
1431
1432 /*ARGSUSED*/
1433 static int
1434 filt_piperead(struct knote *kn, long hint)
1435 {
1436         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1437         struct pipe *wpipe;
1438         int    retval;
1439
1440         /*
1441          * if hint == 0, then we've been called from the kevent
1442          * world directly and do not currently hold the pipe mutex...
1443          * if hint == 1, we're being called back via the KNOTE post
1444          * we made in pipeselwakeup, and we already hold the mutex...
1445          */
1446         if (hint == 0)
1447                 PIPE_LOCK(rpipe);
1448
1449         wpipe = rpipe->pipe_peer;
1450         kn->kn_data = rpipe->pipe_buffer.cnt;
1451         if ((rpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) ||
1452             (wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1453                 kn->kn_flags |= EV_EOF;
1454                 retval = 1;
1455         } else {
1456                 int64_t lowwat = 1;
1457                 if (kn->kn_sfflags & NOTE_LOWAT) {
1458                         if (rpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(rpipe))
1459                                 lowwat = MAX_PIPESIZE(rpipe);
1460                         else if (kn->kn_sdata > lowwat)
1461                                 lowwat = kn->kn_sdata;
1462                 }
1463                 retval = kn->kn_data >= lowwat;
1464         }
1465
1466         if (hint == 0)
1467                 PIPE_UNLOCK(rpipe);
1468
1469         return (retval);
1470 }
1471
1472 /*ARGSUSED*/
1473 static int
1474 filt_pipewrite(struct knote *kn, long hint)
1475 {
1476         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1477         struct pipe *wpipe;
1478
1479         /*
1480          * if hint == 0, then we've been called from the kevent
1481          * world directly and do not currently hold the pipe mutex...
1482          * if hint == 1, we're being called back via the KNOTE post
1483          * we made in pipeselwakeup, and we already hold the mutex...
1484          */
1485         if (hint == 0)
1486                 PIPE_LOCK(rpipe);
1487
1488         wpipe = rpipe->pipe_peer;
1489
1490         if ((wpipe == NULL) || (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF))) {
1491                 kn->kn_data = 0;
1492                 kn->kn_flags |= EV_EOF;
1493
1494                 if (hint == 0)
1495                         PIPE_UNLOCK(rpipe);
1496                 return (1);
1497         }
1498         kn->kn_data = MAX_PIPESIZE(wpipe) - wpipe->pipe_buffer.cnt;
1499
1500         int64_t lowwat = PIPE_BUF;
1501         if (kn->kn_sfflags & NOTE_LOWAT) {
1502                 if (wpipe->pipe_buffer.size && kn->kn_sdata > MAX_PIPESIZE(wpipe))
1503                         lowwat = MAX_PIPESIZE(wpipe);
1504                 else if (kn->kn_sdata > lowwat)
1505                         lowwat = kn->kn_sdata;
1506         }
1507
1508         if (hint == 0)
1509                 PIPE_UNLOCK(rpipe);
1510
1511         return (kn->kn_data >= lowwat);
1512 }
1513
1514 int
1515 fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
1516 {
1517 #if CONFIG_MACF
1518         int error;
1519 #endif
1520         struct timeval now;
1521         struct vinfo_stat * ub;
1522         int pipe_size = 0;
1523         int pipe_count;
1524
1525         if (cpipe == NULL)
1526                 return (EBADF);
1527         PIPE_LOCK(cpipe);
1528
1529 #if CONFIG_MACF
1530         error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
1531         if (error) {
1532                 PIPE_UNLOCK(cpipe);
1533                 return (error);
1534         }
1535 #endif
1536         if (cpipe->pipe_buffer.buffer == 0) {
1537                 /*
1538                  * must be stat'ing the write fd
1539                  */
1540                 if (cpipe->pipe_peer) {
1541                         /*
1542                          * the peer still exists, use it's info
1543                          */
1544                         pipe_size  = MAX_PIPESIZE(cpipe->pipe_peer);
1545                         pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
1546                 } else {
1547                         pipe_count = 0;
1548                 }
1549         } else {
1550                 pipe_size  = MAX_PIPESIZE(cpipe);
1551                 pipe_count = cpipe->pipe_buffer.cnt;
1552         }
1553         /*
1554          * since peer's buffer is setup ouside of lock
1555          * we might catch it in transient state
1556          */
1557         if (pipe_size == 0)
1558                 pipe_size  = PIPE_SIZE;
1559
1560         ub = &pinfo->pipe_stat;
1561
1562         bzero(ub, sizeof(*ub));
1563         ub->vst_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
1564         ub->vst_blksize = pipe_size;
1565         ub->vst_size = pipe_count;
1566         if (ub->vst_blksize != 0)
1567                 ub->vst_blocks = (ub->vst_size + ub->vst_blksize - 1) / ub->vst_blksize;
1568         ub->vst_nlink = 1;
1569
1570         ub->vst_uid = kauth_getuid();
1571         ub->vst_gid = kauth_getgid();
1572
1573         microtime(&now);
1574         ub->vst_atime  = now.tv_sec;
1575         ub->vst_atimensec = now.tv_usec * 1000;
1576
1577         ub->vst_mtime  = now.tv_sec;
1578         ub->vst_mtimensec = now.tv_usec * 1000;
1579
1580         ub->vst_ctime  = now.tv_sec;
1581         ub->vst_ctimensec = now.tv_usec * 1000;
1582
1583         /*
1584          * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid.
1585          * XXX (st_dev, st_ino) should be unique.
1586          */
1587
1588         pinfo->pipe_handle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)cpipe);
1589         pinfo->pipe_peerhandle = (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(cpipe->pipe_peer));
1590         pinfo->pipe_status = cpipe->pipe_state;
1591
1592         PIPE_UNLOCK(cpipe);
1593
1594         return (0);
1595 }
1596
1597
1598 static int
1599 pipe_drain(struct fileproc *fp, __unused vfs_context_t ctx)
1600 {
1601
1602         /* Note: fdlock already held */
1603         struct pipe *ppipe, *cpipe = (struct pipe *)(fp->f_fglob->fg_data);
1604
1605         if (cpipe) {
1606                 PIPE_LOCK(cpipe);
1607                 cpipe->pipe_state |= PIPE_DRAIN;
1608                 cpipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1609                 wakeup(cpipe);
1610
1611                 /* Must wake up peer: a writer sleeps on the read side */
1612                 if ((ppipe = cpipe->pipe_peer)) {
1613                         ppipe->pipe_state |= PIPE_DRAIN;
1614                         ppipe->pipe_state &= ~(PIPE_WANTR | PIPE_WANTW);
1615                         wakeup(ppipe);
1616                 }
1617
1618                 PIPE_UNLOCK(cpipe);
1619                 return 0;
1620         }
1621
1622         return 1;
1623 }
1624
1625
1626  /*
1627  * When a thread sets a write-select on a pipe, it creates an implicit,
1628  * untracked dependency between that thread and the peer of the pipe
1629  * on which the select is set.  If the peer pipe is closed and freed
1630  * before the select()ing thread wakes up, the system will panic as
1631  * it attempts to unwind the dangling select().  To avoid that panic,
1632  * we notice whenever a dangerous select() is set on a pipe, and
 * defer the final deletion of the pipe until those select()s are all
1634  * resolved.  Since we can't currently detect exactly when that
1635  * resolution happens, we use a simple garbage collection queue to
1636  * reap the at-risk pipes 'later'.
1637  */
1638 static void
1639 pipe_garbage_collect(struct pipe *cpipe)
1640 {
1641         uint64_t old, now;
1642         struct pipe_garbage *pgp;
1643
1644         /* Convert msecs to nsecs and then to abstime */
1645         old = pipe_garbage_age_limit * 1000000;
1646         nanoseconds_to_absolutetime(old, &old);
1647
1648         lck_mtx_lock(pipe_garbage_lock);
1649
1650         /* Free anything that's been on the queue for <mumble> seconds */
1651         now = mach_absolute_time();
1652         old = now - old;
1653         while ((pgp = pipe_garbage_head) && pgp->pg_timestamp < old) {
1654                 pipe_garbage_head = pgp->pg_next;
1655                 if (pipe_garbage_head == NULL)
1656                         pipe_garbage_tail = NULL;
1657                 pipe_garbage_count--;
1658                 zfree(pipe_zone, pgp->pg_pipe);
1659                 zfree(pipe_garbage_zone, pgp);
1660         }
1661
1662         /* Add the new pipe (if any) to the tail of the garbage queue */
1663         if (cpipe) {
1664                 cpipe->pipe_state = PIPE_DEAD;
1665                 pgp = (struct pipe_garbage *)zalloc(pipe_garbage_zone);
1666                 if (pgp == NULL) {
1667                         /*
1668                          * We're too low on memory to garbage collect the
1669                          * pipe.  Freeing it runs the risk of panicing the
1670                          * system.  All we can do is leak it and leave
1671                          * a breadcrumb behind.  The good news, such as it
1672                          * is, is that this will probably never happen.
1673                          * We will probably hit the panic below first.
1674                          */
1675                         printf("Leaking pipe %p - no room left in the queue",
1676                             cpipe);
1677                         lck_mtx_unlock(pipe_garbage_lock);
1678                         return;
1679                 }
1680
1681                 pgp->pg_pipe = cpipe;
1682                 pgp->pg_timestamp = now;
1683                 pgp->pg_next = NULL;
1684
1685                 if (pipe_garbage_tail)
1686                         pipe_garbage_tail->pg_next = pgp;
1687                 pipe_garbage_tail = pgp;
1688                 if (pipe_garbage_head == NULL)
1689                         pipe_garbage_head = pipe_garbage_tail;
1690
1691                 if (pipe_garbage_count++ >= PIPE_GARBAGE_QUEUE_LIMIT)
1692                         panic("Length of pipe garbage queue exceeded %d",
1693                             PIPE_GARBAGE_QUEUE_LIMIT);
1694         }
1695         lck_mtx_unlock(pipe_garbage_lock);
1696 }
1697