bsd/kern/sys_pipe.c

   1 /*
   2  * Copyright (c) 1996 John S. Dyson
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice immediately at the beginning of the file, without modification,
  10  *    this list of conditions, and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. Absolutely no warranty of function or purpose is made by the author
  15  *    John S. Dyson.
  16  * 4. Modifications may be freely made to this file if the above conditions
  17  *    are met.
  18  */
  19 /*
  20  * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
  21  *
  22  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  23  *
  24  * This file contains Original Code and/or Modifications of Original Code
  25  * as defined in and that are subject to the Apple Public Source License
  26  * Version 2.0 (the 'License'). You may not use this file except in
  27  * compliance with the License. The rights granted to you under the License
  28  * may not be used to create, or enable the creation or redistribution of,
  29  * unlawful or unlicensed copies of an Apple operating system, or to
  30  * circumvent, violate, or enable the circumvention or violation of, any
  31  * terms of an Apple operating system software license agreement.
  32  *
  33  * Please obtain a copy of the License at
  34  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  35  *
  36  * The Original Code and all software distributed under the License are
  37  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  38  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  39  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  40  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  41  * Please see the License for the specific language governing rights and
  42  * limitations under the License.
  43  *
  44  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  45  */
  46 /*
  47  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  48  * support for mandatory and extensible security protections.  This notice
  49  * is included in support of clause 2.2 (b) of the Apple Public License,
  50  * Version 2.0.
  51  */
  52
  53 /*
  54  * This file contains a high-performance replacement for the socket-based
  55  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
  56  * all features of sockets, but does do everything that pipes normally
  57  * do.
  58  */
  59
  60 /*
  61  * This code has two modes of operation, a small write mode and a large
  62  * write mode.  The small write mode acts like conventional pipes with
  63  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
  64  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
  65  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
  66  * the receiving process can copy it directly from the pages in the sending
  67  * process.
  68  *
  69  * If the sending process receives a signal, it is possible that it will
  70  * go away, and certainly its address space can change, because control
  71  * is returned back to the user-mode side.  In that case, the pipe code
  72  * arranges to copy the buffer supplied by the user process, to a pageable
  73  * kernel buffer, and the receiving process will grab the data from the
  74  * pageable kernel buffer.  Since signals don't happen all that often,
  75  * the copy operation is normally eliminated.
  76  *
  77  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
  78  * happen for small transfers so that the system will not spend all of
  79  * its time context switching.
  80  *
  81  * In order to limit the resource use of pipes, two sysctls exist:
  82  *
  83  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
  84  * address space available to us in pipe_map.  Whenever the amount in use
  85  * exceeds half of this value, all new pipes will be created with size
  86  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
  87  * as well.  This value is loader tunable only.
  88  *
  89  * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
  90  * be wired in order to facilitate direct copies using page flipping.
  91  * Whenever this value is exceeded, pipes will fall back to using regular
  92  * copies.  This value is sysctl controllable at all times.
  93  *
  94  * These values are autotuned in subr_param.c.
  95  *
  96  * Memory usage may be monitored through the sysctls
  97  * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
  98  *
  99  */
 100
 101 #include <sys/param.h>
 102 #include <sys/systm.h>
 103 #include <sys/filedesc.h>
 104 #include <sys/kernel.h>
 105 #include <sys/vnode.h>
 106 #include <sys/proc_internal.h>
 107 #include <sys/kauth.h>
 108 #include <sys/file_internal.h>
 109 #include <sys/stat.h>
 110 #include <sys/ioctl.h>
 111 #include <sys/fcntl.h>
 112 #include <sys/malloc.h>
 113 #include <sys/syslog.h>
 114 #include <sys/unistd.h>
 115 #include <sys/resourcevar.h>
 116 #include <sys/aio_kern.h>
 117 #include <sys/signalvar.h>
 118 #include <sys/pipe.h>
 119 #include <sys/sysproto.h>
 120 #include <sys/proc_info.h>
 121
 122 #include <bsm/audit_kernel.h>
 123
 124 #include <sys/kdebug.h>
 125
 126 #include <kern/zalloc.h>
 127 #include <vm/vm_kern.h>
 128 #include <libkern/OSAtomic.h>
 129
  130 #define f_flag f_fglob->fg_flag         /* f_* shorthands: each name expands to the fg_* member reached through f_fglob, */
  131 #define f_type f_fglob->fg_type         /* so code in this file can write e.g. rf->f_data on a struct fileproc pointer.   */
  132 #define f_msgcount f_fglob->fg_msgcount
  133 #define f_cred f_fglob->fg_cred
  134 #define f_ops f_fglob->fg_ops
  135 #define f_offset f_fglob->fg_offset
  136 #define f_data f_fglob->fg_data
  137 /*
  138  * Use this define if you want to disable *fancy* VM things.  Expect an
  139  * approx 30% decrease in transfer rate.  This could be useful for
  140  * NetBSD or OpenBSD.
  141  *
  142  * this needs to be ported to X and the performance measured
  143  * before committing to supporting it
       *
       * While PIPE_NODIRECT is defined (as it is here), every
       * "#ifndef PIPE_NODIRECT" section of this file -- the direct-write
       * path and its extra VM includes -- is compiled out.
  144  */
  145 #define PIPE_NODIRECT  1
 146
 147 #ifndef PIPE_NODIRECT
 148
 149 #include <vm/vm.h>
 150 #include <vm/vm_param.h>
 151 #include <vm/vm_object.h>
 152 #include <vm/vm_kern.h>
 153 #include <vm/vm_extern.h>
 154 #include <vm/pmap.h>
 155 #include <vm/vm_map.h>
 156 #include <vm/vm_page.h>
 157 #include <vm/uma.h>
 158
 159 #endif
 160
 161
  162 /*
  163  * interfaces to the outside world
       *
       * These six handlers are the entries placed in the pipeops table that
       * follows.  All of them are static, so they are only reachable from
       * outside this file through that table.
  164  */
  165 static int pipe_read(struct fileproc *fp, struct uio *uio,
  166                 int flags, vfs_context_t ctx);
  167
  168 static int pipe_write(struct fileproc *fp, struct uio *uio,
  169                 int flags, vfs_context_t ctx);
  170
  171 static int pipe_close(struct fileglob *fg, vfs_context_t ctx);
  172
  173 static int pipe_select(struct fileproc *fp, int which, void * wql,
  174                 vfs_context_t ctx);
  175
  176 static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
  177                 vfs_context_t ctx);
  178
  179 static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
  180                 vfs_context_t ctx);
 181
 182
  183 struct  fileops pipeops =       /* handler table; pipe() stores its address in f_ops of both descriptors */
  184   { pipe_read,
  185     pipe_write,
  186     pipe_ioctl,
  187     pipe_select,
  188     pipe_close,
  189     pipe_kqfilter,
  190     NULL };                     /* NOTE(review): last slot is unused for pipes; its meaning comes from struct fileops, declared elsewhere */
 191
 192
  193 static void     filt_pipedetach(struct knote *kn);      /* knote callbacks referenced by the two tables below */
  194 static int      filt_piperead(struct knote *kn, long hint);
  195 static int      filt_pipewrite(struct knote *kn, long hint);
  196
      /*
       * Filter tables for the read side and the write side of a pipe.  They
       * share the same detach routine and differ only in the last entry
       * (filt_piperead vs. filt_pipewrite).
       * NOTE(review): the leading "1" and the NULL second entry are positional
       * struct filterops members whose meaning is declared elsewhere -- confirm
       * against that declaration before reordering.
       */
  197 static struct filterops pipe_rfiltops =
  198         { 1, NULL, filt_pipedetach, filt_piperead };
  199 static struct filterops pipe_wfiltops =
  200         { 1, NULL, filt_pipedetach, filt_pipewrite };
 201
  202 /*
  203  * Default pipe buffer size(s), this can be kind-of large now because pipe
  204  * space is pageable.  The pipe code will try to maintain locality of
  205  * reference for performance reasons, so small amounts of outstanding I/O
  206  * will not wipe the cache.
       *
       * MINPIPESIZE (a third of PIPE_SIZE) is the threshold pipe_read() uses
       * for write-blocking hysteresis: a writer flagged PIPE_WANTW is woken
       * once the buffered byte count has dropped below it.
  207  */
  208 #define MINPIPESIZE (PIPE_SIZE/3)
  209
  210 /*
  211  * Limit the number of "big" pipes
       *
       * nbigpipe is the running count.  Nothing in the portion of the file
       * visible here updates it -- presumably the write path does; verify.
  212  */
  213 #define LIMITBIGPIPES   32
  214 static int nbigpipe;
  215
      /* Global usage counters; pipespace() bumps both with OSAddAtomic(). */
  216 static int amountpipes;
  217 static int amountpipekva;
  218
  219 #ifndef PIPE_NODIRECT
  220 static int amountpipekvawired;
  221 #endif
      /* 16 MB.  pipe() creates new pipes with SMALL_PIPE_SIZE once amountpipekva exceeds half of this value. */
  222 int maxpipekva = 1024 * 1024 * 16;
 223
  224 #if PIPE_SYSCTLS                /* optional kern.ipc.* sysctl nodes exposing the counters and limits above */
  225 SYSCTL_DECL(_kern_ipc);
  226
      /*
       * NOTE(review): this section names maxpipekvawired, which is not declared
       * in the visible part of this file, and amountpipekvawired, which is only
       * declared when PIPE_NODIRECT is undefined.  With PIPE_NODIRECT defined,
       * turning PIPE_SYSCTLS on would presumably fail to build -- confirm
       * before enabling.
       */
  227 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD,
  228            &maxpipekva, 0, "Pipe KVA limit");
  229 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
  230            &maxpipekvawired, 0, "Pipe KVA wired limit");
  231 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
  232            &amountpipes, 0, "Current # of pipes");
  233 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
  234            &nbigpipe, 0, "Current # of big pipes");
  235 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
  236            &amountpipekva, 0, "Pipe KVA usage");
  237 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
  238            &amountpipekvawired, 0, "Pipe wired KVA usage");
  239 #endif
 240
  241 static void pipeclose(struct pipe *cpipe);      /* forward declarations of file-local helpers */
  242 static void pipe_free_kmem(struct pipe *cpipe);
  243 static int pipe_create(struct pipe **cpipep);
  244 static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe);
  245 static __inline int pipelock(struct pipe *cpipe, int catch);
  246 static __inline void pipeunlock(struct pipe *cpipe);
  247
      /* Direct-write helpers; only declared (and defined) when PIPE_NODIRECT is not set. */
  248 #ifndef PIPE_NODIRECT
  249 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
  250 static void pipe_destroy_write_buffer(struct pipe *wpipe);
  251 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
  252 static void pipe_clone_write_buffer(struct pipe *wpipe);
  253 #endif
  254
      /* Defined outside this file; pipeselwakeup() calls postpipeevent() with EV_RWBYTES. */
  255 extern int postpipeevent(struct pipe *, int);
  256 extern void evpipefree(struct pipe *cpipe);
  257
  258
  259 static int pipespace(struct pipe *cpipe, int size);
  260
      /* Lock group/attribute objects created once in pipeinit() and used by pipe() for each new pipe mutex. */
  261 static lck_grp_t        *pipe_mtx_grp;
  262 static lck_attr_t       *pipe_mtx_attr;
  263 static lck_grp_attr_t   *pipe_mtx_grp_attr;
  264
      /* Zone from which pipe_create() allocates every struct pipe; set up in pipeinit(). */
  265 static zone_t pipe_zone;
  266
  267 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); /* registers pipeinit as an init hook; NOTE(review): SYSINIT semantics are defined elsewhere */
 268
 269 void
 270 pipeinit(void)
 271 {
 272         pipe_zone = (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone");
 273
 274         /*
 275          * allocate lock group attribute and group for pipe mutexes
 276          */
 277         pipe_mtx_grp_attr = lck_grp_attr_alloc_init();
 278         pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr);
 279
 280         /*
 281          * allocate the lock attribute for pipe mutexes
 282          */
 283         pipe_mtx_attr = lck_attr_alloc_init();
 284 }
 285
 286 /* Bitmap for things to touch in pipe_touch() */
 287 #define PIPE_ATIME      0x00000001      /* time of last access */
 288 #define PIPE_MTIME      0x00000002      /* time of last modification */
 289 #define PIPE_CTIME      0x00000004      /* time of last status change */
 290
 291 static void
 292 pipe_touch(struct pipe *tpipe, int touch)
 293 {
 294         struct timeval now;
 295
 296         microtime(&now);
 297
 298         if (touch & PIPE_ATIME) {
 299                 tpipe->st_atimespec.tv_sec  = now.tv_sec;
 300                 tpipe->st_atimespec.tv_nsec = now.tv_usec * 1000;
 301         }
 302
 303         if (touch & PIPE_MTIME) {
 304                 tpipe->st_mtimespec.tv_sec  = now.tv_sec;
 305                 tpipe->st_mtimespec.tv_nsec = now.tv_usec * 1000;
 306         }
 307
 308         if (touch & PIPE_CTIME) {
 309                 tpipe->st_ctimespec.tv_sec  = now.tv_sec;
 310                 tpipe->st_ctimespec.tv_nsec = now.tv_usec * 1000;
 311         }
 312 }
 313
 314
 315
  316 /*
  317  * The pipe system call for the DTYPE_PIPE type of pipes
       *
       * Creates two struct pipe endpoints that share a single mutex and
       * point at each other through pipe_peer, wraps each in a new file
       * descriptor, and reports the descriptors to the caller:
       *   retval[0]  descriptor opened FREAD  (backed by rpipe)
       *   retval[1]  descriptor opened FWRITE (backed by wpipe)
       *
       * Only rpipe gets a buffer here.  uap is unused.
       *
       * Returns 0 on success, ENOMEM if the mutex cannot be allocated, ENFILE
       * if either pipe_create() fails, or the error from pipespace() or
       * falloc().  Every failure after the mutex allocation goes through the
       * freepipes label.
  318  */
  319
  320 /* ARGSUSED */
  321 int
  322 pipe(proc_t p, __unused struct pipe_args *uap, register_t *retval)
  323 {
  324         struct fileproc *rf, *wf;
  325         struct pipe *rpipe, *wpipe;
  326         lck_mtx_t   *pmtx;
  327         int fd, error;
  328
              /* one mutex serves both endpoints; it is attached to them further down */
  329         if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL)
  330                 return (ENOMEM);
  331
              /* start from NULL so freepipes sees a defined value for a pipe that was never created */
  332         rpipe = wpipe = NULL;
  333         if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
  334                 error = ENFILE;
  335                 goto freepipes;
  336         }
  337         /*
  338          * allocate the space for the normal I/O direction up
  339          * front... we'll delay the allocation for the other
  340          * direction until a write actually occurs (most
  341          * likely it won't)...
  342          *
  343          * Use SMALL_PIPE_SIZE instead of PIPE_SIZE once more than half
               * of maxpipekva is already in use.
  344          */
  345         if (amountpipekva > maxpipekva / 2)
  346                 error = pipespace(rpipe, SMALL_PIPE_SIZE);
  347         else
  348                 error = pipespace(rpipe, PIPE_SIZE);
  349         if (error)
  350                 goto freepipes;
  351
  352 #ifndef PIPE_NODIRECT
  353         rpipe->pipe_state |= PIPE_DIRECTOK;
  354         wpipe->pipe_state |= PIPE_DIRECTOK;
  355 #endif
  356         TAILQ_INIT(&rpipe->pipe_evlist);
  357         TAILQ_INIT(&wpipe->pipe_evlist);
  358
              /* descriptor for the read end */
  359         error = falloc(p, &rf, &fd, vfs_context_current());
  360         if (error) {
  361                 goto freepipes;
  362         }
  363         retval[0] = fd;
  364
  365         /*
  366          * for now we'll create half-duplex
  367          * pipes... this is what we've always
  368          * supported..
  369          */
  370         rf->f_flag = FREAD;
  371         rf->f_type = DTYPE_PIPE;
  372         rf->f_data = (caddr_t)rpipe;
  373         rf->f_ops = &pipeops;
  374
              /* descriptor for the write end; on failure the read descriptor is given back first */
  375         error = falloc(p, &wf, &fd, vfs_context_current());
  376         if (error) {
  377                 fp_free(p, retval[0], rf);
  378                 goto freepipes;
  379         }
  380         wf->f_flag = FWRITE;
  381         wf->f_type = DTYPE_PIPE;
  382         wf->f_data = (caddr_t)wpipe;
  383         wf->f_ops = &pipeops;
  384
              /* connect the endpoints and hand both the shared mutex */
  385         rpipe->pipe_peer = wpipe;
  386         wpipe->pipe_peer = rpipe;
  387         rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
  388
  389         retval[1] = fd;
  390 #if CONFIG_MACF
  391         /*
  392          * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
  393          *
  394          * struct pipe represents a pipe endpoint.  The MAC label is shared
  395          * between the connected endpoints.  As a result mac_pipe_label_init() and
  396          * mac_pipe_label_associate() should only be called on one of the endpoints
  397          * after they have been connected.
  398          */
  399         mac_pipe_label_init(rpipe);
  400         mac_pipe_label_associate(kauth_cred_get(), rpipe);
  401         wpipe->pipe_label = rpipe->pipe_label;
  402 #endif
              /*
               * Finish both descriptors under the process fd lock.
               * NOTE(review): procfdtbl_releasefd() and fp_drop() are defined
               * elsewhere; presumably they clear the "reserved" state of the
               * slots and drop the references taken by falloc() -- confirm.
               */
  403         proc_fdlock_spin(p);
  404         procfdtbl_releasefd(p, retval[0], NULL);
  405         procfdtbl_releasefd(p, retval[1], NULL);
  406         fp_drop(p, retval[0], rf, 1);
  407         fp_drop(p, retval[1], wf, 1);
  408         proc_fdunlock(p);
  409
  410
  411         return (0);
  412
              /*
               * Common failure exit.  Either pipe pointer may still be NULL here,
               * and neither pipe has pipe_mtxp set yet (that assignment is only
               * reached on the success path); pipeclose() is assumed to cope with
               * both -- its definition is outside this view, TODO confirm.  The
               * mutex is freed separately because it was never attached.
               */
  413 freepipes:
  414         pipeclose(rpipe);
  415         pipeclose(wpipe);
  416         lck_mtx_free(pmtx, pipe_mtx_grp);
  417
  418         return (error);
  419 }
 420
 421 int
 422 pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
 423 {
 424 #if CONFIG_MACF
 425         int error;
 426 #endif
 427         int     pipe_size = 0;
 428         int     pipe_count;
 429         struct stat *sb = (struct stat *)0;     /* warning avoidance ; protected by isstat64 */
 430         struct stat64 * sb64 = (struct stat64 *)0;  /* warning avoidance ; protected by isstat64 */
 431
 432         if (cpipe == NULL)
 433                 return (EBADF);
 434         PIPE_LOCK(cpipe);
 435
 436 #if CONFIG_MACF
 437         error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
 438         if (error) {
 439                 PIPE_UNLOCK(cpipe);
 440                 return (error);
 441         }
 442 #endif
 443         if (cpipe->pipe_buffer.buffer == 0) {
 444                 /*
 445                  * must be stat'ing the write fd
 446                  */
 447                 if (cpipe->pipe_peer) {
 448                         /*
 449                          * the peer still exists, use it's info
 450                          */
 451                         pipe_size  = cpipe->pipe_peer->pipe_buffer.size;
 452                         pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
 453                 } else {
 454                         pipe_count = 0;
 455                 }
 456         } else {
 457                 pipe_size  = cpipe->pipe_buffer.size;
 458                 pipe_count = cpipe->pipe_buffer.cnt;
 459         }
 460         /*
 461          * since peer's buffer is setup ouside of lock
 462          * we might catch it in transient state
 463          */
 464         if (pipe_size == 0)
 465                 pipe_size  = PIPE_SIZE;
 466
 467         if (isstat64 != 0) {
 468                 sb64 = (struct stat64 *)ub;
 469
 470                 bzero(sb64, sizeof(*sb64));
 471                 sb64->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 472                 sb64->st_blksize = pipe_size;
 473                 sb64->st_size = pipe_count;
 474                 sb64->st_blocks = (sb64->st_size + sb64->st_blksize - 1) / sb64->st_blksize;
 475
 476                 sb64->st_uid = kauth_getuid();
 477                 sb64->st_gid = kauth_getgid();
 478
 479                 sb64->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
 480                 sb64->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
 481
 482                 sb64->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
 483                 sb64->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
 484
 485                 sb64->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
 486                 sb64->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
 487
 488                 /*
 489                 * Return a relatively unique inode number based on the current
 490                 * address of this pipe's struct pipe.  This number may be recycled
 491                 * relatively quickly.
 492                 */
 493                 sb64->st_ino = (ino64_t)((uint32_t)cpipe);
 494         } else {
 495                 sb = (struct stat *)ub;
 496
 497                 bzero(sb, sizeof(*sb));
 498                 sb->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
 499                 sb->st_blksize = pipe_size;
 500                 sb->st_size = pipe_count;
 501                 sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
 502
 503                 sb->st_uid = kauth_getuid();
 504                 sb->st_gid = kauth_getgid();
 505
 506                 sb->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
 507                 sb->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;
 508
 509                 sb->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
 510                 sb->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;
 511
 512                 sb->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
 513                 sb->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;
 514
 515                 /*
 516                 * Return a relatively unique inode number based on the current
 517                 * address of this pipe's struct pipe.  This number may be recycled
 518                 * relatively quickly.
 519                 */
 520                 sb->st_ino = (ino_t)cpipe;
 521         }
 522         PIPE_UNLOCK(cpipe);
 523
 524         /*
 525          * POSIX: Left as 0: st_dev, st_nlink, st_rdev, st_flags, st_gen,
 526          * st_uid, st_gid.
 527          *
 528          * XXX (st_dev) should be unique, but there is no device driver that
 529          * XXX is associated with pipes, since they are implemented via a
 530          * XXX struct fileops indirection rather than as FS objects.
 531          */
 532         return (0);
 533 }
 534
 535
 536 /*
 537  * Allocate kva for pipe circular buffer, the space is pageable
 538  * This routine will 'realloc' the size of a pipe safely, if it fails
 539  * it will retain the old buffer.
 540  * If it fails it will return ENOMEM.
 541  */
 542 static int
 543 pipespace(struct pipe *cpipe, int size)
 544 {
 545         vm_offset_t buffer;
 546
 547         size = round_page(size);
 548
 549         if (kmem_alloc(kernel_map, &buffer, size) != KERN_SUCCESS)
 550                 return(ENOMEM);
 551
 552         /* free old resources if we're resizing */
 553         pipe_free_kmem(cpipe);
 554         cpipe->pipe_buffer.buffer = (caddr_t)buffer;
 555         cpipe->pipe_buffer.size = size;
 556         cpipe->pipe_buffer.in = 0;
 557         cpipe->pipe_buffer.out = 0;
 558         cpipe->pipe_buffer.cnt = 0;
 559
 560         OSAddAtomic(1, (SInt32 *)&amountpipes);
 561         OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva);
 562
 563         return (0);
 564 }
 565
 566 /*
 567  * initialize and allocate VM and memory for pipe
 568  */
 569 static int
 570 pipe_create(struct pipe **cpipep)
 571 {
 572         struct pipe *cpipe;
 573
 574         cpipe = (struct pipe *)zalloc(pipe_zone);
 575
 576         if ((*cpipep = cpipe) == NULL)
 577                 return (ENOMEM);
 578
 579         /*
 580          * protect so pipespace or pipeclose don't follow a junk pointer
 581          * if pipespace() fails.
 582          */
 583         bzero(cpipe, sizeof *cpipe);
 584
 585         /* Initial times are all the time of creation of the pipe */
 586         pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);
 587
 588         return (0);
 589 }
 590
 591
 592 /*
 593  * lock a pipe for I/O, blocking other access
 594  */
 595 static inline int
 596 pipelock(struct pipe *cpipe, int catch)
 597 {
 598         int error;
 599
 600         while (cpipe->pipe_state & PIPE_LOCKFL) {
 601                 cpipe->pipe_state |= PIPE_LWANT;
 602
 603                 error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO,
 604                                "pipelk", 0);
 605                 if (error != 0)
 606                         return (error);
 607         }
 608         cpipe->pipe_state |= PIPE_LOCKFL;
 609
 610         return (0);
 611 }
 612
 613 /*
 614  * unlock a pipe I/O lock
 615  */
 616 static inline void
 617 pipeunlock(struct pipe *cpipe)
 618 {
 619         cpipe->pipe_state &= ~PIPE_LOCKFL;
 620
 621         if (cpipe->pipe_state & PIPE_LWANT) {
 622                 cpipe->pipe_state &= ~PIPE_LWANT;
 623                 wakeup(cpipe);
 624         }
 625 }
 626
 627 static void
 628 pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
 629 {
 630         if (cpipe->pipe_state & PIPE_SEL) {
 631                 cpipe->pipe_state &= ~PIPE_SEL;
 632                 selwakeup(&cpipe->pipe_sel);
 633         }
 634         if (cpipe->pipe_state & PIPE_KNOTE)
 635                KNOTE(&cpipe->pipe_sel.si_note, 1);
 636
 637         postpipeevent(cpipe, EV_RWBYTES);
 638
 639         if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) {
 640                 if (spipe->pipe_pgid < 0)
 641                         gsignal(-spipe->pipe_pgid, SIGIO);
 642                 else
 643                         proc_signal(spipe->pipe_pgid, SIGIO);
 644         }
 645 }
 646
  647 /* ARGSUSED */
      /*
       * pipe_read: fileops read handler.  Copies buffered bytes from the pipe
       * stored in fp->f_data into the caller's uio.
       *
       * The loop runs while the uio still has room.  It copies whatever is
       * buffered; once the buffer is empty it stops at end-of-file (PIPE_EOF)
       * or as soon as some data has been read, and otherwise either fails
       * with EAGAIN (fp has FNONBLOCK) or sleeps until woken.
       *
       * Returns 0 on success, including end-of-file, or the error from
       * pipelock(), msleep(), uiomove() or (with CONFIG_MACF)
       * mac_pipe_check_read().  flags and ctx are unused.
       *
       * Two exits: "locked_error" is reached with the PIPE_LOCKFL I/O lock
       * held and releases it; "unlocked_error" is reached without it.  Both
       * run with the pipe mutex held, which is released just before returning.
       */
  648 static int
  649 pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
  650         __unused vfs_context_t ctx)
  651 {
  652         struct pipe *rpipe = (struct pipe *)fp->f_data;
  653         int error;
  654         int nread = 0;
  655         u_int size;
  656
  657         PIPE_LOCK(rpipe);
              /* counted as busy until the matching decrement at unlocked_error */
  658         ++rpipe->pipe_busy;
  659
  660         error = pipelock(rpipe, 1);
  661         if (error)
  662                 goto unlocked_error;
  663
  664 #if CONFIG_MACF
  665         error = mac_pipe_check_read(kauth_cred_get(), rpipe);
  666         if (error)
  667                 goto locked_error;
  668 #endif
  669
  670         while (uio_resid(uio)) {
  671                 /*
  672                  * normal pipe buffer receive
  673                  */
  674                 if (rpipe->pipe_buffer.cnt > 0) {
                              /* largest contiguous run: up to the end of the buffer, the bytes available, and the room left in uio */
  675                         size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
  676                         if (size > rpipe->pipe_buffer.cnt)
  677                                 size = rpipe->pipe_buffer.cnt;
  678                         // LP64todo - fix this!
  679                         if (size > (u_int) uio_resid(uio))
  680                                 size = (u_int) uio_resid(uio);
  681
                              /* the mutex is dropped across the copy; the PIPE_LOCKFL flag taken by pipelock() stays set meanwhile */
  682                         PIPE_UNLOCK(rpipe);
  683                         error = uiomove(
  684                             &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
  685                             size, uio);
  686                         PIPE_LOCK(rpipe);
  687                         if (error)
  688                                 break;
  689
                              /* advance the read index, wrapping to 0 at the end of the buffer */
  690                         rpipe->pipe_buffer.out += size;
  691                         if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
  692                                 rpipe->pipe_buffer.out = 0;
  693
  694                         rpipe->pipe_buffer.cnt -= size;
  695
  696                         /*
  697                          * If there is no more to read in the pipe, reset
  698                          * its pointers to the beginning.  This improves
  699                          * cache hit stats.
  700                          */
  701                         if (rpipe->pipe_buffer.cnt == 0) {
  702                                 rpipe->pipe_buffer.in = 0;
  703                                 rpipe->pipe_buffer.out = 0;
  704                         }
  705                         nread += size;
  706 #ifndef PIPE_NODIRECT
  707                 /*
  708                  * Direct copy, bypassing a kernel buffer.
  709                  */
  710                 } else if ((size = rpipe->pipe_map.cnt) &&
  711                            (rpipe->pipe_state & PIPE_DIRECTW)) {
  712                         caddr_t va;
  713                         // LP64todo - fix this!
  714                         if (size > (u_int) uio_resid(uio))
  715                                 size = (u_int) uio_resid(uio);
  716
  717                         va = (caddr_t) rpipe->pipe_map.kva +
  718                             rpipe->pipe_map.pos;
  719                         PIPE_UNLOCK(rpipe);
  720                         error = uiomove(va, size, uio);
  721                         PIPE_LOCK(rpipe);
  722                         if (error)
  723                                 break;
  724                         nread += size;
  725                         rpipe->pipe_map.pos += size;
  726                         rpipe->pipe_map.cnt -= size;
  727                         if (rpipe->pipe_map.cnt == 0) {
  728                                 rpipe->pipe_state &= ~PIPE_DIRECTW;
  729                                 wakeup(rpipe);
  730                         }
  731 #endif
  732                 } else {
  733                         /*
  734                          * detect EOF condition
  735                          * read returns 0 on EOF, no need to set error
  736                          */
  737                         if (rpipe->pipe_state & PIPE_EOF)
  738                                 break;
  739
  740                         /*
  741                          * If the "write-side" has been blocked, wake it up now.
  742                          */
  743                         if (rpipe->pipe_state & PIPE_WANTW) {
  744                                 rpipe->pipe_state &= ~PIPE_WANTW;
  745                                 wakeup(rpipe);
  746                         }
  747
  748                         /*
  749                          * Break if some data was read.
  750                          */
  751                         if (nread > 0)
  752                                 break;
  753
  754                         /*
  755                          * Unlock the pipe buffer for our remaining processing.
  756                          * We will either break out with an error or we will
  757                          * sleep and relock to loop.
  758                          */
  759                         pipeunlock(rpipe);
  760
  761                         /*
  762                          * Handle non-blocking mode operation or
  763                          * wait for more data.
  764                          */
  765                         if (fp->f_flag & FNONBLOCK) {
  766                                 error = EAGAIN;
  767                         } else {
  768                                 rpipe->pipe_state |= PIPE_WANTR;
  769
  770                                 error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0);
  771
                                      /* woken without error: retake the I/O lock before looping */
  772                                 if (error == 0)
  773                                         error = pipelock(rpipe, 1);
  774                         }
                              /* the I/O lock is not held on this path, hence the jump past pipeunlock() */
  775                         if (error)
  776                                 goto unlocked_error;
  777                 }
  778         }
  779 #if CONFIG_MACF
  780 locked_error:
  781 #endif
  782         pipeunlock(rpipe);
  783
  784 unlocked_error:
  785         --rpipe->pipe_busy;
  786
  787         /*
  788          * PIPE_WANT processing only makes sense if pipe_busy is 0.
  789          */
  790         if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
  791                 rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
  792                 wakeup(rpipe);
  793         } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
  794                 /*
  795                  * Handle write blocking hysteresis.
  796                  */
  797                 if (rpipe->pipe_state & PIPE_WANTW) {
  798                         rpipe->pipe_state &= ~PIPE_WANTW;
  799                         wakeup(rpipe);
  800                 }
  801         }
  802
              /* once at least PIPE_BUF bytes are free, notify select/kqueue waiters and the peer's async owner */
  803         if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
  804                 pipeselwakeup(rpipe, rpipe->pipe_peer);
  805
  806         /* update last read time */
              /* (this runs on every exit path, including the error ones) */
  807         pipe_touch(rpipe, PIPE_ATIME);
  808
  809         PIPE_UNLOCK(rpipe);
  810
  811         return (error);
  812 }
 813
 814
 815
 816 #ifndef PIPE_NODIRECT
 817 /*
 818  * Map the sending processes' buffer into kernel space and wire it.
 819  * This is similar to a physical write operation.
 820  */
 821 static int
 822 pipe_build_write_buffer(wpipe, uio)
 823         struct pipe *wpipe;
 824         struct uio *uio;
 825 {
 826         pmap_t pmap;
 827         u_int size;
 828         int i, j;
 829         vm_offset_t addr, endaddr;
 830
 831
 832         size = (u_int) uio->uio_iov->iov_len;
 833         if (size > wpipe->pipe_buffer.size)
 834                 size = wpipe->pipe_buffer.size;
 835
 836         pmap = vmspace_pmap(curproc->p_vmspace);
 837         endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
 838         addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
 839         for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
 840                 /*
 841                  * vm_fault_quick() can sleep.  Consequently,
 842                  * vm_page_lock_queue() and vm_page_unlock_queue()
 843                  * should not be performed outside of this loop.
 844                  */
 845         race:
 846                 if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
 847                         vm_page_lock_queues();
 848                         for (j = 0; j < i; j++)
 849                                 vm_page_unhold(wpipe->pipe_map.ms[j]);
 850                         vm_page_unlock_queues();
 851                         return (EFAULT);
 852                 }
 853                 wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
 854                     VM_PROT_READ);
 855                 if (wpipe->pipe_map.ms[i] == NULL)
 856                         goto race;
 857         }
 858
 859 /*
 860  * set up the control block
 861  */
 862         wpipe->pipe_map.npages = i;
 863         wpipe->pipe_map.pos =
 864             ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
 865         wpipe->pipe_map.cnt = size;
 866
 867 /*
 868  * and map the buffer
 869  */
 870         if (wpipe->pipe_map.kva == 0) {
 871                 /*
 872                  * We need to allocate space for an extra page because the
 873                  * address range might (will) span pages at times.
 874                  */
 875                 wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
 876                         wpipe->pipe_buffer.size + PAGE_SIZE);
 877                 atomic_add_int(&amountpipekvawired,
 878                     wpipe->pipe_buffer.size + PAGE_SIZE);
 879         }
 880         pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
 881                 wpipe->pipe_map.npages);
 882
 883 /*
 884  * and update the uio data
 885  */
 886
 887         uio->uio_iov->iov_len -= size;
 888         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
 889         if (uio->uio_iov->iov_len == 0)
 890                 uio->uio_iov++;
 891         uio_setresid(uio, (uio_resid(uio) - size));
 892         uio->uio_offset += size;
 893         return (0);
 894 }
 895
 896 /*
 897  * unmap and unwire the process buffer
 898  */
 899 static void
 900 pipe_destroy_write_buffer(wpipe)
 901         struct pipe *wpipe;
 902 {
 903         int i;
 904
 905         if (wpipe->pipe_map.kva) {
 906                 pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
 907
 908                 if (amountpipekvawired > maxpipekvawired / 2) {
 909                         /* Conserve address space */
 910                         vm_offset_t kva = wpipe->pipe_map.kva;
 911                         wpipe->pipe_map.kva = 0;
 912                         kmem_free(kernel_map, kva,
 913                             wpipe->pipe_buffer.size + PAGE_SIZE);
 914                         atomic_subtract_int(&amountpipekvawired,
 915                             wpipe->pipe_buffer.size + PAGE_SIZE);
 916                 }
 917         }
 918         vm_page_lock_queues();
 919         for (i = 0; i < wpipe->pipe_map.npages; i++) {
 920                 vm_page_unhold(wpipe->pipe_map.ms[i]);
 921         }
 922         vm_page_unlock_queues();
 923         wpipe->pipe_map.npages = 0;
 924 }
 925
 926 /*
 927  * In the case of a signal, the writing process might go away.  This
 928  * code copies the data into the circular buffer so that the source
 929  * pages can be freed without loss of data.
 930  */
 931 static void
 932 pipe_clone_write_buffer(wpipe)
 933         struct pipe *wpipe;
 934 {
 935         int size;
 936         int pos;
 937
 938         size = wpipe->pipe_map.cnt;
 939         pos = wpipe->pipe_map.pos;
 940
 941         wpipe->pipe_buffer.in = size;
 942         wpipe->pipe_buffer.out = 0;
 943         wpipe->pipe_buffer.cnt = size;
 944         wpipe->pipe_state &= ~PIPE_DIRECTW;
 945
 946         PIPE_UNLOCK(wpipe);
 947         bcopy((caddr_t) wpipe->pipe_map.kva + pos,
 948             wpipe->pipe_buffer.buffer, size);
 949         pipe_destroy_write_buffer(wpipe);
 950         PIPE_LOCK(wpipe);
 951 }
 952
 953 /*
 954  * This implements the pipe buffer write mechanism.  Note that only
 955  * a direct write OR a normal pipe write can be pending at any given time.
 956  * If there are any characters in the pipe buffer, the direct write will
 957  * be deferred until the receiving process grabs all of the bytes from
 958  * the pipe buffer.  Then the direct mapping write is set-up.
 959  */
 960 static int
 961 pipe_direct_write(wpipe, uio)
 962         struct pipe *wpipe;
 963         struct uio *uio;
 964 {
 965         int error;
 966
 967 retry:
 968         while (wpipe->pipe_state & PIPE_DIRECTW) {
 969                 if (wpipe->pipe_state & PIPE_WANTR) {
 970                         wpipe->pipe_state &= ~PIPE_WANTR;
 971                         wakeup(wpipe);
 972                 }
 973                 wpipe->pipe_state |= PIPE_WANTW;
 974                 error = msleep(wpipe, PIPE_MTX(wpipe),
 975                     PRIBIO | PCATCH, "pipdww", 0);
 976                 if (error)
 977                         goto error1;
 978                 if (wpipe->pipe_state & PIPE_EOF) {
 979                         error = EPIPE;
 980                         goto error1;
 981                 }
 982         }
 983         wpipe->pipe_map.cnt = 0;        /* transfer not ready yet */
 984         if (wpipe->pipe_buffer.cnt > 0) {
 985                 if (wpipe->pipe_state & PIPE_WANTR) {
 986                         wpipe->pipe_state &= ~PIPE_WANTR;
 987                         wakeup(wpipe);
 988                 }
 989
 990                 wpipe->pipe_state |= PIPE_WANTW;
 991                 error = msleep(wpipe, PIPE_MTX(wpipe),
 992                     PRIBIO | PCATCH, "pipdwc", 0);
 993                 if (error)
 994                         goto error1;
 995                 if (wpipe->pipe_state & PIPE_EOF) {
 996                         error = EPIPE;
 997                         goto error1;
 998                 }
 999                 goto retry;
1000         }
1001
1002         wpipe->pipe_state |= PIPE_DIRECTW;
1003
1004         pipelock(wpipe, 0);
1005         PIPE_UNLOCK(wpipe);
1006         error = pipe_build_write_buffer(wpipe, uio);
1007         PIPE_LOCK(wpipe);
1008         pipeunlock(wpipe);
1009         if (error) {
1010                 wpipe->pipe_state &= ~PIPE_DIRECTW;
1011                 goto error1;
1012         }
1013
1014         error = 0;
1015         while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
1016                 if (wpipe->pipe_state & PIPE_EOF) {
1017                         pipelock(wpipe, 0);
1018                         PIPE_UNLOCK(wpipe);
1019                         pipe_destroy_write_buffer(wpipe);
1020                         PIPE_LOCK(wpipe);
1021                         pipeselwakeup(wpipe, wpipe);
1022                         pipeunlock(wpipe);
1023                         error = EPIPE;
1024                         goto error1;
1025                 }
1026                 if (wpipe->pipe_state & PIPE_WANTR) {
1027                         wpipe->pipe_state &= ~PIPE_WANTR;
1028                         wakeup(wpipe);
1029                 }
1030                 pipeselwakeup(wpipe, wpipe);
1031                 error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
1032                     "pipdwt", 0);
1033         }
1034
1035         pipelock(wpipe,0);
1036         if (wpipe->pipe_state & PIPE_DIRECTW) {
1037                 /*
1038                  * this bit of trickery substitutes a kernel buffer for
1039                  * the process that might be going away.
1040                  */
1041                 pipe_clone_write_buffer(wpipe);
1042         } else {
1043                 PIPE_UNLOCK(wpipe);
1044                 pipe_destroy_write_buffer(wpipe);
1045                 PIPE_LOCK(wpipe);
1046         }
1047         pipeunlock(wpipe);
1048         return (error);
1049
1050 error1:
1051         wakeup(wpipe);
1052         return (error);
1053 }
1054 #endif
1055
1056
1057
1058 static int
1059 pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
1060         __unused vfs_context_t ctx)
1061 {
1062         int error = 0;
1063         int orig_resid;
1064         int pipe_size;
1065         struct pipe *wpipe, *rpipe;
1066
1067         rpipe = (struct pipe *)fp->f_data;
1068
1069         PIPE_LOCK(rpipe);
1070         wpipe = rpipe->pipe_peer;
1071
1072         /*
1073          * detect loss of pipe read side, issue SIGPIPE if lost.
1074          */
1075         if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) {
1076                 PIPE_UNLOCK(rpipe);
1077                 return (EPIPE);
1078         }
1079 #if CONFIG_MACF
1080         error = mac_pipe_check_write(kauth_cred_get(), wpipe);
1081         if (error) {
1082                 PIPE_UNLOCK(rpipe);
1083                 return (error);
1084         }
1085 #endif
1086         ++wpipe->pipe_busy;
1087
1088         pipe_size = 0;
1089
1090         if (wpipe->pipe_buffer.buffer == 0) {
1091                 /*
1092                  * need to allocate some storage... we delay the allocation
1093                  * until the first write on fd[0] to avoid allocating storage for both
1094                  * 'pipe ends'... most pipes are half-duplex with the writes targeting
1095                  * fd[1], so allocating space for both ends is a waste...
1096                  *
1097                  * Reduce to 1/4th pipe size if we're over our global max.
1098                  */
1099                 if (amountpipekva > maxpipekva / 2)
1100                         pipe_size = SMALL_PIPE_SIZE;
1101                 else
1102                         pipe_size = PIPE_SIZE;
1103         }
1104
1105         /*
1106          * If it is advantageous to resize the pipe buffer, do
1107          * so.
1108          */
1109         if ((uio_resid(uio) > PIPE_SIZE) &&
1110                 (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
1111                 (amountpipekva < maxpipekva / 2) &&
1112                 (nbigpipe < LIMITBIGPIPES) &&
1113 #ifndef PIPE_NODIRECT
1114                 (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1115 #endif
1116                 (wpipe->pipe_buffer.cnt == 0)) {
1117
1118                 pipe_size = BIG_PIPE_SIZE;
1119
1120         }
1121         if (pipe_size) {
1122                 /*
1123                  * need to do initial allocation or resizing of pipe
1124                  */
1125                 if ((error = pipelock(wpipe, 1)) == 0) {
1126                         PIPE_UNLOCK(wpipe);
1127                         if (pipespace(wpipe, pipe_size) == 0)
1128                                 OSAddAtomic(1, (SInt32 *)&nbigpipe);
1129                         PIPE_LOCK(wpipe);
1130                         pipeunlock(wpipe);
1131
1132                         if (wpipe->pipe_buffer.buffer == 0) {
1133                                 /*
1134                                  * initial allocation failed
1135                                  */
1136                                 error = ENOMEM;
1137                         }
1138                 }
1139                 if (error) {
1140                         /*
1141                          * If an error occurred unbusy and return, waking up any pending
1142                          * readers.
1143                          */
1144                         --wpipe->pipe_busy;
1145                         if ((wpipe->pipe_busy == 0) &&
1146                             (wpipe->pipe_state & PIPE_WANT)) {
1147                                 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1148                                 wakeup(wpipe);
1149                         }
1150                         PIPE_UNLOCK(rpipe);
1151                         return(error);
1152                 }
1153         }
1154         // LP64todo - fix this!
1155         orig_resid = uio_resid(uio);
1156
1157         while (uio_resid(uio)) {
1158                 int space;
1159
1160 #ifndef PIPE_NODIRECT
1161                 /*
1162                  * If the transfer is large, we can gain performance if
1163                  * we do process-to-process copies directly.
1164                  * If the write is non-blocking, we don't use the
1165                  * direct write mechanism.
1166                  *
1167                  * The direct write mechanism will detect the reader going
1168                  * away on us.
1169                  */
1170                 if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1171                     (fp->f_flag & FNONBLOCK) == 0 &&
1172                     amountpipekvawired + uio->uio_resid < maxpipekvawired) {
1173                         error = pipe_direct_write(wpipe, uio);
1174                         if (error)
1175                                 break;
1176                         continue;
1177                 }
1178
1179                 /*
1180                  * Pipe buffered writes cannot be coincidental with
1181                  * direct writes.  We wait until the currently executing
1182                  * direct write is completed before we start filling the
1183                  * pipe buffer.  We break out if a signal occurs or the
1184                  * reader goes away.
1185                  */
1186         retrywrite:
1187                 while (wpipe->pipe_state & PIPE_DIRECTW) {
1188                         if (wpipe->pipe_state & PIPE_WANTR) {
1189                                 wpipe->pipe_state &= ~PIPE_WANTR;
1190                                 wakeup(wpipe);
1191                         }
1192                         error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0);
1193
1194                         if (wpipe->pipe_state & PIPE_EOF)
1195                                 break;
1196                         if (error)
1197                                 break;
1198                 }
1199 #else
1200         retrywrite:
1201 #endif
1202                 space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1203
1204                 /*
1205                  * Writes of size <= PIPE_BUF must be atomic.
1206                  */
1207                 if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF))
1208                         space = 0;
1209
1210                 if (space > 0) {
1211
1212                         if ((error = pipelock(wpipe,1)) == 0) {
1213                                 int size;       /* Transfer size */
1214                                 int segsize;    /* first segment to transfer */
1215
1216                                 if (wpipe->pipe_state & PIPE_EOF) {
1217                                         pipeunlock(wpipe);
1218                                         error = EPIPE;
1219                                         break;
1220                                 }
1221 #ifndef PIPE_NODIRECT
1222                                 /*
1223                                  * It is possible for a direct write to
1224                                  * slip in on us... handle it here...
1225                                  */
1226                                 if (wpipe->pipe_state & PIPE_DIRECTW) {
1227                                         pipeunlock(wpipe);
1228                                         goto retrywrite;
1229                                 }
1230 #endif
1231                                 /*
1232                                  * If a process blocked in pipelock, our
1233                                  * value for space might be bad... the mutex
1234                                  * is dropped while we're blocked
1235                                  */
1236                                 if (space > (int)(wpipe->pipe_buffer.size -
1237                                     wpipe->pipe_buffer.cnt)) {
1238                                         pipeunlock(wpipe);
1239                                         goto retrywrite;
1240                                 }
1241
1242                                 /*
1243                                  * Transfer size is minimum of uio transfer
1244                                  * and free space in pipe buffer.
1245                                  */
1246                                 // LP64todo - fix this!
1247                                 if (space > uio_resid(uio))
1248                                         size = uio_resid(uio);
1249                                 else
1250                                         size = space;
1251                                 /*
1252                                  * First segment to transfer is minimum of
1253                                  * transfer size and contiguous space in
1254                                  * pipe buffer.  If first segment to transfer
1255                                  * is less than the transfer size, we've got
1256                                  * a wraparound in the buffer.
1257                                  */
1258                                 segsize = wpipe->pipe_buffer.size -
1259                                         wpipe->pipe_buffer.in;
1260                                 if (segsize > size)
1261                                         segsize = size;
1262
1263                                 /* Transfer first segment */
1264
1265                                 PIPE_UNLOCK(rpipe);
1266                                 error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1267                                                 segsize, uio);
1268                                 PIPE_LOCK(rpipe);
1269
1270                                 if (error == 0 && segsize < size) {
1271                                         /*
1272                                          * Transfer remaining part now, to
1273                                          * support atomic writes.  Wraparound
1274                                          * happened.
1275                                          */
1276                                         if (wpipe->pipe_buffer.in + segsize !=
1277                                             wpipe->pipe_buffer.size)
1278                                                 panic("Expected pipe buffer "
1279                                                     "wraparound disappeared");
1280
1281                                         PIPE_UNLOCK(rpipe);
1282                                         error = uiomove(
1283                                             &wpipe->pipe_buffer.buffer[0],
1284                                             size - segsize, uio);
1285                                         PIPE_LOCK(rpipe);
1286                                 }
1287                                 if (error == 0) {
1288                                         wpipe->pipe_buffer.in += size;
1289                                         if (wpipe->pipe_buffer.in >=
1290                                             wpipe->pipe_buffer.size) {
1291                                                 if (wpipe->pipe_buffer.in !=
1292                                                     size - segsize +
1293                                                     wpipe->pipe_buffer.size)
1294                                                         panic("Expected "
1295                                                             "wraparound bad");
1296                                                 wpipe->pipe_buffer.in = size -
1297                                                     segsize;
1298                                         }
1299
1300                                         wpipe->pipe_buffer.cnt += size;
1301                                         if (wpipe->pipe_buffer.cnt >
1302                                             wpipe->pipe_buffer.size)
1303                                                 panic("Pipe buffer overflow");
1304
1305                                 }
1306                                 pipeunlock(wpipe);
1307                         }
1308                         if (error)
1309                                 break;
1310
1311                 } else {
1312                         /*
1313                          * If the "read-side" has been blocked, wake it up now.
1314                          */
1315                         if (wpipe->pipe_state & PIPE_WANTR) {
1316                                 wpipe->pipe_state &= ~PIPE_WANTR;
1317                                 wakeup(wpipe);
1318                         }
1319                         /*
1320                          * don't block on non-blocking I/O
1321                          * we'll do the pipeselwakeup on the way out
1322                          */
1323                         if (fp->f_flag & FNONBLOCK) {
1324                                 error = EAGAIN;
1325                                 break;
1326                         }
1327                         /*
1328                          * We have no more space and have something to offer,
1329                          * wake up select/poll.
1330                          */
1331                         pipeselwakeup(wpipe, wpipe);
1332
1333                         wpipe->pipe_state |= PIPE_WANTW;
1334
1335                         error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0);
1336
1337                         if (error != 0)
1338                                 break;
1339                         /*
1340                          * If read side wants to go away, we just issue a signal
1341                          * to ourselves.
1342                          */
1343                         if (wpipe->pipe_state & PIPE_EOF) {
1344                                 error = EPIPE;
1345                                 break;
1346                         }
1347                 }
1348         }
1349         --wpipe->pipe_busy;
1350
1351         if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1352                 wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1353                 wakeup(wpipe);
1354         }
1355         if (wpipe->pipe_buffer.cnt > 0) {
1356                 /*
1357                  * If there are any characters in the buffer, we wake up
1358                  * the reader if it was blocked waiting for data.
1359                  */
1360                 if (wpipe->pipe_state & PIPE_WANTR) {
1361                         wpipe->pipe_state &= ~PIPE_WANTR;
1362                         wakeup(wpipe);
1363                 }
1364                 /*
1365                  * wake up thread blocked in select/poll or post the notification
1366                  */
1367                 pipeselwakeup(wpipe, wpipe);
1368         }
1369
1370         /* Update modification, status change (# of bytes in pipe) times */
1371         pipe_touch(rpipe, PIPE_MTIME | PIPE_CTIME);
1372         pipe_touch(wpipe, PIPE_MTIME | PIPE_CTIME);
1373         PIPE_UNLOCK(rpipe);
1374
1375         return (error);
1376 }
1377
1378 /*
1379  * we implement a very minimal set of ioctls for compatibility with sockets.
1380  */
1381 /* ARGSUSED 3 */
1382 static int
1383 pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
1384         __unused vfs_context_t ctx)
1385 {
1386         struct pipe *mpipe = (struct pipe *)fp->f_data;
1387 #if CONFIG_MACF
1388         int error;
1389 #endif
1390
1391         PIPE_LOCK(mpipe);
1392
1393 #if CONFIG_MACF
1394         error = mac_pipe_check_ioctl(kauth_cred_get(), mpipe, cmd);
1395         if (error) {
1396                 PIPE_UNLOCK(mpipe);
1397
1398                 return (error);
1399         }
1400 #endif
1401
1402         switch (cmd) {
1403
1404         case FIONBIO:
1405                 PIPE_UNLOCK(mpipe);
1406                 return (0);
1407
1408         case FIOASYNC:
1409                 if (*(int *)data) {
1410                         mpipe->pipe_state |= PIPE_ASYNC;
1411                 } else {
1412                         mpipe->pipe_state &= ~PIPE_ASYNC;
1413                 }
1414                 PIPE_UNLOCK(mpipe);
1415                 return (0);
1416
1417         case FIONREAD:
1418 #ifndef PIPE_NODIRECT
1419                 if (mpipe->pipe_state & PIPE_DIRECTW)
1420                         *(int *)data = mpipe->pipe_map.cnt;
1421                 else
1422 #endif
1423                         *(int *)data = mpipe->pipe_buffer.cnt;
1424                 PIPE_UNLOCK(mpipe);
1425                 return (0);
1426
1427         case TIOCSPGRP:
1428                 mpipe->pipe_pgid = *(int *)data;
1429
1430                 PIPE_UNLOCK(mpipe);
1431                 return (0);
1432
1433         case TIOCGPGRP:
1434                 *(int *)data = mpipe->pipe_pgid;
1435
1436                 PIPE_UNLOCK(mpipe);
1437                 return (0);
1438
1439         }
1440         PIPE_UNLOCK(mpipe);
1441         return (ENOTTY);
1442 }
1443
1444
1445 static int
1446 pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
1447 {
1448         struct pipe *rpipe = (struct pipe *)fp->f_data;
1449         struct pipe *wpipe;
1450         int    retnum = 0;
1451
1452         if (rpipe == NULL || rpipe == (struct pipe *)-1)
1453                 return (retnum);
1454
1455         PIPE_LOCK(rpipe);
1456
1457         wpipe = rpipe->pipe_peer;
1458
1459 #if CONFIG_MACF
1460         /*
1461          * XXX We should use a per thread credential here; minimally, the
1462          * XXX process credential should have a persistent reference on it
1463          * XXX before being passed in here.
1464          */
1465         if (mac_pipe_check_select(vfs_context_ucred(ctx), rpipe, which)) {
1466                 PIPE_UNLOCK(rpipe);
1467                 return (0);
1468         }
1469 #endif
1470         switch (which) {
1471
1472         case FREAD:
1473                 if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1474                     (rpipe->pipe_buffer.cnt > 0) ||
1475                     (rpipe->pipe_state & PIPE_EOF)) {
1476
1477                         retnum = 1;
1478                 } else {
1479                         rpipe->pipe_state |= PIPE_SEL;
1480                         selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1481                 }
1482                 break;
1483
1484         case FWRITE:
1485                 if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
1486                     (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1487                      (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
1488
1489                         retnum = 1;
1490                 } else {
1491                         wpipe->pipe_state |= PIPE_SEL;
1492                         selrecord(vfs_context_proc(ctx), &wpipe->pipe_sel, wql);
1493                 }
1494                 break;
1495         case 0:
1496                 rpipe->pipe_state |= PIPE_SEL;
1497                 selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
1498                 break;
1499         }
1500         PIPE_UNLOCK(rpipe);
1501
1502         return (retnum);
1503 }
1504
1505
1506 /* ARGSUSED 1 */
1507 static int
1508 pipe_close(struct fileglob *fg, __unused vfs_context_t ctx)
1509 {
1510         struct pipe *cpipe;
1511
1512         proc_fdlock_spin(vfs_context_proc(ctx));
1513         cpipe = (struct pipe *)fg->fg_data;
1514         fg->fg_data = NULL;
1515         proc_fdunlock(vfs_context_proc(ctx));
1516
1517         if (cpipe)
1518                 pipeclose(cpipe);
1519
1520         return (0);
1521 }
1522
1523 static void
1524 pipe_free_kmem(struct pipe *cpipe)
1525 {
1526
1527         if (cpipe->pipe_buffer.buffer != NULL) {
1528                 if (cpipe->pipe_buffer.size > PIPE_SIZE)
1529                         OSAddAtomic(-1, (SInt32 *)&nbigpipe);
1530                 OSAddAtomic(-(cpipe->pipe_buffer.size), (SInt32 *)&amountpipekva);
1531                 OSAddAtomic(-1, (SInt32 *)&amountpipes);
1532
1533                 kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer,
1534                           cpipe->pipe_buffer.size);
1535                 cpipe->pipe_buffer.buffer = NULL;
1536         }
1537 #ifndef PIPE_NODIRECT
1538         if (cpipe->pipe_map.kva != 0) {
1539                 atomic_subtract_int(&amountpipekvawired,
1540                     cpipe->pipe_buffer.size + PAGE_SIZE);
1541                 kmem_free(kernel_map,
1542                         cpipe->pipe_map.kva,
1543                         cpipe->pipe_buffer.size + PAGE_SIZE);
1544                 cpipe->pipe_map.cnt = 0;
1545                 cpipe->pipe_map.kva = 0;
1546                 cpipe->pipe_map.pos = 0;
1547                 cpipe->pipe_map.npages = 0;
1548         }
1549 #endif
1550 }
1551
1552 /*
1553  * shutdown the pipe
1554  */
1555 static void
1556 pipeclose(struct pipe *cpipe)
1557 {
1558         struct pipe *ppipe;
1559
1560         if (cpipe == NULL)
1561                 return;
1562
1563         /* partially created pipes won't have a valid mutex. */
1564         if (PIPE_MTX(cpipe) != NULL)
1565                 PIPE_LOCK(cpipe);
1566
1567
1568         /*
1569          * If the other side is blocked, wake it up saying that
1570          * we want to close it down.
1571          */
1572         cpipe->pipe_state |= PIPE_EOF;
1573         pipeselwakeup(cpipe, cpipe);
1574
1575         while (cpipe->pipe_busy) {
1576                 cpipe->pipe_state |= PIPE_WANT;
1577
1578                 wakeup(cpipe);
1579                 msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1580         }
1581
1582 #if CONFIG_MACF
1583         /*
1584          * Free the shared pipe label only after the two ends are disconnected.
1585          */
1586         if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1587                 mac_pipe_label_destroy(cpipe);
1588 #endif
1589
1590         /*
1591          * Disconnect from peer
1592          */
1593         if ((ppipe = cpipe->pipe_peer) != NULL) {
1594
1595                 ppipe->pipe_state |= PIPE_EOF;
1596
1597                 pipeselwakeup(ppipe, ppipe);
1598                 wakeup(ppipe);
1599
1600                 if (cpipe->pipe_state & PIPE_KNOTE)
1601                         KNOTE(&ppipe->pipe_sel.si_note, 1);
1602
1603                 postpipeevent(ppipe, EV_RCLOSED);
1604
1605                 ppipe->pipe_peer = NULL;
1606         }
1607         evpipefree(cpipe);
1608
1609         /*
1610          * free resources
1611          */
1612         if (PIPE_MTX(cpipe) != NULL) {
1613                 if (ppipe != NULL) {
1614                         /*
1615                          * since the mutex is shared and the peer is still
1616                          * alive, we need to release the mutex, not free it
1617                          */
1618                         PIPE_UNLOCK(cpipe);
1619                 } else {
1620                         /*
1621                          * peer is gone, so we're the sole party left with
1622                          * interest in this mutex... we can just free it
1623                          */
1624                         lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
1625                 }
1626         }
1627         pipe_free_kmem(cpipe);
1628
1629         zfree(pipe_zone, cpipe);
1630 }
1631
1632 /*ARGSUSED*/
1633 static int
1634 pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
1635 {
1636         struct pipe *cpipe;
1637
1638         cpipe = (struct pipe *)kn->kn_fp->f_data;
1639
1640         PIPE_LOCK(cpipe);
1641 #if CONFIG_MACF
1642         /*
1643          * XXX We should use a per thread credential here; minimally, the
1644          * XXX process credential should have a persistent reference on it
1645          * XXX before being passed in here.
1646          */
1647         if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
1648                 PIPE_UNLOCK(cpipe);
1649                 return (1);
1650         }
1651 #endif
1652
1653         switch (kn->kn_filter) {
1654         case EVFILT_READ:
1655                 kn->kn_fop = &pipe_rfiltops;
1656
1657                 break;
1658         case EVFILT_WRITE:
1659                 kn->kn_fop = &pipe_wfiltops;
1660
1661                 if (cpipe->pipe_peer == NULL) {
1662                         /*
1663                          * other end of pipe has been closed
1664                          */
1665                         PIPE_UNLOCK(cpipe);
1666                         return (EPIPE);
1667                 }
1668                 if (cpipe->pipe_peer)
1669                 cpipe = cpipe->pipe_peer;
1670                 break;
1671         default:
1672                 PIPE_UNLOCK(cpipe);
1673                 return (1);
1674         }
1675
1676         if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn))
1677                 cpipe->pipe_state |= PIPE_KNOTE;
1678
1679         PIPE_UNLOCK(cpipe);
1680         return (0);
1681 }
1682
1683 static void
1684 filt_pipedetach(struct knote *kn)
1685 {
1686         struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1687
1688         PIPE_LOCK(cpipe);
1689
1690         if (kn->kn_filter == EVFILT_WRITE) {
1691                 if (cpipe->pipe_peer == NULL) {
1692                         PIPE_UNLOCK(cpipe);
1693                         return;
1694                 }
1695                 cpipe = cpipe->pipe_peer;
1696         }
1697         if (cpipe->pipe_state & PIPE_KNOTE) {
1698                 if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn))
1699                         cpipe->pipe_state &= ~PIPE_KNOTE;
1700         }
1701         PIPE_UNLOCK(cpipe);
1702 }
1703
1704 /*ARGSUSED*/
1705 static int
1706 filt_piperead(struct knote *kn, long hint)
1707 {
1708         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1709         struct pipe *wpipe;
1710         int    retval;
1711
1712         /*
1713          * if hint == 0, then we've been called from the kevent
1714          * world directly and do not currently hold the pipe mutex...
1715          * if hint == 1, we're being called back via the KNOTE post
1716          * we made in pipeselwakeup, and we already hold the mutex...
1717          */
1718         if (hint == 0)
1719                 PIPE_LOCK(rpipe);
1720
1721         wpipe = rpipe->pipe_peer;
1722         kn->kn_data = rpipe->pipe_buffer.cnt;
1723
1724 #ifndef PIPE_NODIRECT
1725         if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1726                 kn->kn_data = rpipe->pipe_map.cnt;
1727 #endif
1728         if ((rpipe->pipe_state & PIPE_EOF) ||
1729             (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1730                 kn->kn_flags |= EV_EOF;
1731                 retval = 1;
1732         } else {
1733                 retval = (kn->kn_sfflags & NOTE_LOWAT) ?
1734                          (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0);
1735         }
1736
1737         if (hint == 0)
1738                 PIPE_UNLOCK(rpipe);
1739
1740         return (retval);
1741 }
1742
1743 /*ARGSUSED*/
1744 static int
1745 filt_pipewrite(struct knote *kn, long hint)
1746 {
1747         struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
1748         struct pipe *wpipe;
1749
1750         /*
1751          * if hint == 0, then we've been called from the kevent
1752          * world directly and do not currently hold the pipe mutex...
1753          * if hint == 1, we're being called back via the KNOTE post
1754          * we made in pipeselwakeup, and we already hold the mutex...
1755          */
1756         if (hint == 0)
1757                 PIPE_LOCK(rpipe);
1758
1759         wpipe = rpipe->pipe_peer;
1760
1761         if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1762                 kn->kn_data = 0;
1763                 kn->kn_flags |= EV_EOF;
1764
1765                 if (hint == 0)
1766                         PIPE_UNLOCK(rpipe);
1767                 return (1);
1768         }
1769         kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1770         if (!kn->kn_data && wpipe->pipe_buffer.size == 0)
1771                 kn->kn_data = 1; /* unwritten pipe is ready for write */
1772
1773 #ifndef PIPE_NODIRECT
1774         if (wpipe->pipe_state & PIPE_DIRECTW)
1775                 kn->kn_data = 0;
1776 #endif
1777         if (hint == 0)
1778                 PIPE_UNLOCK(rpipe);
1779
1780         return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
1781                                  kn->kn_sdata : PIPE_BUF));
1782 }
1783
1784 int
1785 fill_pipeinfo(struct pipe * cpipe, struct pipe_info * pinfo)
1786 {
1787 #if CONFIG_MACF
1788         int error;
1789 #endif
1790         struct timeval now;
1791         struct vinfo_stat * ub;
1792         int pipe_size = 0;
1793         int pipe_count;
1794
1795         if (cpipe == NULL)
1796                 return (EBADF);
1797         PIPE_LOCK(cpipe);
1798
1799 #if CONFIG_MACF
1800         error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
1801         if (error) {
1802                 PIPE_UNLOCK(cpipe);
1803                 return (error);
1804         }
1805 #endif
1806         if (cpipe->pipe_buffer.buffer == 0) {
1807                 /*
1808                  * must be stat'ing the write fd
1809                  */
1810                 if (cpipe->pipe_peer) {
1811                         /*
1812                          * the peer still exists, use it's info
1813                          */
1814                         pipe_size  = cpipe->pipe_peer->pipe_buffer.size;
1815                         pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
1816                 } else {
1817                         pipe_count = 0;
1818                 }
1819         } else {
1820                 pipe_size  = cpipe->pipe_buffer.size;
1821                 pipe_count = cpipe->pipe_buffer.cnt;
1822         }
1823         /*
1824          * since peer's buffer is setup ouside of lock
1825          * we might catch it in transient state
1826          */
1827         if (pipe_size == 0)
1828                 pipe_size  = PIPE_SIZE;
1829
1830         ub = &pinfo->pipe_stat;
1831
1832         bzero(ub, sizeof(*ub));
1833         ub->vst_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
1834         ub->vst_blksize = pipe_size;
1835         ub->vst_size = pipe_count;
1836         if (ub->vst_blksize != 0)
1837                 ub->vst_blocks = (ub->vst_size + ub->vst_blksize - 1) / ub->vst_blksize;
1838         ub->vst_nlink = 1;
1839
1840         ub->vst_uid = kauth_getuid();
1841         ub->vst_gid = kauth_getgid();
1842
1843         microtime(&now);
1844         ub->vst_atime  = now.tv_sec;
1845         ub->vst_atimensec = now.tv_usec * 1000;
1846
1847         ub->vst_mtime  = now.tv_sec;
1848         ub->vst_mtimensec = now.tv_usec * 1000;
1849
1850         ub->vst_ctime  = now.tv_sec;
1851         ub->vst_ctimensec = now.tv_usec * 1000;
1852
1853         /*
1854          * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid.
1855          * XXX (st_dev, st_ino) should be unique.
1856          */
1857
1858         pinfo->pipe_handle = (uint64_t)((uintptr_t)cpipe);
1859         pinfo->pipe_peerhandle = (uint64_t)((uintptr_t)(cpipe->pipe_peer));
1860         pinfo->pipe_status = cpipe->pipe_state;
1861
1862         PIPE_UNLOCK(cpipe);
1863
1864         return (0);
1865 }