/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */
/*
 * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned back to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process, to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.
 *
 * In order to limit the resource use of pipes, two sysctls exist:
 *
 * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
 * address space available to us in pipe_map.  Whenever the amount in use
 * exceeds half of this value, all new pipes will be created with size
 * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
 * as well.  This value is loader tunable only.
 *
 * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
 * be wired in order to facilitate direct copies using page flipping.
 * Whenever this value is exceeded, pipes will fall back to using regular
 * copies.  This value is sysctl controllable at all times.
 *
 * These values are autotuned in subr_param.c.
 *
 * Memory usage may be monitored through the sysctls
 * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
 *
 */

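/*
 * Illustrative userspace sketch (exposition only, assuming a Mac OS X
 * userland; not compiled into the kernel): reading the monitoring sysctls
 * named above via sysctlbyname(3).  On this build they are only registered
 * when PIPE_SYSCTLS is defined (see below), so each call may fail with
 * ENOENT.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.ipc.pipes", &val, &len, NULL, 0) == 0)
		printf("pipes in use:   %d\n", val);
	len = sizeof(val);
	if (sysctlbyname("kern.ipc.pipekva", &val, &len, NULL, 0) == 0)
		printf("pipe KVA usage: %d\n", val);
	return (0);
}
#endif	/* example only */
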
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/aio_kern.h>
#include <sys/signalvar.h>
#include <sys/pipe.h>
#include <sys/sysproto.h>
#include <sys/proc_info.h>

#include <bsm/audit_kernel.h>

#include <sys/kdebug.h>

#include <kern/zalloc.h>
#include <vm/vm_kern.h>
#include <libkern/OSAtomic.h>

#define f_flag f_fglob->fg_flag
#define f_type f_fglob->fg_type
#define f_msgcount f_fglob->fg_msgcount
#define f_cred f_fglob->fg_cred
#define f_ops f_fglob->fg_ops
#define f_offset f_fglob->fg_offset
#define f_data f_fglob->fg_data
/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 *
 * This needs to be ported to OS X and the performance measured
 * before committing to supporting it.
 */
#define PIPE_NODIRECT 1

#ifndef PIPE_NODIRECT

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>

#endif


/*
 * interfaces to the outside world
 */
static int pipe_read(struct fileproc *fp, struct uio *uio,
		int flags, vfs_context_t ctx);

static int pipe_write(struct fileproc *fp, struct uio *uio,
		int flags, vfs_context_t ctx);

static int pipe_close(struct fileglob *fg, vfs_context_t ctx);

static int pipe_select(struct fileproc *fp, int which, void * wql,
		vfs_context_t ctx);

static int pipe_kqfilter(struct fileproc *fp, struct knote *kn,
		vfs_context_t ctx);

static int pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
		vfs_context_t ctx);


struct fileops pipeops =
	{ pipe_read,
	  pipe_write,
	  pipe_ioctl,
	  pipe_select,
	  pipe_close,
	  pipe_kqfilter,
	  NULL };


static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

static struct filterops pipe_rfiltops =
	{ 1, NULL, filt_pipedetach, filt_piperead };
static struct filterops pipe_wfiltops =
	{ 1, NULL, filt_pipedetach, filt_pipewrite };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipes;
static int amountpipekva;

#ifndef PIPE_NODIRECT
static int amountpipekvawired;
#endif
int maxpipekva = 1024 * 1024 * 16;

#if PIPE_SYSCTLS
SYSCTL_DECL(_kern_ipc);

SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
	   &maxpipekvawired, 0, "Pipe KVA wired limit");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
	   &amountpipes, 0, "Current # of pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
	   &nbigpipe, 0, "Current # of big pipes");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
	   &amountpipekvawired, 0, "Pipe wired KVA usage");
#endif

static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe **cpipep);
static void pipeselwakeup(struct pipe *cpipe, struct pipe *spipe);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);

#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif

extern int postpipeevent(struct pipe *, int);
extern void evpipefree(struct pipe *cpipe);


static int pipespace(struct pipe *cpipe, int size);

static lck_grp_t	*pipe_mtx_grp;
static lck_attr_t	*pipe_mtx_attr;
static lck_grp_attr_t	*pipe_mtx_grp_attr;

static zone_t pipe_zone;

SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);

void
pipeinit(void)
{
	pipe_zone = (zone_t)zinit(sizeof(struct pipe), 8192 * sizeof(struct pipe), 4096, "pipe zone");

	/*
	 * allocate lock group attribute and group for pipe mutexes
	 */
	pipe_mtx_grp_attr = lck_grp_attr_alloc_init();
	pipe_mtx_grp = lck_grp_alloc_init("pipe", pipe_mtx_grp_attr);

	/*
	 * allocate the lock attribute for pipe mutexes
	 */
	pipe_mtx_attr = lck_attr_alloc_init();
}
2d21ac55
A
286/* Bitmap for things to touch in pipe_touch() */
287#define PIPE_ATIME 0x00000001 /* time of last access */
288#define PIPE_MTIME 0x00000002 /* time of last modification */
289#define PIPE_CTIME 0x00000004 /* time of last status change */
290
291static void
292pipe_touch(struct pipe *tpipe, int touch)
293{
294 struct timeval now;
295
296 microtime(&now);
297
298 if (touch & PIPE_ATIME) {
299 tpipe->st_atimespec.tv_sec = now.tv_sec;
300 tpipe->st_atimespec.tv_nsec = now.tv_usec * 1000;
301 }
302
303 if (touch & PIPE_MTIME) {
304 tpipe->st_mtimespec.tv_sec = now.tv_sec;
305 tpipe->st_mtimespec.tv_nsec = now.tv_usec * 1000;
306 }
307
308 if (touch & PIPE_CTIME) {
309 tpipe->st_ctimespec.tv_sec = now.tv_sec;
310 tpipe->st_ctimespec.tv_nsec = now.tv_usec * 1000;
311 }
312}
313

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(proc_t p, __unused struct pipe_args *uap, register_t *retval)
{
	struct fileproc *rf, *wf;
	struct pipe *rpipe, *wpipe;
	lck_mtx_t   *pmtx;
	int fd, error;

	if ((pmtx = lck_mtx_alloc_init(pipe_mtx_grp, pipe_mtx_attr)) == NULL)
		return (ENOMEM);

	rpipe = wpipe = NULL;
	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
		error = ENFILE;
		goto freepipes;
	}
	/*
	 * allocate the space for the normal I/O direction up
	 * front... we'll delay the allocation for the other
	 * direction until a write actually occurs (most
	 * likely it won't)...
	 *
	 * Reduce to 1/4th pipe size if we're over our global max.
	 */
	if (amountpipekva > maxpipekva / 2)
		error = pipespace(rpipe, SMALL_PIPE_SIZE);
	else
		error = pipespace(rpipe, PIPE_SIZE);
	if (error)
		goto freepipes;

#ifndef PIPE_NODIRECT
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe->pipe_state |= PIPE_DIRECTOK;
#endif
	TAILQ_INIT(&rpipe->pipe_evlist);
	TAILQ_INIT(&wpipe->pipe_evlist);

	error = falloc(p, &rf, &fd, vfs_context_current());
	if (error) {
		goto freepipes;
	}
	retval[0] = fd;

	/*
	 * for now we'll create half-duplex
	 * pipes... this is what we've always
	 * supported..
	 */
	rf->f_flag = FREAD;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;

	error = falloc(p, &wf, &fd, vfs_context_current());
	if (error) {
		fp_free(p, retval[0], rf);
		goto freepipes;
	}
	wf->f_flag = FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;

	retval[1] = fd;
#if CONFIG_MACF
	/*
	 * XXXXXXXX SHOULD NOT HOLD FILE_LOCK() XXXXXXXXXXXX
	 *
	 * struct pipe represents a pipe endpoint.  The MAC label is shared
	 * between the connected endpoints.  As a result mac_pipe_label_init() and
	 * mac_pipe_label_associate() should only be called on one of the endpoints
	 * after they have been connected.
	 */
	mac_pipe_label_init(rpipe);
	mac_pipe_label_associate(kauth_cred_get(), rpipe);
	wpipe->pipe_label = rpipe->pipe_label;
#endif
	proc_fdlock_spin(p);
	procfdtbl_releasefd(p, retval[0], NULL);
	procfdtbl_releasefd(p, retval[1], NULL);
	fp_drop(p, retval[0], rf, 1);
	fp_drop(p, retval[1], wf, 1);
	proc_fdunlock(p);


	return (0);

freepipes:
	pipeclose(rpipe);
	pipeclose(wpipe);
	lck_mtx_free(pmtx, pipe_mtx_grp);

	return (error);
}

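/*
 * Illustrative userspace sketch (exposition only, not part of the kernel
 * sources): the half-duplex contract set up above, with fd[0] opened FREAD
 * and fd[1] opened FWRITE.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char buf[32];
	ssize_t n;

	if (pipe(fd) == -1)
		return (1);

	if (fork() == 0) {			/* child writes on fd[1] */
		close(fd[0]);
		write(fd[1], "hello", 5);
		close(fd[1]);
		_exit(0);
	}
	close(fd[1]);				/* parent reads on fd[0] */
	n = read(fd[0], buf, sizeof(buf));
	if (n > 0)
		printf("read %zd bytes: %.*s\n", n, (int)n, buf);
	close(fd[0]);
	wait(NULL);
	return (0);
}
#endif	/* example only */
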
int
pipe_stat(struct pipe *cpipe, void *ub, int isstat64)
{
#if CONFIG_MACF
	int error;
#endif
	int pipe_size = 0;
	int pipe_count;
	struct stat *sb = (struct stat *)0;		/* warning avoidance; protected by isstat64 */
	struct stat64 *sb64 = (struct stat64 *)0;	/* warning avoidance; protected by isstat64 */

	if (cpipe == NULL)
		return (EBADF);
	PIPE_LOCK(cpipe);

#if CONFIG_MACF
	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
	if (error) {
		PIPE_UNLOCK(cpipe);
		return (error);
	}
#endif
	if (cpipe->pipe_buffer.buffer == 0) {
		/*
		 * must be stat'ing the write fd
		 */
		if (cpipe->pipe_peer) {
			/*
			 * the peer still exists, use its info
			 */
			pipe_size  = cpipe->pipe_peer->pipe_buffer.size;
			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
		} else {
			pipe_count = 0;
		}
	} else {
		pipe_size  = cpipe->pipe_buffer.size;
		pipe_count = cpipe->pipe_buffer.cnt;
	}
	/*
	 * since the peer's buffer is set up outside of the lock
	 * we might catch it in a transient state
	 */
	if (pipe_size == 0)
		pipe_size = PIPE_SIZE;

	if (isstat64 != 0) {
		sb64 = (struct stat64 *)ub;

		bzero(sb64, sizeof(*sb64));
		sb64->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
		sb64->st_blksize = pipe_size;
		sb64->st_size = pipe_count;
		sb64->st_blocks = (sb64->st_size + sb64->st_blksize - 1) / sb64->st_blksize;

		sb64->st_uid = kauth_getuid();
		sb64->st_gid = kauth_getgid();

		sb64->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
		sb64->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;

		sb64->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
		sb64->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;

		sb64->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
		sb64->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;

		/*
		 * Return a relatively unique inode number based on the current
		 * address of this pipe's struct pipe.  This number may be recycled
		 * relatively quickly.
		 */
		sb64->st_ino = (ino64_t)((uint32_t)cpipe);
	} else {
		sb = (struct stat *)ub;

		bzero(sb, sizeof(*sb));
		sb->st_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
		sb->st_blksize = pipe_size;
		sb->st_size = pipe_count;
		sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;

		sb->st_uid = kauth_getuid();
		sb->st_gid = kauth_getgid();

		sb->st_atimespec.tv_sec  = cpipe->st_atimespec.tv_sec;
		sb->st_atimespec.tv_nsec = cpipe->st_atimespec.tv_nsec;

		sb->st_mtimespec.tv_sec  = cpipe->st_mtimespec.tv_sec;
		sb->st_mtimespec.tv_nsec = cpipe->st_mtimespec.tv_nsec;

		sb->st_ctimespec.tv_sec  = cpipe->st_ctimespec.tv_sec;
		sb->st_ctimespec.tv_nsec = cpipe->st_ctimespec.tv_nsec;

		/*
		 * Return a relatively unique inode number based on the current
		 * address of this pipe's struct pipe.  This number may be recycled
		 * relatively quickly.
		 */
		sb->st_ino = (ino_t)cpipe;
	}
	PIPE_UNLOCK(cpipe);

	/*
	 * POSIX: Left as 0: st_dev, st_nlink, st_rdev, st_flags, st_gen,
	 * st_uid, st_gid.
	 *
	 * XXX (st_dev) should be unique, but there is no device driver that
	 * XXX is associated with pipes, since they are implemented via a
	 * XXX struct fileops indirection rather than as FS objects.
	 */
	return (0);
}

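/*
 * Illustrative userspace sketch (exposition only): what pipe_stat() above
 * hands back through fstat(2) -- st_size is the byte count currently
 * buffered in the pipe and st_blksize is the pipe buffer size.
 */
#if 0	/* example only */
#include <sys/stat.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	struct stat st;

	if (pipe(fd) == -1)
		return (1);
	write(fd[1], "abc", 3);

	if (fstat(fd[0], &st) == 0)
		printf("pending: %lld bytes, blksize: %d\n",
		    (long long)st.st_size, (int)st.st_blksize);
	return (0);
}
#endif	/* example only */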

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it will retain the old buffer and return ENOMEM.
 */
static int
pipespace(struct pipe *cpipe, int size)
{
	vm_offset_t buffer;

	size = round_page(size);

	if (kmem_alloc(kernel_map, &buffer, size) != KERN_SUCCESS)
		return(ENOMEM);

	/* free old resources if we're resizing */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = (caddr_t)buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;

	OSAddAtomic(1, (SInt32 *)&amountpipes);
	OSAddAtomic(cpipe->pipe_buffer.size, (SInt32 *)&amountpipekva);

	return (0);
}

/*
 * initialize and allocate VM and memory for pipe
 */
static int
pipe_create(struct pipe **cpipep)
{
	struct pipe *cpipe;

	cpipe = (struct pipe *)zalloc(pipe_zone);

	if ((*cpipep = cpipe) == NULL)
		return (ENOMEM);

	/*
	 * protect so pipespace or pipeclose don't follow a junk pointer
	 * if pipespace() fails.
	 */
	bzero(cpipe, sizeof *cpipe);

	/* Initial times are all the time of creation of the pipe */
	pipe_touch(cpipe, PIPE_ATIME | PIPE_MTIME | PIPE_CTIME);

	return (0);
}


/*
 * lock a pipe for I/O, blocking other access
 */
static inline int
pipelock(struct pipe *cpipe, int catch)
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCKFL) {
		cpipe->pipe_state |= PIPE_LWANT;

		error = msleep(cpipe, PIPE_MTX(cpipe), catch ? (PRIBIO | PCATCH) : PRIBIO,
			       "pipelk", 0);
		if (error != 0)
			return (error);
	}
	cpipe->pipe_state |= PIPE_LOCKFL;

	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static inline void
pipeunlock(struct pipe *cpipe)
{
	cpipe->pipe_state &= ~PIPE_LOCKFL;

	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static void
pipeselwakeup(struct pipe *cpipe, struct pipe *spipe)
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if (cpipe->pipe_state & PIPE_KNOTE)
		KNOTE(&cpipe->pipe_sel.si_note, 1);

	postpipeevent(cpipe, EV_RWBYTES);

	if (spipe && (spipe->pipe_state & PIPE_ASYNC) && spipe->pipe_pgid) {
		if (spipe->pipe_pgid < 0)
			gsignal(-spipe->pipe_pgid, SIGIO);
		else
			proc_signal(spipe->pipe_pgid, SIGIO);
	}
}

/* ARGSUSED */
static int
pipe_read(struct fileproc *fp, struct uio *uio, __unused int flags,
	  __unused vfs_context_t ctx)
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	PIPE_LOCK(rpipe);
	++rpipe->pipe_busy;

	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

#if CONFIG_MACF
	error = mac_pipe_check_read(kauth_cred_get(), rpipe);
	if (error)
		goto locked_error;
#endif

	while (uio_resid(uio)) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			// LP64todo - fix this!
			if (size > (u_int) uio_resid(uio))
				size = (u_int) uio_resid(uio);

			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t va;
			// LP64todo - fix this!
			if (size > (u_int) uio_resid(uio))
				size = (u_int) uio_resid(uio);

			va = (caddr_t) rpipe->pipe_map.kva +
			     rpipe->pipe_map.pos;
			PIPE_UNLOCK(rpipe);
			error = uiomove(va, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;

				error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0);

				if (error == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
#if CONFIG_MACF
locked_error:
#endif
	pipeunlock(rpipe);

unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe, rpipe->pipe_peer);

	/* update last read time */
	pipe_touch(rpipe, PIPE_ATIME);

	PIPE_UNLOCK(rpipe);

	return (error);
}

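/*
 * Minimal standalone model (exposition only) of the ring-buffer arithmetic
 * used in pipe_read() above: the copy size is clamped first to the
 * contiguous run ending at the top of the buffer, then to the bytes
 * available, then to the caller's request, and 'out' wraps back to zero
 * when it reaches the buffer size.
 */
#if 0	/* example only */
#include <string.h>

static size_t
ring_read(char *dst, size_t want, const char *buf, size_t size,
	  size_t *out, size_t *cnt)
{
	size_t nread = 0;

	while (want > 0 && *cnt > 0) {
		size_t n = size - *out;		/* contiguous run */
		if (n > *cnt)
			n = *cnt;
		if (n > want)
			n = want;
		memcpy(dst + nread, buf + *out, n);
		*out += n;
		if (*out >= size)		/* wraparound */
			*out = 0;
		*cnt -= n;
		want -= n;
		nread += n;
	}
	return (nread);
}
#endif	/* example only */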


#ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
{
	pmap_t pmap;
	u_int size;
	int i, j;
	vm_offset_t addr, endaddr;


	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	pmap = vmspace_pmap(curproc->p_vmspace);
	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
		/*
		 * vm_fault_quick() can sleep.  Consequently,
		 * vm_page_lock_queue() and vm_page_unlock_queue()
		 * should not be performed outside of this loop.
		 */
race:
		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
			vm_page_lock_queues();
			for (j = 0; j < i; j++)
				vm_page_unhold(wpipe->pipe_map.ms[j]);
			vm_page_unlock_queues();
			return (EFAULT);
		}
		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
							      VM_PROT_READ);
		if (wpipe->pipe_map.ms[i] == NULL)
			goto race;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
		    wpipe->pipe_buffer.size + PAGE_SIZE);
		atomic_add_int(&amountpipekvawired,
		    wpipe->pipe_buffer.size + PAGE_SIZE);
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
	    wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio_setresid(uio, (uio_resid(uio) - size));
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(struct pipe *wpipe)
{
	int i;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekvawired > maxpipekvawired / 2) {
			/* Conserve address space */
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
			    wpipe->pipe_buffer.size + PAGE_SIZE);
			atomic_subtract_int(&amountpipekvawired,
			    wpipe->pipe_buffer.size + PAGE_SIZE);
		}
	}
	vm_page_lock_queues();
	for (i = 0; i < wpipe->pipe_map.npages; i++) {
		vm_page_unhold(wpipe->pipe_map.ms[i]);
	}
	vm_page_unlock_queues();
	wpipe->pipe_map.npages = 0;
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(struct pipe *wpipe)
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	PIPE_UNLOCK(wpipe);
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
	    wpipe->pipe_buffer.buffer, size);
	pipe_destroy_write_buffer(wpipe);
	PIPE_LOCK(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
static int
pipe_direct_write(struct pipe *wpipe, struct uio *uio)
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
			       PRIBIO | PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = msleep(wpipe, PIPE_MTX(wpipe),
			       PRIBIO | PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	pipelock(wpipe, 0);
	PIPE_UNLOCK(wpipe);
	error = pipe_build_write_buffer(wpipe, uio);
	PIPE_LOCK(wpipe);
	pipeunlock(wpipe);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			PIPE_UNLOCK(wpipe);
			pipe_destroy_write_buffer(wpipe);
			PIPE_LOCK(wpipe);
			pipeselwakeup(wpipe, wpipe);
			pipeunlock(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe, wpipe);
		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
			       "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		PIPE_UNLOCK(wpipe);
		pipe_destroy_write_buffer(wpipe);
		PIPE_LOCK(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif



static int
pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags,
	   __unused vfs_context_t ctx)
{
	int error = 0;
	int orig_resid;
	int pipe_size;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *)fp->f_data;

	PIPE_LOCK(rpipe);
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF)) {
		PIPE_UNLOCK(rpipe);
		return (EPIPE);
	}
#if CONFIG_MACF
	error = mac_pipe_check_write(kauth_cred_get(), wpipe);
	if (error) {
		PIPE_UNLOCK(rpipe);
		return (error);
	}
#endif
	++wpipe->pipe_busy;

	pipe_size = 0;

	if (wpipe->pipe_buffer.buffer == 0) {
		/*
		 * need to allocate some storage... we delay the allocation
		 * until the first write on fd[0] to avoid allocating storage for both
		 * 'pipe ends'... most pipes are half-duplex with the writes targeting
		 * fd[1], so allocating space for both ends is a waste...
		 *
		 * Reduce to 1/4th pipe size if we're over our global max.
		 */
		if (amountpipekva > maxpipekva / 2)
			pipe_size = SMALL_PIPE_SIZE;
		else
			pipe_size = PIPE_SIZE;
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do so.
	 */
	if ((uio_resid(uio) > PIPE_SIZE) &&
	    (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
	    (amountpipekva < maxpipekva / 2) &&
	    (nbigpipe < LIMITBIGPIPES) &&
#ifndef PIPE_NODIRECT
	    (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
#endif
	    (wpipe->pipe_buffer.cnt == 0)) {

		pipe_size = BIG_PIPE_SIZE;

	}
	if (pipe_size) {
		/*
		 * need to do initial allocation or resizing of pipe
		 */
		if ((error = pipelock(wpipe, 1)) == 0) {
			PIPE_UNLOCK(wpipe);
			if (pipespace(wpipe, pipe_size) == 0)
				OSAddAtomic(1, (SInt32 *)&nbigpipe);
			PIPE_LOCK(wpipe);
			pipeunlock(wpipe);

			if (wpipe->pipe_buffer.buffer == 0) {
				/*
				 * initial allocation failed
				 */
				error = ENOMEM;
			}
		}
		if (error) {
			/*
			 * If an error occurred, unbusy and return, waking up
			 * any pending readers.
			 */
			--wpipe->pipe_busy;
			if ((wpipe->pipe_busy == 0) &&
			    (wpipe->pipe_state & PIPE_WANT)) {
				wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
				wakeup(wpipe);
			}
			PIPE_UNLOCK(rpipe);
			return(error);
		}
	}
	// LP64todo - fix this!
	orig_resid = uio_resid(uio);

	while (uio_resid(uio)) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
			error = pipe_direct_write(wpipe, uio);
			if (error)
				break;
			continue;
		}

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipbww", 0);

			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
#else
	retrywrite:
#endif
		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/*
		 * Writes of size <= PIPE_BUF must be atomic.
		 */
		if ((space < uio_resid(uio)) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0) {

			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				if (wpipe->pipe_state & PIPE_EOF) {
					pipeunlock(wpipe);
					error = EPIPE;
					break;
				}
#ifndef PIPE_NODIRECT
				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
#endif
				/*
				 * If a process blocked in pipelock, our
				 * value for space might be bad... the mutex
				 * is dropped while we're blocked
				 */
				if (space > (int)(wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt)) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				// LP64todo - fix this!
				if (space > uio_resid(uio))
					size = uio_resid(uio);
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					  wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */

				PIPE_UNLOCK(rpipe);
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);
				PIPE_LOCK(rpipe);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer "
						    "wraparound disappeared");

					PIPE_UNLOCK(rpipe);
					error = uiomove(
					    &wpipe->pipe_buffer.buffer[0],
					    size - segsize, uio);
					PIPE_LOCK(rpipe);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in !=
						    size - segsize +
						    wpipe->pipe_buffer.size)
							panic("Expected "
							    "wraparound bad");
						wpipe->pipe_buffer.in = size -
						    segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt >
					    wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");

				}
				pipeunlock(wpipe);
			}
			if (error)
				break;

		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			/*
			 * don't block on non-blocking I/O
			 * we'll do the pipeselwakeup on the way out
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}
			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe, wpipe);

			wpipe->pipe_state |= PIPE_WANTW;

			error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipewr", 0);

			if (error != 0)
				break;
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}
	--wpipe->pipe_busy;

	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
		wakeup(wpipe);
	}
	if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If there are any characters in the buffer, we wake up
		 * the reader if it was blocked waiting for data.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		/*
		 * wake up thread blocked in select/poll or post the notification
		 */
		pipeselwakeup(wpipe, wpipe);
	}

	/* Update modification, status change (# of bytes in pipe) times */
	pipe_touch(rpipe, PIPE_MTIME | PIPE_CTIME);
	pipe_touch(wpipe, PIPE_MTIME | PIPE_CTIME);
	PIPE_UNLOCK(rpipe);

	return (error);
}

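/*
 * Illustrative userspace sketch (exposition only): the PIPE_BUF atomicity
 * rule enforced above.  Once the free space drops below the request and the
 * request is at most PIPE_BUF, space is treated as zero, so a non-blocking
 * writer sees EAGAIN instead of a short write.
 */
#if 0	/* example only */
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	char chunk[PIPE_BUF];

	if (pipe(fd) == -1)
		return (1);
	fcntl(fd[1], F_SETFL, O_NONBLOCK);
	memset(chunk, 'x', sizeof(chunk));

	for (;;) {
		if (write(fd[1], chunk, sizeof(chunk)) == -1 &&
		    errno == EAGAIN) {
			printf("pipe full, atomic write refused\n");
			break;
		}
	}
	return (0);
}
#endif	/* example only */
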
/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
/* ARGSUSED 3 */
static int
pipe_ioctl(struct fileproc *fp, u_long cmd, caddr_t data,
	   __unused vfs_context_t ctx)
{
	struct pipe *mpipe = (struct pipe *)fp->f_data;
#if CONFIG_MACF
	int error;
#endif

	PIPE_LOCK(mpipe);

#if CONFIG_MACF
	error = mac_pipe_check_ioctl(kauth_cred_get(), mpipe, cmd);
	if (error) {
		PIPE_UNLOCK(mpipe);

		return (error);
	}
#endif

	switch (cmd) {

	case FIONBIO:
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		PIPE_UNLOCK(mpipe);
		return (0);

	case FIONREAD:
#ifndef PIPE_NODIRECT
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
#endif
			*(int *)data = mpipe->pipe_buffer.cnt;
		PIPE_UNLOCK(mpipe);
		return (0);

	case TIOCSPGRP:
		mpipe->pipe_pgid = *(int *)data;

		PIPE_UNLOCK(mpipe);
		return (0);

	case TIOCGPGRP:
		*(int *)data = mpipe->pipe_pgid;

		PIPE_UNLOCK(mpipe);
		return (0);

	}
	PIPE_UNLOCK(mpipe);
	return (ENOTTY);
}

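/*
 * Illustrative userspace sketch (exposition only): FIONREAD, handled above,
 * reports the byte count currently buffered in the pipe.
 */
#if 0	/* example only */
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], nbytes = 0;

	if (pipe(fd) == -1)
		return (1);
	write(fd[1], "hello", 5);

	if (ioctl(fd[0], FIONREAD, &nbytes) == 0)
		printf("%d bytes readable\n", nbytes);
	return (0);
}
#endif	/* example only */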

static int
pipe_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int retnum = 0;

	if (rpipe == NULL || rpipe == (struct pipe *)-1)
		return (retnum);

	PIPE_LOCK(rpipe);

	wpipe = rpipe->pipe_peer;

#if CONFIG_MACF
	/*
	 * XXX We should use a per thread credential here; minimally, the
	 * XXX process credential should have a persistent reference on it
	 * XXX before being passed in here.
	 */
	if (mac_pipe_check_select(vfs_context_ucred(ctx), rpipe, which)) {
		PIPE_UNLOCK(rpipe);
		return (0);
	}
#endif
	switch (which) {

	case FREAD:
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF)) {

			retnum = 1;
		} else {
			rpipe->pipe_state |= PIPE_SEL;
			selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
		}
		break;

	case FWRITE:
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {

			retnum = 1;
		} else {
			wpipe->pipe_state |= PIPE_SEL;
			selrecord(vfs_context_proc(ctx), &wpipe->pipe_sel, wql);
		}
		break;
	case 0:
		rpipe->pipe_state |= PIPE_SEL;
		selrecord(vfs_context_proc(ctx), &rpipe->pipe_sel, wql);
		break;
	}
	PIPE_UNLOCK(rpipe);

	return (retnum);
}

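/*
 * Illustrative userspace sketch (exposition only): per the FREAD case
 * above, the read end selects readable as soon as any data is buffered
 * (or on EOF).
 */
#if 0	/* example only */
#include <sys/select.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	fd_set rfds;
	struct timeval tv = { 0, 0 };

	if (pipe(fd) == -1)
		return (1);
	write(fd[1], "x", 1);

	FD_ZERO(&rfds);
	FD_SET(fd[0], &rfds);
	if (select(fd[0] + 1, &rfds, NULL, NULL, &tv) > 0 &&
	    FD_ISSET(fd[0], &rfds))
		printf("read end is ready\n");
	return (0);
}
#endif	/* example only */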

/* ARGSUSED 1 */
static int
pipe_close(struct fileglob *fg, __unused vfs_context_t ctx)
{
	struct pipe *cpipe;

	proc_fdlock_spin(vfs_context_proc(ctx));
	cpipe = (struct pipe *)fg->fg_data;
	fg->fg_data = NULL;
	proc_fdunlock(vfs_context_proc(ctx));

	if (cpipe)
		pipeclose(cpipe);

	return (0);
}

static void
pipe_free_kmem(struct pipe *cpipe)
{
	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			OSAddAtomic(-1, (SInt32 *)&nbigpipe);
		OSAddAtomic(-(cpipe->pipe_buffer.size), (SInt32 *)&amountpipekva);
		OSAddAtomic(-1, (SInt32 *)&amountpipes);

		kmem_free(kernel_map, (vm_offset_t)cpipe->pipe_buffer.buffer,
			  cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != 0) {
		atomic_subtract_int(&amountpipekvawired,
		    cpipe->pipe_buffer.size + PAGE_SIZE);
		kmem_free(kernel_map,
		    cpipe->pipe_map.kva,
		    cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(struct pipe *cpipe)
{
	struct pipe *ppipe;

	if (cpipe == NULL)
		return;

	/* partially created pipes won't have a valid mutex. */
	if (PIPE_MTX(cpipe) != NULL)
		PIPE_LOCK(cpipe);


	/*
	 * If the other side is blocked, wake it up saying that
	 * we want to close it down.
	 */
	cpipe->pipe_state |= PIPE_EOF;
	pipeselwakeup(cpipe, cpipe);

	while (cpipe->pipe_busy) {
		cpipe->pipe_state |= PIPE_WANT;

		wakeup(cpipe);
		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
	}

#if CONFIG_MACF
	/*
	 * Free the shared pipe label only after the two ends are disconnected.
	 */
	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
		mac_pipe_label_destroy(cpipe);
#endif

	/*
	 * Disconnect from peer
	 */
	if ((ppipe = cpipe->pipe_peer) != NULL) {

		ppipe->pipe_state |= PIPE_EOF;

		pipeselwakeup(ppipe, ppipe);
		wakeup(ppipe);

		if (cpipe->pipe_state & PIPE_KNOTE)
			KNOTE(&ppipe->pipe_sel.si_note, 1);

		postpipeevent(ppipe, EV_RCLOSED);

		ppipe->pipe_peer = NULL;
	}
	evpipefree(cpipe);

	/*
	 * free resources
	 */
	if (PIPE_MTX(cpipe) != NULL) {
		if (ppipe != NULL) {
			/*
			 * since the mutex is shared and the peer is still
			 * alive, we need to release the mutex, not free it
			 */
			PIPE_UNLOCK(cpipe);
		} else {
			/*
			 * peer is gone, so we're the sole party left with
			 * interest in this mutex... we can just free it
			 */
			lck_mtx_free(PIPE_MTX(cpipe), pipe_mtx_grp);
		}
	}
	pipe_free_kmem(cpipe);

	zfree(pipe_zone, cpipe);
}

/*ARGSUSED*/
static int
pipe_kqfilter(__unused struct fileproc *fp, struct knote *kn, __unused vfs_context_t ctx)
{
	struct pipe *cpipe;

	cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);
#if CONFIG_MACF
	/*
	 * XXX We should use a per thread credential here; minimally, the
	 * XXX process credential should have a persistent reference on it
	 * XXX before being passed in here.
	 */
	if (mac_pipe_check_kqfilter(vfs_context_ucred(ctx), kn, cpipe) != 0) {
		PIPE_UNLOCK(cpipe);
		return (1);
	}
#endif

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;

		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;

		if (cpipe->pipe_peer == NULL) {
			/*
			 * other end of pipe has been closed
			 */
			PIPE_UNLOCK(cpipe);
			return (EPIPE);
		}
		if (cpipe->pipe_peer)
			cpipe = cpipe->pipe_peer;
		break;
	default:
		PIPE_UNLOCK(cpipe);
		return (1);
	}

	if (KNOTE_ATTACH(&cpipe->pipe_sel.si_note, kn))
		cpipe->pipe_state |= PIPE_KNOTE;

	PIPE_UNLOCK(cpipe);
	return (0);
}

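/*
 * Illustrative userspace sketch (exposition only): attaching an
 * EVFILT_READ knote to a pipe through kqueue(2); the kn_data computed by
 * filt_piperead() below comes back in kev.data.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], kq;
	struct kevent kev;

	if (pipe(fd) == -1 || (kq = kqueue()) == -1)
		return (1);

	EV_SET(&kev, fd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);

	write(fd[1], "hi", 2);

	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%ld bytes pending\n", (long)kev.data);
	return (0);
}
#endif	/* example only */
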
static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;

	PIPE_LOCK(cpipe);

	if (kn->kn_filter == EVFILT_WRITE) {
		if (cpipe->pipe_peer == NULL) {
			PIPE_UNLOCK(cpipe);
			return;
		}
		cpipe = cpipe->pipe_peer;
	}
	if (cpipe->pipe_state & PIPE_KNOTE) {
		if (KNOTE_DETACH(&cpipe->pipe_sel.si_note, kn))
			cpipe->pipe_state &= ~PIPE_KNOTE;
	}
	PIPE_UNLOCK(cpipe);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;
	int    retval;

	/*
	 * if hint == 0, then we've been called from the kevent
	 * world directly and do not currently hold the pipe mutex...
	 * if hint == 1, we're being called back via the KNOTE post
	 * we made in pipeselwakeup, and we already hold the mutex...
	 */
	if (hint == 0)
		PIPE_LOCK(rpipe);

	wpipe = rpipe->pipe_peer;
	kn->kn_data = rpipe->pipe_buffer.cnt;

#ifndef PIPE_NODIRECT
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;
#endif
	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		retval = 1;
	} else {
		retval = (kn->kn_sfflags & NOTE_LOWAT) ?
			 (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0);
	}

	if (hint == 0)
		PIPE_UNLOCK(rpipe);

	return (retval);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe;

	/*
	 * if hint == 0, then we've been called from the kevent
	 * world directly and do not currently hold the pipe mutex...
	 * if hint == 1, we're being called back via the KNOTE post
	 * we made in pipeselwakeup, and we already hold the mutex...
	 */
	if (hint == 0)
		PIPE_LOCK(rpipe);

	wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;

		if (hint == 0)
			PIPE_UNLOCK(rpipe);
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (!kn->kn_data && wpipe->pipe_buffer.size == 0)
		kn->kn_data = 1;	/* unwritten pipe is ready for write */

#ifndef PIPE_NODIRECT
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;
#endif
	if (hint == 0)
		PIPE_UNLOCK(rpipe);

	return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
				kn->kn_sdata : PIPE_BUF));
}

int
fill_pipeinfo(struct pipe *cpipe, struct pipe_info *pinfo)
{
#if CONFIG_MACF
	int error;
#endif
	struct timeval now;
	struct vinfo_stat *ub;
	int pipe_size = 0;
	int pipe_count;

	if (cpipe == NULL)
		return (EBADF);
	PIPE_LOCK(cpipe);

#if CONFIG_MACF
	error = mac_pipe_check_stat(kauth_cred_get(), cpipe);
	if (error) {
		PIPE_UNLOCK(cpipe);
		return (error);
	}
#endif
	if (cpipe->pipe_buffer.buffer == 0) {
		/*
		 * must be stat'ing the write fd
		 */
		if (cpipe->pipe_peer) {
			/*
			 * the peer still exists, use its info
			 */
			pipe_size  = cpipe->pipe_peer->pipe_buffer.size;
			pipe_count = cpipe->pipe_peer->pipe_buffer.cnt;
		} else {
			pipe_count = 0;
		}
	} else {
		pipe_size  = cpipe->pipe_buffer.size;
		pipe_count = cpipe->pipe_buffer.cnt;
	}
	/*
	 * since the peer's buffer is set up outside of the lock
	 * we might catch it in a transient state
	 */
	if (pipe_size == 0)
		pipe_size = PIPE_SIZE;

	ub = &pinfo->pipe_stat;

	bzero(ub, sizeof(*ub));
	ub->vst_mode = S_IFIFO | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
	ub->vst_blksize = pipe_size;
	ub->vst_size = pipe_count;
	if (ub->vst_blksize != 0)
		ub->vst_blocks = (ub->vst_size + ub->vst_blksize - 1) / ub->vst_blksize;
	ub->vst_nlink = 1;

	ub->vst_uid = kauth_getuid();
	ub->vst_gid = kauth_getgid();

	microtime(&now);
	ub->vst_atime      = now.tv_sec;
	ub->vst_atimensec  = now.tv_usec * 1000;

	ub->vst_mtime      = now.tv_sec;
	ub->vst_mtimensec  = now.tv_usec * 1000;

	ub->vst_ctime      = now.tv_sec;
	ub->vst_ctimensec  = now.tv_usec * 1000;

	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen, st_uid, st_gid.
	 * XXX (st_dev, st_ino) should be unique.
	 */

	pinfo->pipe_handle = (uint64_t)((uintptr_t)cpipe);
	pinfo->pipe_peerhandle = (uint64_t)((uintptr_t)(cpipe->pipe_peer));
	pinfo->pipe_status = cpipe->pipe_state;

	PIPE_UNLOCK(cpipe);

	return (0);
}
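
/*
 * Illustrative userspace sketch (exposition only; assumes the libproc
 * proc_pidfdinfo() interface and the PROC_PIDFDPIPEINFO flavor from
 * <sys/proc_info.h>): fill_pipeinfo() above supplies the pipe_info
 * payload returned here.
 */
#if 0	/* example only */
#include <sys/proc_info.h>
#include <libproc.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2];
	struct pipe_fdinfo pi;

	if (pipe(fd) == -1)
		return (1);

	if (proc_pidfdinfo(getpid(), fd[0], PROC_PIDFDPIPEINFO,
	    &pi, sizeof(pi)) == (int)sizeof(pi))
		printf("handle %llx peer %llx status %x\n",
		    (unsigned long long)pi.pipeinfo.pipe_handle,
		    (unsigned long long)pi.pipeinfo.pipe_peerhandle,
		    pi.pipeinfo.pipe_status);
	return (0);
}
#endif	/* example only */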