/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *    The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *    @(#)sys_generic.c    8.9 (Berkeley) 2/14/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file_internal.h>
#include <sys/proc_internal.h>
#include <sys/socketvar.h>
#if KTRACE
#include <sys/uio_internal.h>
#else
#include <sys/uio.h>
#endif
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>

#include <sys/mount_internal.h>
#include <sys/protosw.h>
#include <sys/ev.h>
#include <sys/user.h>
#include <sys/kdebug.h>
#include <sys/poll.h>
#include <sys/event.h>
#include <sys/eventvar.h>

#include <mach/mach_types.h>
#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/kalloc.h>
#include <kern/thread.h>
#include <kern/clock.h>

#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syscall.h>
#include <sys/pipe.h>

#include <bsm/audit_kernel.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
/* for wait queue based select */
#include <kern/wait_queue.h>
#include <kern/kalloc.h>
#if KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/vnode_internal.h>

int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
extern void *get_bsduthreadarg(thread_t);
extern int *get_bsduthreadrval(thread_t);

__private_extern__ int dofileread(struct proc *p, struct fileproc *fp, int fd,
    user_addr_t bufp, user_size_t nbyte,
    off_t offset, int flags, user_ssize_t *retval);
__private_extern__ int dofilewrite(struct proc *p, struct fileproc *fp, int fd,
    user_addr_t bufp, user_size_t nbyte,
    off_t offset, int flags, user_ssize_t *retval);
__private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
__private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);

#if NETAT
extern int appletalk_inited;
#endif /* NETAT */

#define f_flag f_fglob->fg_flag
#define f_type f_fglob->fg_type
#define f_msgcount f_fglob->fg_msgcount
#define f_cred f_fglob->fg_cred
#define f_ops f_fglob->fg_ops
#define f_offset f_fglob->fg_offset
#define f_data f_fglob->fg_data
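
/*
 * Note: the f_* macros above are shorthands that reach through the
 * fileproc into its shared fileglob, so e.g. fp->f_flag below is
 * really fp->f_fglob->fg_flag.
 */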
/*
 * Read system call.
 */
int
read(p, uap, retval)
    struct proc *p;
    register struct read_args *uap;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int error;
    int fd = uap->fd;

    if ( (error = preparefileread(p, &fp, fd, 0)) )
        return (error);

    error = dofileread(p, fp, uap->fd, uap->cbuf, uap->nbyte,
        (off_t)-1, 0, retval);

    donefileread(p, fp, fd);

    return (error);
}

/*
 * Pread system call
 */
int
pread(p, uap, retval)
    struct proc *p;
    register struct pread_args *uap;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int fd = uap->fd;
    int error;

    if ( (error = preparefileread(p, &fp, fd, 1)) )
        return (error);

    error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
        uap->offset, FOF_OFFSET, retval);

    donefileread(p, fp, fd);

    if (!error)
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
            uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

    return (error);
}

/*
 * Code common for read and pread
 */

void
donefileread(struct proc *p, struct fileproc *fp, int fd)
{
    proc_fdlock(p);

    fp->f_flags &= ~FP_INCHRREAD;

    fp_drop(p, fd, fp, 1);
    proc_fdunlock(p);
}

int
preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
{
    vnode_t vp;
    int error;
    struct fileproc *fp;

    proc_fdlock(p);

    error = fp_lookup(p, fd, &fp, 1);

    if (error) {
        proc_fdunlock(p);
        return (error);
    }
    if ((fp->f_flag & FREAD) == 0) {
        error = EBADF;
        goto out;
    }
    if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
        error = ESPIPE;
        goto out;
    }
    if (fp->f_type == DTYPE_VNODE) {
        vp = (struct vnode *)fp->f_fglob->fg_data;

        if (vp->v_type == VCHR)
            fp->f_flags |= FP_INCHRREAD;
    }

    *fp_ret = fp;

    proc_fdunlock(p);
    return (0);

out:
    fp_drop(p, fd, fp, 1);
    proc_fdunlock(p);
    return (error);
}

__private_extern__ int
dofileread(p, fp, fd, bufp, nbyte, offset, flags, retval)
    struct proc *p;
    struct fileproc *fp;
    int fd, flags;
    user_addr_t bufp;
    user_size_t nbyte;
    off_t offset;
    user_ssize_t *retval;
{
    uio_t auio;
    user_ssize_t bytecnt;
    long error = 0;
    char uio_buf[ UIO_SIZEOF(1) ];
#if KTRACE
    uio_t ktruio = NULL;
    char ktr_uio_buf[ UIO_SIZEOF(1) ];
    int didktr = 0;
#endif

    // LP64todo - do we want to raise this?
    if (nbyte > INT_MAX)
        return (EINVAL);

    if (IS_64BIT_PROCESS(p)) {
        auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
            &uio_buf[0], sizeof(uio_buf));
    } else {
        auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
            &uio_buf[0], sizeof(uio_buf));
    }
    uio_addiov(auio, bufp, nbyte);

#if KTRACE
    /*
     * if tracing, save a copy of iovec
     */
    if (KTRPOINT(p, KTR_GENIO)) {
        didktr = 1;

        if (IS_64BIT_PROCESS(p)) {
            ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ,
                &ktr_uio_buf[0], sizeof(ktr_uio_buf));
        } else {
            ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ,
                &ktr_uio_buf[0], sizeof(ktr_uio_buf));
        }
        uio_addiov(ktruio, bufp, nbyte);
    }
#endif
    bytecnt = nbyte;

    if ((error = fo_read(fp, auio, fp->f_cred, flags, p))) {
        if (uio_resid(auio) != bytecnt && (error == ERESTART ||
            error == EINTR || error == EWOULDBLOCK))
            error = 0;
    }
    bytecnt -= uio_resid(auio);
#if KTRACE
    if (didktr && error == 0) {
        uio_setresid(ktruio, bytecnt);
        ktrgenio(p->p_tracep, fd, UIO_READ, ktruio, error);
    }
#endif

    *retval = bytecnt;

    return (error);
}
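
/*
 * Partial-transfer convention, restated for clarity: if fo_read() above
 * is interrupted (EINTR/ERESTART) or would block after some bytes have
 * already been transferred, the error is suppressed and the short count
 * is reported instead, matching POSIX read() semantics.
 */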

/*
 * Scatter read system call.
 */
int
readv(p, uap, retval)
    struct proc *p;
    register struct readv_args *uap;
    user_ssize_t *retval;
{
    uio_t auio = NULL;
    int error;
    int size_of_iovec;
    struct user_iovec *iovp;

    /* Verify range before calling uio_create() */
    if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
        return (EINVAL);

    /* allocate a uio large enough to hold the number of iovecs passed */
    auio = uio_create(uap->iovcnt, 0,
        (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
        UIO_READ);

    /* get location of iovecs within the uio.  then copyin the iovecs from
     * user space.
     */
    iovp = uio_iovsaddr(auio);
    if (iovp == NULL) {
        error = ENOMEM;
        goto ExitThisRoutine;
    }
    size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
    error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
    if (error) {
        goto ExitThisRoutine;
    }

    /* finalize uio_t for use and do the IO
     */
    uio_calculateresid(auio);
    error = rd_uio(p, uap->fd, auio, retval);

ExitThisRoutine:
    if (auio != NULL) {
        uio_free(auio);
    }
    return (error);
}
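
/*
 * Illustration of the copyin sizing above: a 32-bit process passes an
 * array of struct iovec (32-bit pointers and lengths) while a 64-bit
 * process passes struct user_iovec, so the byte count copied in is
 * iovcnt * sizeof(<matching iovec type>); the uio layer then walks
 * whichever layout uio_create() was told to expect.
 */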

/*
 * Write system call
 */
int
write(p, uap, retval)
    struct proc *p;
    register struct write_args *uap;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int error;
    int fd = uap->fd;

    error = fp_lookup(p, fd, &fp, 0);
    if (error)
        return (error);
    if ((fp->f_flag & FWRITE) == 0) {
        error = EBADF;
    } else {
        error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte,
            (off_t)-1, 0, retval);
    }
    if (error == 0)
        fp_drop_written(p, fd, fp);
    else
        fp_drop(p, fd, fp, 0);
    return (error);
}

/*
 * pwrite system call
 */
int
pwrite(p, uap, retval)
    struct proc *p;
    register struct pwrite_args *uap;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int error;
    int fd = uap->fd;

    error = fp_lookup(p, fd, &fp, 0);
    if (error)
        return (error);

    if ((fp->f_flag & FWRITE) == 0) {
        error = EBADF;
    } else {
        if (fp->f_type != DTYPE_VNODE) {
            error = ESPIPE;
        } else {
            error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
                uap->offset, FOF_OFFSET, retval);
        }
    }
    if (error == 0)
        fp_drop_written(p, fd, fp);
    else
        fp_drop(p, fd, fp, 0);

    if (!error)
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
            uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);

    return (error);
}

__private_extern__ int
dofilewrite(p, fp, fd, bufp, nbyte, offset, flags, retval)
    struct proc *p;
    struct fileproc *fp;
    int fd, flags;
    user_addr_t bufp;
    user_size_t nbyte;
    off_t offset;
    user_ssize_t *retval;
{
    uio_t auio;
    long error = 0;
    user_ssize_t bytecnt;
    char uio_buf[ UIO_SIZEOF(1) ];
#if KTRACE
    uio_t ktruio;
    int didktr = 0;
    char ktr_uio_buf[ UIO_SIZEOF(1) ];
#endif

    // LP64todo - do we want to raise this?
    if (nbyte > INT_MAX)
        return (EINVAL);

    if (IS_64BIT_PROCESS(p)) {
        auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
            &uio_buf[0], sizeof(uio_buf));
    } else {
        auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
            &uio_buf[0], sizeof(uio_buf));
    }
    uio_addiov(auio, bufp, nbyte);

#if KTRACE
    /*
     * if tracing, save a copy of iovec and uio
     */
    if (KTRPOINT(p, KTR_GENIO)) {
        didktr = 1;

        if (IS_64BIT_PROCESS(p)) {
            ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE,
                &ktr_uio_buf[0], sizeof(ktr_uio_buf));
        } else {
            ktruio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE,
                &ktr_uio_buf[0], sizeof(ktr_uio_buf));
        }
        uio_addiov(ktruio, bufp, nbyte);
    }
#endif
    bytecnt = nbyte;
    if ((error = fo_write(fp, auio, fp->f_cred, flags, p))) {
        if (uio_resid(auio) != bytecnt && (error == ERESTART ||
            error == EINTR || error == EWOULDBLOCK))
            error = 0;
        /* The socket layer handles SIGPIPE */
        if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
            psignal(p, SIGPIPE);
    }
    bytecnt -= uio_resid(auio);
#if KTRACE
    if (didktr && error == 0) {
        uio_setresid(ktruio, bytecnt);
        ktrgenio(p->p_tracep, fd, UIO_WRITE, ktruio, error);
    }
#endif
    *retval = bytecnt;

    return (error);
}

/*
 * Gather write system call
 */
int
writev(p, uap, retval)
    struct proc *p;
    register struct writev_args *uap;
    user_ssize_t *retval;
{
    uio_t auio = NULL;
    int error;
    int size_of_iovec;
    struct user_iovec *iovp;

    /* Verify range before calling uio_create() */
    if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
        return (EINVAL);

    /* allocate a uio large enough to hold the number of iovecs passed */
    auio = uio_create(uap->iovcnt, 0,
        (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
        UIO_WRITE);

    /* get location of iovecs within the uio.  then copyin the iovecs from
     * user space.
     */
    iovp = uio_iovsaddr(auio);
    if (iovp == NULL) {
        error = ENOMEM;
        goto ExitThisRoutine;
    }
    size_of_iovec = (IS_64BIT_PROCESS(p) ? sizeof(struct user_iovec) : sizeof(struct iovec));
    error = copyin(uap->iovp, (caddr_t)iovp, (uap->iovcnt * size_of_iovec));
    if (error) {
        goto ExitThisRoutine;
    }

    /* finalize uio_t for use and do the IO
     */
    uio_calculateresid(auio);
    error = wr_uio(p, uap->fd, auio, retval);

ExitThisRoutine:
    if (auio != NULL) {
        uio_free(auio);
    }
    return (error);
}


int
wr_uio(p, fdes, uio, retval)
    struct proc *p;
    int fdes;
    register uio_t uio;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int error;
    user_ssize_t count;
#if KTRACE
    struct iovec_64 *ktriov = NULL;
    struct uio ktruio;
    int didktr = 0;
    u_int iovlen;
#endif

    error = fp_lookup(p, fdes, &fp, 0);
    if (error)
        return (error);

    if ((fp->f_flag & FWRITE) == 0) {
        error = EBADF;
        goto out;
    }
    count = uio_resid(uio);
#if KTRACE
    /*
     * if tracing, save a copy of iovec
     */
    if (KTRPOINT(p, KTR_GENIO)) {
        iovlen = uio->uio_iovcnt *
            (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
        MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
        if (ktriov != NULL) {
            bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
            ktruio = *uio;
            didktr = 1;
        }
    }
#endif
    error = fo_write(fp, uio, fp->f_cred, 0, p);
    if (error) {
        if (uio_resid(uio) != count && (error == ERESTART ||
            error == EINTR || error == EWOULDBLOCK))
            error = 0;
        /* The socket layer handles SIGPIPE */
        if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
            psignal(p, SIGPIPE);
    }
    *retval = count - uio_resid(uio);

#if KTRACE
    if (didktr) {
        if (error == 0) {
            ktruio.uio_iovs.iov64p = ktriov;
            uio_setresid(&ktruio, *retval);
            ktrgenio(p->p_tracep, fdes, UIO_WRITE, &ktruio, error);
        }
        FREE(ktriov, M_TEMP);
    }
#endif

out:
    if (error == 0)
        fp_drop_written(p, fdes, fp);
    else
        fp_drop(p, fdes, fp, 0);
    return (error);
}

int
rd_uio(p, fdes, uio, retval)
    struct proc *p;
    int fdes;
    register uio_t uio;
    user_ssize_t *retval;
{
    struct fileproc *fp;
    int error;
    user_ssize_t count;
#if KTRACE
    struct iovec_64 *ktriov = NULL;
    struct uio ktruio;
    int didktr = 0;
    u_int iovlen;
#endif

    if ( (error = preparefileread(p, &fp, fdes, 0)) )
        return (error);

    count = uio_resid(uio);
#if KTRACE
    /*
     * if tracing, save a copy of iovec
     */
    if (KTRPOINT(p, KTR_GENIO)) {
        iovlen = uio->uio_iovcnt *
            (IS_64BIT_PROCESS(p) ? sizeof (struct iovec_64) : sizeof (struct iovec_32));
        MALLOC(ktriov, struct iovec_64 *, iovlen, M_TEMP, M_WAITOK);
        if (ktriov != NULL) {
            bcopy((caddr_t)uio->uio_iovs.iov64p, (caddr_t)ktriov, iovlen);
            ktruio = *uio;
            didktr = 1;
        }
    }
#endif
    error = fo_read(fp, uio, fp->f_cred, 0, p);

    if (error) {
        if (uio_resid(uio) != count && (error == ERESTART ||
            error == EINTR || error == EWOULDBLOCK))
            error = 0;
    }
    *retval = count - uio_resid(uio);

#if KTRACE
    if (didktr) {
        if (error == 0) {
            ktruio.uio_iovs.iov64p = ktriov;
            uio_setresid(&ktruio, *retval);
            ktrgenio(p->p_tracep, fdes, UIO_READ, &ktruio, error);
        }
        FREE(ktriov, M_TEMP);
    }
#endif
    donefileread(p, fp, fdes);

    return (error);
}

/*
 * Ioctl system call
 *
 */
int
ioctl(struct proc *p, register struct ioctl_args *uap, __unused register_t *retval)
{
    struct fileproc *fp;
    register u_long com;
    int error = 0;
    register u_int size;
    caddr_t datap, memp;
    boolean_t is64bit;
    int tmp;
#define STK_PARAMS 128
    char stkbuf[STK_PARAMS];
    int fd = uap->fd;

    AUDIT_ARG(fd, uap->fd);
    AUDIT_ARG(cmd, CAST_DOWN(int, uap->com)); /* LP64todo: uap->com is a user-land long */
    AUDIT_ARG(addr, uap->data);

    is64bit = proc_is64bit(p);

    proc_fdlock(p);
    error = fp_lookup(p, fd, &fp, 1);
    if (error) {
        proc_fdunlock(p);
        return (error);
    }

    AUDIT_ARG(file, p, fp);

    if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
        error = EBADF;
        goto out;
    }

#if NETAT
    /*
     * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
     * while implementing an ATioctl system call
     */
    {
        if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
            u_long fixed_command;
#ifdef APPLETALK_DEBUG
            kprintf("ioctl: special AppleTalk \n");
#endif
            datap = &stkbuf[0];
            *(user_addr_t *)datap = uap->data;
            fixed_command = _IOW(0, 0xff99, uap->data);
            error = fo_ioctl(fp, fixed_command, datap, p);
            goto out;
        }
    }

#endif /* NETAT */


    switch (com = uap->com) {
    case FIONCLEX:
        *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
        error = 0;
        goto out;
    case FIOCLEX:
        *fdflags(p, uap->fd) |= UF_EXCLOSE;
        error = 0;
        goto out;
    }

    /*
     * Interpret high order word to find amount of data to be
     * copied to/from the user's address space.
     */
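    /*
     * For illustration: an ioctl command word packs the direction bits,
     * the parameter size, a group char and a command number, roughly
     *
     *     _IOW(g, n, t)  ==  IOC_IN | (sizeof(t) << 16) | ((g) << 8) | (n)
     *
     * so IOCPARM_LEN(com) below just extracts the embedded sizeof(t).
     */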
    size = IOCPARM_LEN(com);
    if (size > IOCPARM_MAX) {
        error = ENOTTY;
        goto out;
    }
    memp = NULL;
    if (size > sizeof (stkbuf)) {
        proc_fdunlock(p);
        if ((memp = (caddr_t)kalloc(size)) == 0) {
            proc_fdlock(p);
            error = ENOMEM;
            goto out;
        }
        proc_fdlock(p);
        datap = memp;
    } else
        datap = &stkbuf[0];
    if (com & IOC_IN) {
        if (size) {
            proc_fdunlock(p);
            error = copyin(uap->data, datap, size);
            if (error) {
                if (memp)
                    kfree(memp, size);
                proc_fdlock(p);
                goto out;
            }
            proc_fdlock(p);
        } else {
            /* XXX - IOC_IN and no size? we should probably return an error here!! */
            if (is64bit) {
                *(user_addr_t *)datap = uap->data;
            }
            else {
                *(uint32_t *)datap = (uint32_t)uap->data;
            }
        }
    } else if ((com & IOC_OUT) && size)
        /*
         * Zero the buffer so the user always
         * gets back something deterministic.
         */
        bzero(datap, size);
    else if (com & IOC_VOID) {
        /* XXX - this is odd since IOC_VOID means no parameters */
        if (is64bit) {
            *(user_addr_t *)datap = uap->data;
        }
        else {
            *(uint32_t *)datap = (uint32_t)uap->data;
        }
    }

    switch (com) {

    case FIONBIO:
        if ( (tmp = *(int *)datap) )
            fp->f_flag |= FNONBLOCK;
        else
            fp->f_flag &= ~FNONBLOCK;
        error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
        break;

    case FIOASYNC:
        if ( (tmp = *(int *)datap) )
            fp->f_flag |= FASYNC;
        else
            fp->f_flag &= ~FASYNC;
        error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
        break;

    case FIOSETOWN:
        tmp = *(int *)datap;
        if (fp->f_type == DTYPE_SOCKET) {
            ((struct socket *)fp->f_data)->so_pgid = tmp;
            error = 0;
            break;
        }
        if (fp->f_type == DTYPE_PIPE) {
            error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
            break;
        }
        if (tmp <= 0) {
            tmp = -tmp;
        } else {
            struct proc *p1 = pfind(tmp);
            if (p1 == 0) {
                error = ESRCH;
                break;
            }
            tmp = p1->p_pgrp->pg_id;
        }
        error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
        break;

    case FIOGETOWN:
        if (fp->f_type == DTYPE_SOCKET) {
            error = 0;
            *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
            break;
        }
        error = fo_ioctl(fp, TIOCGPGRP, datap, p);
        *(int *)datap = -*(int *)datap;
        break;

    default:
        error = fo_ioctl(fp, com, datap, p);
        /*
         * Copy any data to user, size was
         * already set and checked above.
         */
        if (error == 0 && (com & IOC_OUT) && size)
            error = copyout(datap, uap->data, (u_int)size);
        break;
    }
    proc_fdunlock(p);
    if (memp)
        kfree(memp, size);
    proc_fdlock(p);
out:
    fp_drop(p, fd, fp, 1);
    proc_fdunlock(p);
    return (error);
}

int selwait, nselcoll;
#define SEL_FIRSTPASS 1
#define SEL_SECONDPASS 2
extern int selcontinue(int error);
extern int selprocess(int error, int sel_pass);
static int selscan(struct proc *p, struct _select * sel,
    int nfd, register_t *retval, int sel_pass, wait_queue_sub_t wqsub);
static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
    int nfd, int * count);
static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
extern uint64_t tvtoabstime(struct timeval *tvp);

/*
 * Select system call.
 */
int
select(struct proc *p, struct select_args *uap, register_t *retval)
{
    int error = 0;
    u_int ni, nw, size;
    thread_t th_act;
    struct uthread *uth;
    struct _select *sel;
    int needzerofill = 1;
    int count = 0;

    th_act = current_thread();
    uth = get_bsdthread_info(th_act);
    sel = &uth->uu_select;
    retval = (int *)get_bsduthreadrval(th_act);
    *retval = 0;

    if (uap->nd < 0) {
        return (EINVAL);
    }

    if (uap->nd > p->p_fd->fd_nfiles)
        uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */

    nw = howmany(uap->nd, NFDBITS);
    ni = nw * sizeof(fd_mask);

    /*
     * if this is the first select by the thread,
     * allocate the space for the bits.
     */
    if (sel->nbytes == 0) {
        sel->nbytes = 3 * ni;
        MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
        MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
        if ((sel->ibits == NULL) || (sel->obits == NULL))
            panic("select out of memory");
        needzerofill = 0;
    }

    /*
     * if the previously allocated space for the bits is
     * smaller than what is requested, reallocate.
     */
    if (sel->nbytes < (3 * ni)) {
        sel->nbytes = (3 * ni);
        FREE(sel->ibits, M_TEMP);
        FREE(sel->obits, M_TEMP);
        MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
        MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK | M_ZERO);
        if ((sel->ibits == NULL) || (sel->obits == NULL))
            panic("select out of memory");
        needzerofill = 0;
    }

    if (needzerofill) {
        bzero((caddr_t)sel->ibits, sel->nbytes);
        bzero((caddr_t)sel->obits, sel->nbytes);
    }

    /*
     * get the bits from the user address space
     */
#define getbits(name, x) \
    do { \
        if (uap->name && (error = copyin(uap->name, \
            (caddr_t)&sel->ibits[(x) * nw], ni))) \
            goto continuation; \
    } while (0)

    getbits(in, 0);
    getbits(ou, 1);
    getbits(ex, 2);
#undef getbits
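
    /*
     * Layout assumed by getbits/putbits: ibits and obits each hold three
     * back-to-back fd_set-style arrays of nw words, indexed read (0),
     * write (1), except (2); hence the 3 * ni sizing above.
     */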

    if (uap->tv) {
        struct timeval atv;
        if (IS_64BIT_PROCESS(p)) {
            struct user_timeval atv64;
            error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
            /* Loses resolution - assume timeout < 68 years */
            atv.tv_sec = atv64.tv_sec;
            atv.tv_usec = atv64.tv_usec;
        } else {
            error = copyin(uap->tv, (caddr_t)&atv, sizeof(atv));
        }
        if (error)
            goto continuation;
        if (itimerfix(&atv)) {
            error = EINVAL;
            goto continuation;
        }

        clock_absolutetime_interval_to_deadline(
            tvtoabstime(&atv), &sel->abstime);
    }
    else
        sel->abstime = 0;

    if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count)) ) {
        goto continuation;
    }

    sel->count = count;
    size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
    if (sel->allocsize) {
        if (sel->wqset == 0)
            panic("select: wql memory smashed");
        /* needed for the select now */
        if (size > sel->allocsize) {
            kfree(sel->wqset, sel->allocsize);
            sel->allocsize = size;
            sel->wqset = (wait_queue_set_t)kalloc(size);
            if (sel->wqset == (wait_queue_set_t)NULL)
                panic("failed to allocate memory for waitqueue\n");
        }
    } else {
        sel->count = count;
        sel->allocsize = size;
        sel->wqset = (wait_queue_set_t)kalloc(sel->allocsize);
        if (sel->wqset == (wait_queue_set_t)NULL)
            panic("failed to allocate memory for waitqueue\n");
    }
    bzero(sel->wqset, size);
    sel->wql = (char *)sel->wqset + SIZEOF_WAITQUEUE_SET;
    wait_queue_set_init(sel->wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));

continuation:
    return selprocess(error, SEL_FIRSTPASS);
}
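
/*
 * Blocking model, sketched: rather than sleeping in this stack frame,
 * select() parks the thread via tsleep1() with selcontinue as a
 * continuation; on wakeup the syscall resumes in selprocess() with
 * SEL_SECONDPASS instead of returning back through select() itself.
 */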

int
selcontinue(int error)
{
    return selprocess(error, SEL_SECONDPASS);
}

int
selprocess(int error, int sel_pass)
{
    int ncoll;
    u_int ni, nw;
    thread_t th_act;
    struct uthread *uth;
    struct proc *p;
    struct select_args *uap;
    int *retval;
    struct _select *sel;
    int unwind = 1;
    int prepost = 0;
    int somewakeup = 0;
    int doretry = 0;
    wait_result_t wait_result;

    p = current_proc();
    th_act = current_thread();
    uap = (struct select_args *)get_bsduthreadarg(th_act);
    retval = (int *)get_bsduthreadrval(th_act);
    uth = get_bsdthread_info(th_act);
    sel = &uth->uu_select;

    /* if this is the first pass, the wait queue is not set up yet */
    if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
        unwind = 0;
    if (sel->count == 0)
        unwind = 0;
retry:
    if (error != 0) {
        goto done;
    }

    ncoll = nselcoll;
    p->p_flag |= P_SELECT;
    /* skip scans if the select is just for timeouts */
    if (sel->count) {
        if (sel_pass == SEL_FIRSTPASS)
            wait_queue_sub_clearrefs(sel->wqset);

        error = selscan(p, sel, uap->nd, retval, sel_pass, sel->wqset);
        if (error || *retval) {
            goto done;
        }
        if (prepost) {
            /* if the select was preposted, we can wake up and discover
             * that someone else already read the data; go to select
             * again if time permits
             */
            prepost = 0;
            doretry = 1;
        }
        if (somewakeup) {
            somewakeup = 0;
            doretry = 1;
        }
    }

    if (uap->tv) {
        uint64_t now;

        clock_get_uptime(&now);
        if (now >= sel->abstime)
            goto done;
    }

    if (doretry) {
        /* cleanup obits and try again */
        doretry = 0;
        sel_pass = SEL_FIRSTPASS;
        goto retry;
    }

    /*
     * To effect a poll, the timeout argument should be
     * non-nil, pointing to a zero-valued timeval structure.
     */
    if (uap->tv && sel->abstime == 0) {
        goto done;
    }

    /* No spurious wakeups due to collisions, no need to check for them */
    if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
        sel_pass = SEL_FIRSTPASS;
        goto retry;
    }

    p->p_flag &= ~P_SELECT;

    /* if the select is just for timeout, skip the check */
    if (sel->count && (sel_pass == SEL_SECONDPASS))
        panic("selprocess: 2nd pass assertwaiting");

    /* Wait Queue Subordinate has waitqueue as first element */
    wait_result = wait_queue_assert_wait((wait_queue_t)sel->wqset,
        &selwait, THREAD_ABORTSAFE, sel->abstime);
    if (wait_result != THREAD_AWAKENED) {
        /* there are no preposted events */
        error = tsleep1(NULL, PSOCK | PCATCH,
            "select", 0, selcontinue);
    } else {
        prepost = 1;
        error = 0;
    }

    sel_pass = SEL_SECONDPASS;
    if (error == 0) {
        if (!prepost)
            somewakeup = 1;
        goto retry;
    }
done:
    if (unwind) {
        wait_subqueue_unlink_all(sel->wqset);
        seldrop(p, sel->ibits, uap->nd);
    }
    p->p_flag &= ~P_SELECT;
    /* select is not restarted after signals... */
    if (error == ERESTART)
        error = EINTR;
    if (error == EWOULDBLOCK)
        error = 0;
    nw = howmany(uap->nd, NFDBITS);
    ni = nw * sizeof(fd_mask);

#define putbits(name, x) \
    do { \
        if (uap->name && (error2 = \
            copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
            error = error2; \
    } while (0)

    if (error == 0) {
        int error2;

        putbits(in, 0);
        putbits(ou, 1);
        putbits(ex, 2);
#undef putbits
    }
    return (error);
}

static int
selscan(p, sel, nfd, retval, sel_pass, wqsub)
    struct proc *p;
    struct _select *sel;
    int nfd;
    register_t *retval;
    int sel_pass;
    wait_queue_sub_t wqsub;
{
    register struct filedesc *fdp = p->p_fd;
    register int msk, i, j, fd;
    register u_int32_t bits;
    struct fileproc *fp;
    int n = 0;
    int nc = 0;
    static int flag[3] = { FREAD, FWRITE, 0 };
    u_int32_t *iptr, *optr;
    u_int nw;
    u_int32_t *ibits, *obits;
    char * wql;
    char * wql_ptr;

    /*
     * Problems seen during reboot due to Mac OS X signal handling
     * (Beaker1C); verify that p->p_fd is valid.
     */
    if (fdp == NULL) {
        *retval = 0;
        return (EIO);
    }
    ibits = sel->ibits;
    obits = sel->obits;
    wql = sel->wql;

    nw = howmany(nfd, NFDBITS);

    nc = 0;
    proc_fdlock(p);

    if (sel->count) {
        for (msk = 0; msk < 3; msk++) {
            iptr = (u_int32_t *)&ibits[msk * nw];
            optr = (u_int32_t *)&obits[msk * nw];

            for (i = 0; i < nfd; i += NFDBITS) {
                bits = iptr[i/NFDBITS];

                while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
                    bits &= ~(1 << j);
                    fp = fdp->fd_ofiles[fd];

                    if (fp == NULL ||
                        (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
                        proc_fdunlock(p);
                        return (EBADF);
                    }
                    if (sel_pass == SEL_SECONDPASS) {
                        wql_ptr = (char *)0;
                        fp->f_flags &= ~FP_INSELECT;
                        fp->f_waddr = (void *)0;
                    } else {
                        wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
                        fp->f_flags |= FP_INSELECT;
                        fp->f_waddr = (void *)wqsub;
                    }
                    if (fp->f_ops && fo_select(fp, flag[msk], wql_ptr, p)) {
                        optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
                        n++;
                    }
                    nc++;
                }
            }
        }
    }
    proc_fdunlock(p);
    *retval = n;
    return (0);
}
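
/*
 * The scan loop idiom above, for illustration: ffs(bits) returns the
 * 1-based index of the lowest set bit, so
 *
 *     while ((j = ffs(bits)) && (fd = i + --j) < nfd)
 *             bits &= ~(1 << j);
 *
 * visits each set fd in a word, clearing it as it goes, and skips
 * all-zero words with a single test.
 */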

static int poll_callback(struct kqueue *, struct kevent *, void *);

struct poll_continue_args {
    user_addr_t pca_fds;
    u_int pca_nfds;
    u_int pca_rfds;
};

int
poll(struct proc *p, struct poll_args *uap, register_t *retval)
{
    struct poll_continue_args *cont;
    struct pollfd *fds;
    struct kqueue *kq;
    struct timeval atv;
    int ncoll, error = 0;
    u_int nfds = uap->nfds;
    u_int rfds = 0;
    u_int i;
    size_t ni;

    /*
     * This is kinda bogus.  We have fd limits, but that is not
     * really related to the size of the pollfd array.  Make sure
     * we let the process use at least FD_SETSIZE entries and at
     * least enough for the current limits.  We want to be reasonably
     * safe, but not overly restrictive.
     */
    if (nfds > OPEN_MAX ||
        (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE))
        return (EINVAL);

    kq = kqueue_alloc(p);
    if (kq == NULL)
        return (EAGAIN);

    ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
    MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
    if (NULL == cont) {
        error = EAGAIN;
        goto out;
    }

    fds = (struct pollfd *)&cont[1];
    error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
    if (error)
        goto out;

    if (uap->timeout != -1) {
        struct timeval rtv;

        atv.tv_sec = uap->timeout / 1000;
        atv.tv_usec = (uap->timeout % 1000) * 1000;
        if (itimerfix(&atv)) {
            error = EINVAL;
            goto out;
        }
        getmicrouptime(&rtv);
        timevaladd(&atv, &rtv);
    } else {
        atv.tv_sec = 0;
        atv.tv_usec = 0;
    }

    /* JMM - all this P_SELECT stuff is bogus */
    ncoll = nselcoll;
    p->p_flag |= P_SELECT;

    for (i = 0; i < nfds; i++) {
        short events = fds[i].events;
        struct kevent kev;
        int kerror = 0;

        /* per spec, ignore fd values below zero */
        if (fds[i].fd < 0) {
            fds[i].revents = 0;
            continue;
        }

        /* convert the poll event into a kqueue kevent */
        kev.ident = fds[i].fd;
        kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
        kev.fflags = NOTE_LOWAT;
        kev.data = 1; /* efficiency be damned: any data should trigger */
        kev.udata = CAST_USER_ADDR_T(&fds[i]);

        /* Handle input events */
        if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND )) {
            kev.filter = EVFILT_READ;
            if (!(events & ( POLLIN | POLLRDNORM )))
                kev.flags |= EV_OOBAND;
            kerror = kevent_register(kq, &kev, p);
        }

        /* Handle output events */
        if (kerror == 0 &&
            events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
            kev.filter = EVFILT_WRITE;
            kerror = kevent_register(kq, &kev, p);
        }

        /* Handle BSD extension vnode events */
        if (kerror == 0 &&
            events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
            kev.filter = EVFILT_VNODE;
            kev.fflags = 0;
            if (events & POLLEXTEND)
                kev.fflags |= NOTE_EXTEND;
            if (events & POLLATTRIB)
                kev.fflags |= NOTE_ATTRIB;
            if (events & POLLNLINK)
                kev.fflags |= NOTE_LINK;
            if (events & POLLWRITE)
                kev.fflags |= NOTE_WRITE;
            kerror = kevent_register(kq, &kev, p);
        }

        if (kerror != 0) {
            fds[i].revents = POLLNVAL;
            rfds++;
        } else
            fds[i].revents = 0;
    }

    /* Did we have any trouble registering? */
    if (rfds > 0)
        goto done;

    /* scan for, and possibly wait for, the kevents to trigger */
    cont->pca_fds = uap->fds;
    cont->pca_nfds = nfds;
    cont->pca_rfds = rfds;
    error = kevent_scan(kq, poll_callback, NULL, cont, &atv, p);
    rfds = cont->pca_rfds;

done:
    p->p_flag &= ~P_SELECT;
    /* poll is not restarted after signals... */
    if (error == ERESTART)
        error = EINTR;
    if (error == EWOULDBLOCK)
        error = 0;
    if (error == 0) {
        error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
        *retval = rfds;
    }

out:
    if (NULL != cont)
        FREE(cont, M_TEMP);

    kqueue_dealloc(kq, p);
    return (error);
}
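
/*
 * Summary of the emulation above: poll() is built on top of kqueue.
 * Each pollfd is registered as one or more one-shot kevents
 * (POLLIN/POLLRDNORM/POLLPRI/POLLRDBAND -> EVFILT_READ,
 * POLLOUT/POLLWRNORM/POLLWRBAND -> EVFILT_WRITE, and the BSD
 * POLLEXTEND-family extensions -> EVFILT_VNODE), kevent_scan() blocks
 * until something fires or the timeout expires, and poll_callback()
 * below translates each delivered kevent back into revents bits.
 */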

static int
poll_callback(__unused struct kqueue *kq, struct kevent *kevp, void *data)
{
    struct poll_continue_args *cont = (struct poll_continue_args *)data;
    struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
    short mask;

    /* convert the results back into revents */
    if (kevp->flags & EV_EOF)
        fds->revents |= POLLHUP;
    if (kevp->flags & EV_ERROR)
        fds->revents |= POLLERR;
    cont->pca_rfds++;

    switch (kevp->filter) {
    case EVFILT_READ:
        if (fds->revents & POLLHUP)
            mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
        else {
            mask = 0;
            if (kevp->data != 0)
                mask |= (POLLIN | POLLRDNORM );
            if (kevp->flags & EV_OOBAND)
                mask |= ( POLLPRI | POLLRDBAND );
        }
        fds->revents |= (fds->events & mask);
        break;

    case EVFILT_WRITE:
        if (!(fds->revents & POLLHUP))
            fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
        break;

    case EVFILT_PROC:
        if (kevp->fflags & NOTE_EXTEND)
            fds->revents |= (fds->events & POLLEXTEND);
        if (kevp->fflags & NOTE_ATTRIB)
            fds->revents |= (fds->events & POLLATTRIB);
        if (kevp->fflags & NOTE_LINK)
            fds->revents |= (fds->events & POLLNLINK);
        if (kevp->fflags & NOTE_WRITE)
            fds->revents |= (fds->events & POLLWRITE);
        break;
    }
    return 0;
}

int
seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
{

    return (1);
}

static int
selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits,
    int nfd, int *count)
{
    register struct filedesc *fdp = p->p_fd;
    register int msk, i, j, fd;
    register u_int32_t bits;
    struct fileproc *fp;
    int n = 0;
    u_int32_t *iptr;
    u_int nw;
    int error = 0;
    int dropcount;

    /*
     * Problems seen during reboot due to Mac OS X signal handling
     * (Beaker1C); verify that p->p_fd is valid.
     */
    if (fdp == NULL) {
        *count = 0;
        return (EIO);
    }
    nw = howmany(nfd, NFDBITS);

    proc_fdlock(p);
    for (msk = 0; msk < 3; msk++) {
        iptr = (u_int32_t *)&ibits[msk * nw];
        for (i = 0; i < nfd; i += NFDBITS) {
            bits = iptr[i/NFDBITS];
            while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
                bits &= ~(1 << j);
                fp = fdp->fd_ofiles[fd];
                if (fp == NULL ||
                    (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
                    *count = 0;
                    error = EBADF;
                    goto bad;
                }
                fp->f_iocount++;
                n++;
            }
        }
    }
    proc_fdunlock(p);

    *count = n;
    return (0);
bad:
    dropcount = 0;

    if (n == 0)
        goto out;
    /* undo the iocounts */
    for (msk = 0; msk < 3; msk++) {
        iptr = (u_int32_t *)&ibits[msk * nw];
        for (i = 0; i < nfd; i += NFDBITS) {
            bits = iptr[i/NFDBITS];
            while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
                bits &= ~(1 << j);
                fp = fdp->fd_ofiles[fd];
                if (dropcount >= n)
                    goto out;
                fp->f_iocount--;

                if (p->p_fpdrainwait && fp->f_iocount == 0) {
                    p->p_fpdrainwait = 0;
                    wakeup(&p->p_fpdrainwait);
                }
                dropcount++;
            }
        }
    }
out:
    proc_fdunlock(p);
    return (error);
}

static int
seldrop(p, ibits, nfd)
    struct proc *p;
    u_int32_t *ibits;
    int nfd;
{
    register struct filedesc *fdp = p->p_fd;
    register int msk, i, j, fd;
    register u_int32_t bits;
    struct fileproc *fp;
    int n = 0;
    u_int32_t *iptr;
    u_int nw;

    /*
     * Problems seen during reboot due to Mac OS X signal handling
     * (Beaker1C); verify that p->p_fd is valid.
     */
    if (fdp == NULL) {
        return (EIO);
    }

    nw = howmany(nfd, NFDBITS);


    proc_fdlock(p);
    for (msk = 0; msk < 3; msk++) {
        iptr = (u_int32_t *)&ibits[msk * nw];
        for (i = 0; i < nfd; i += NFDBITS) {
            bits = iptr[i/NFDBITS];
            while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
                bits &= ~(1 << j);
                fp = fdp->fd_ofiles[fd];
                if (fp == NULL
#if 0
                    /* if you are here then it is being closed */
                    || (fdp->fd_ofileflags[fd] & UF_RESERVED)
#endif
                    ) {
                    proc_fdunlock(p);
                    return (EBADF);
                }
                n++;
                fp->f_iocount--;
                fp->f_flags &= ~FP_INSELECT;

                if (p->p_fpdrainwait && fp->f_iocount == 0) {
                    p->p_fpdrainwait = 0;
                    wakeup(&p->p_fpdrainwait);
                }
            }
        }
    }
    proc_fdunlock(p);
    return (0);
}

/*
 * Record a select request.
 */
void
selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
{
    thread_t cur_act = current_thread();
    struct uthread * ut = get_bsdthread_info(cur_act);

    /* need to look at collisions */

    if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) {
        return;
    }

    /* do not record if this is the second pass of select */
    if ((p_wql == (void *)0)) {
        return;
    }

    if ((sip->si_flags & SI_INITED) == 0) {
        wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
        sip->si_flags |= SI_INITED;
        sip->si_flags &= ~SI_CLEAR;
    }

    if (sip->si_flags & SI_RECORDED) {
        sip->si_flags |= SI_COLL;
    } else
        sip->si_flags &= ~SI_COLL;

    sip->si_flags |= SI_RECORDED;
    if (!wait_queue_member(&sip->si_wait_queue, ut->uu_select.wqset))
        wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_select.wqset,
            (wait_queue_link_t)p_wql);

    return;
}

void
selwakeup(sip)
    register struct selinfo *sip;
{

    if ((sip->si_flags & SI_INITED) == 0) {
        return;
    }

    if (sip->si_flags & SI_COLL) {
        nselcoll++;
        sip->si_flags &= ~SI_COLL;
#if 0
        /* will not support */
        //wakeup((caddr_t)&selwait);
#endif
    }

    if (sip->si_flags & SI_RECORDED) {
        wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED);
        sip->si_flags &= ~SI_RECORDED;
    }

}

void
selthreadclear(sip)
    register struct selinfo *sip;
{

    if ((sip->si_flags & SI_INITED) == 0) {
        return;
    }
    if (sip->si_flags & SI_RECORDED) {
        selwakeup(sip);
        sip->si_flags &= ~(SI_RECORDED | SI_COLL);
    }
    sip->si_flags |= SI_CLEAR;
    wait_queue_unlinkall_nofree(&sip->si_wait_queue);
}



#define DBG_EVENT 0x10

#define DBG_POST 0x10
#define DBG_WATCH 0x11
#define DBG_WAIT 0x12
#define DBG_MOD 0x13
#define DBG_EWAKEUP 0x14
#define DBG_ENQUEUE 0x15
#define DBG_DEQUEUE 0x16

#define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
#define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
#define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
#define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
#define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
#define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
#define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)


#define EVPROCDEQUE(p, evq) do { \
    proc_lock(p); \
    if (evq->ee_flags & EV_QUEUED) { \
        TAILQ_REMOVE(&p->p_evlist, evq, ee_plist); \
        evq->ee_flags &= ~EV_QUEUED; \
    } \
    proc_unlock(p); \
} while (0);
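
/*
 * Note: EVPROCDEQUE takes and drops the proc lock itself, so callers
 * such as evsofree/evpipefree below must enter it without the proc
 * lock held, holding only the socket or pipe lock.
 */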


/*
 * called upon socket close. dequeue and free all events for
 * the socket... socket must be locked by caller.
 */
void
evsofree(struct socket *sp)
{
    struct eventqelt *evq, *next;
    proc_t p;

    if (sp == NULL)
        return;

    for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
        next = evq->ee_slist.tqe_next;
        p = evq->ee_proc;

        if (evq->ee_flags & EV_QUEUED) {
            EVPROCDEQUE(p, evq);
        }
        TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
        FREE(evq, M_TEMP);
    }
}


/*
 * called upon pipe close. dequeue and free all events for
 * the pipe... pipe must be locked by caller
 */
void
evpipefree(struct pipe *cpipe)
{
    struct eventqelt *evq, *next;
    proc_t p;

    for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
        next = evq->ee_slist.tqe_next;
        p = evq->ee_proc;

        EVPROCDEQUE(p, evq);

        TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
        FREE(evq, M_TEMP);
    }
}


/*
 * enqueue this event if it's not already queued. wakeup
 * the proc if we do queue this event to it...
 * takes the proc lock internally and drops it before
 * doing the wakeup, returning in that state
 */
static void
evprocenque(struct eventqelt *evq)
{
    proc_t p;

    assert(evq);
    p = evq->ee_proc;

    KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, evq, evq->ee_flags, evq->ee_eventmask,0,0);

    proc_lock(p);

    if (evq->ee_flags & EV_QUEUED) {
        proc_unlock(p);

        KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
        return;
    }
    evq->ee_flags |= EV_QUEUED;

    TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);

    proc_unlock(p);

    wakeup(&p->p_evlist);

    KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
}


/*
 * pipe lock must be taken by the caller
 */
void
postpipeevent(struct pipe *pipep, int event)
{
    int mask;
    struct eventqelt *evq;

    if (pipep == NULL)
        return;
    KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);

    for (evq = pipep->pipe_evlist.tqh_first;
        evq != NULL; evq = evq->ee_slist.tqe_next) {

        if (evq->ee_eventmask == 0)
            continue;
        mask = 0;

        switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {

        case EV_RWBYTES:
            if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
                mask |= EV_RE;
                evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
            }
            if ((evq->ee_eventmask & EV_WR) &&
                (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) {

                if (pipep->pipe_state & PIPE_EOF) {
                    mask |= EV_WR|EV_RESET;
                    break;
                }
                mask |= EV_WR;
                evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt;
            }
            break;

        case EV_WCLOSED:
        case EV_RCLOSED:
            if ((evq->ee_eventmask & EV_RE)) {
                mask |= EV_RE|EV_RCLOSED;
            }
            if ((evq->ee_eventmask & EV_WR)) {
                mask |= EV_WR|EV_WCLOSED;
            }
            break;

        default:
            return;
        }
        if (mask) {
            /*
             * disarm... postevents are nops until this event is 'read' via
             * waitevent and then re-armed via modwatch
             */
            evq->ee_eventmask = 0;

            /*
             * since events are disarmed until after the waitevent
             * the ee_req.er_xxxx fields can't change once we've
             * inserted this event into the proc queue...
             * therefore, the waitevent will see a 'consistent'
             * snapshot of the event, even though it won't hold
             * the pipe lock, and we're updating the event outside
             * of the proc lock, which it will hold
             */
            evq->ee_req.er_eventbits |= mask;

            KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask, 1,0);

            evprocenque(evq);
        }
    }
    KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
}


/*
 * given either a sockbuf or a socket run down the
 * event list and queue ready events found...
 * the socket must be locked by the caller
 */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
    int mask;
    struct eventqelt *evq;
    struct tcpcb *tp;

    if (sb)
        sp = sb->sb_so;
    if (sp == NULL)
        return;

    KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

    for (evq = sp->so_evlist.tqh_first;
        evq != NULL; evq = evq->ee_slist.tqe_next) {

        if (evq->ee_eventmask == 0)
            continue;
        mask = 0;

        /* ready for reading:
           - byte cnt >= receive low water mark
           - read-half of conn closed
           - conn pending for listening sock
           - socket error pending

           ready for writing
           - byte cnt avail >= send low water mark
           - write half of conn closed
           - socket error pending
           - non-blocking conn completed successfully

           exception pending
           - out of band data
           - sock at out of band mark
        */

        switch (event & EV_DMASK) {

        case EV_OOB:
            if ((evq->ee_eventmask & EV_EX)) {
                if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
                    mask |= EV_EX|EV_OOB;
            }
            break;

        case EV_RWBYTES|EV_OOB:
            if ((evq->ee_eventmask & EV_EX)) {
                if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
                    mask |= EV_EX|EV_OOB;
            }
            /*
             * fall into the next case
             */
        case EV_RWBYTES:
            if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
                if (sp->so_error) {
                    if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
                        if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
                            (tp->t_state == TCPS_CLOSED)) {
                            mask |= EV_RE|EV_RESET;
                            break;
                        }
                    }
                }
                mask |= EV_RE;
                evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;

                if (sp->so_state & SS_CANTRCVMORE) {
                    mask |= EV_FIN;
                    break;
                }
            }
            if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
                if (sp->so_error) {
                    if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
                        if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
                            (tp->t_state == TCPS_CLOSED)) {
                            mask |= EV_WR|EV_RESET;
                            break;
                        }
                    }
                }
                mask |= EV_WR;
                evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
            }
            break;

        case EV_RCONN:
            if ((evq->ee_eventmask & EV_RE)) {
                mask |= EV_RE|EV_RCONN;
                evq->ee_req.er_rcnt = sp->so_qlen + 1; // incl this one
            }
            break;

        case EV_WCONN:
            if ((evq->ee_eventmask & EV_WR)) {
                mask |= EV_WR|EV_WCONN;
            }
            break;

        case EV_RCLOSED:
            if ((evq->ee_eventmask & EV_RE)) {
                mask |= EV_RE|EV_RCLOSED;
            }
            break;

        case EV_WCLOSED:
            if ((evq->ee_eventmask & EV_WR)) {
                mask |= EV_WR|EV_WCLOSED;
            }
            break;

        case EV_FIN:
            if (evq->ee_eventmask & EV_RE) {
                mask |= EV_RE|EV_FIN;
            }
            break;

        case EV_RESET:
        case EV_TIMEOUT:
            if (evq->ee_eventmask & EV_RE) {
                mask |= EV_RE | event;
            }
            if (evq->ee_eventmask & EV_WR) {
                mask |= EV_WR | event;
            }
            break;

        default:
            KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
            return;
        } /* switch */

        KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);

        if (mask) {
            /*
             * disarm... postevents are nops until this event is 'read' via
             * waitevent and then re-armed via modwatch
             */
            evq->ee_eventmask = 0;

            /*
             * since events are disarmed until after the waitevent
             * the ee_req.er_xxxx fields can't change once we've
             * inserted this event into the proc queue...
             * since waitevent can't see this event until we
             * enqueue it, waitevent will see a 'consistent'
             * snapshot of the event, even though it won't hold
             * the socket lock, and we're updating the event outside
             * of the proc lock, which it will hold
             */
            evq->ee_req.er_eventbits |= mask;

            evprocenque(evq);
        }
    }
    KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}


/*
 * watchevent system call. user passes us an event to watch
 * for. we malloc an event object, initialize it, and queue
 * it to the open socket. when the event occurs, postevent()
 * will enqueue it back to our proc where we can retrieve it
 * via waitevent().
 *
 * should this prevent duplicate events on same socket?
 */
int
watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
{
    struct eventqelt *evq = (struct eventqelt *)0;
    struct eventqelt *np = NULL;
    struct eventreq *erp;
    struct fileproc *fp = NULL;
    int error;

    KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);

    // get a qelt and fill with user's req
    MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);

    if (evq == NULL)
        panic("can't MALLOC evq");
    erp = &evq->ee_req;

    // get user's request pkt
    if ( (error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
        sizeof(struct eventreq))) ) {
        FREE(evq, M_TEMP);

        KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
        return (error);
    }
    KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,evq,0,0);

    // validate, freeing qelt if errors
    error = 0;
    proc_fdlock(p);

    if (erp->er_type != EV_FD) {
        error = EINVAL;
    } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
        error = EBADF;
    } else if (fp->f_type == DTYPE_SOCKET) {
        socket_lock((struct socket *)fp->f_data, 1);
        np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
    } else if (fp->f_type == DTYPE_PIPE) {
        PIPE_LOCK((struct pipe *)fp->f_data);
        np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
    } else {
        fp_drop(p, erp->er_handle, fp, 1);
        error = EINVAL;
    }
    proc_fdunlock(p);

    if (error) {
        FREE(evq, M_TEMP);

        KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
        return (error);
    }

    /*
     * only allow one watch per file per proc
     */
    for ( ; np != NULL; np = np->ee_slist.tqe_next) {
        if (np->ee_proc == p) {
            if (fp->f_type == DTYPE_SOCKET)
                socket_unlock((struct socket *)fp->f_data, 1);
            else
                PIPE_UNLOCK((struct pipe *)fp->f_data);
            fp_drop(p, erp->er_handle, fp, 0);
            FREE(evq, M_TEMP);

            KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
            return (EINVAL);
        }
    }
    erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
    evq->ee_proc = p;
    evq->ee_eventmask = uap->u_eventmask & EV_MASK;
    evq->ee_flags = 0;

    if (fp->f_type == DTYPE_SOCKET) {
        TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
        postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events

        socket_unlock((struct socket *)fp->f_data, 1);
    } else {
        TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
        postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);

        PIPE_UNLOCK((struct pipe *)fp->f_data);
    }
    fp_drop_event(p, erp->er_handle, fp);

    KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
    return (0);
}



/*
 * waitevent system call.
 * grabs the next waiting event for this proc and returns
 * it.  if there are no pending events, the caller can block
 * with a timeout, block indefinitely (tv == NULL), or poll
 * by passing a zeroed timeval.
 */
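/*
 * illustrative userland usage (a minimal sketch; it assumes a libc
 * stub that returns the value left in *retval, per the usual syscall
 * convention, and a hypothetical handle_event() helper):
 *
 *	struct eventreq er;
 *	struct timeval tv = { 0, 0 };	// zeroed timeval: pure poll
 *
 *	switch (waitevent(&er, &tv)) {
 *	case 0:				// an event was dequeued into er
 *		handle_event(&er);
 *		break;
 *	case 1:				// poll found nothing pending
 *		break;
 *	default:			// -1... check errno
 *		err(1, "waitevent");
 *	}
 */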
int
waitevent(proc_t p, struct waitevent_args *uap, int *retval)
{
	int error = 0;
	struct eventqelt *evq;
	struct eventreq er;
	uint64_t abstime, interval;

	if (uap->tv) {
		struct timeval atv;

		error = copyin(CAST_USER_ADDR_T(uap->tv), (caddr_t)&atv, sizeof (atv));
		if (error)
			return(error);
		if (itimerfix(&atv)) {
			error = EINVAL;
			return(error);
		}
		interval = tvtoabstime(&atv);
	} else
		interval = 0;
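
	/*
	 * summary of the timeout modes derived above:
	 *	uap->tv == NULL	  -> interval == 0: block until an event
	 *			     (or a signal) arrives
	 *	*uap->tv zeroed	  -> interval == 0: pure poll, returning
	 *			     immediately with *retval = 1 if the
	 *			     proc queue is empty
	 *	*uap->tv nonzero  -> block with a deadline
	 */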

	KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);

	proc_lock(p);
retry:
	if ((evq = p->p_evlist.tqh_first) != NULL) {
		/*
		 * found one... make a local copy while it's still on the queue
		 * to prevent it from changing while in the midst of copying.
		 * don't want to hold the proc lock across a copyout because
		 * it might block on a page fault at the target in user space
		 */
		bcopy((caddr_t)&evq->ee_req, (caddr_t)&er, sizeof (struct eventreq));

		TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);

		evq->ee_flags &= ~EV_QUEUED;

		proc_unlock(p);

		error = copyout((caddr_t)&er, CAST_USER_ADDR_T(uap->u_req), sizeof(struct eventreq));

		/*
		 * trace from the local copy... once EV_QUEUED is cleared
		 * and the proc lock is dropped, evq may be freed out from
		 * under us by modwatch(EV_RM) or waitevent_close()
		 */
		KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
			     er.er_handle, er.er_eventbits, evq, 0);
		return (error);
	}
	else {
		if (uap->tv && interval == 0) {
			proc_unlock(p);
			*retval = 1; // poll failed

			KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
			return (error);
		}
		if (interval != 0)
			clock_absolutetime_interval_to_deadline(interval, &abstime);
		else
			abstime = 0;

		KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0);

		error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);

		KERNEL_DEBUG(DBG_MISC_WAIT, 2,&p->p_evlist,0,0,0);

		if (error == 0)
			goto retry;
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK) {
			*retval = 1;	// timed out with no event
			error = 0;
		}
	}
	proc_unlock(p);

	KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
	return (error);
}


/*
 * modwatch system call.  user passes in the event to modify.
 * if we find it, we reset the event bits and queue/dequeue
 * the event as needed.
 */
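/*
 * illustrative userland usage (a minimal sketch; 'er' is the same
 * request registered earlier via watchevent()):
 *
 *	if (modwatch(&er, EV_RE) < 0)	// re-arm, read events only
 *		err(1, "modwatch");
 *
 *	if (modwatch(&er, EV_RM) < 0)	// remove the watch entirely
 *		err(1, "modwatch");
 */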
int
modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
{
	struct eventreq er;
	struct eventreq *erp = &er;
	struct eventqelt *evq;
	int error;
	struct fileproc *fp;
	int flag;

	KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);

	/*
	 * get user's request pkt
	 */
	if ((error = copyin(CAST_USER_ADDR_T(uap->u_req), (caddr_t)erp,
			    sizeof(struct eventreq)))) {
		KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
		return(error);
	}
	proc_fdlock(p);

	if (erp->er_type != EV_FD) {
		error = EINVAL;
	} else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
		error = EBADF;
	} else if (fp->f_type == DTYPE_SOCKET) {
		socket_lock((struct socket *)fp->f_data, 1);
		evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	} else if (fp->f_type == DTYPE_PIPE) {
		PIPE_LOCK((struct pipe *)fp->f_data);
		evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	} else {
		fp_drop(p, erp->er_handle, fp, 1);
		error = EINVAL;
	}

	if (error) {
		proc_fdunlock(p);
		KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
		return(error);
	}

	if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
		fp->f_flags &= ~FP_WAITEVENT;
	}
	proc_fdunlock(p);

	// locate event if possible
	for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
		if (evq->ee_proc == p)
			break;
	}
	if (evq == NULL) {
		if (fp->f_type == DTYPE_SOCKET)
			socket_unlock((struct socket *)fp->f_data, 1);
		else
			PIPE_UNLOCK((struct pipe *)fp->f_data);
		fp_drop(p, erp->er_handle, fp, 0);
		KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
		return(EINVAL);
	}
	KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0);

	if (uap->u_eventmask == EV_RM) {
		EVPROCDEQUE(p, evq);

		if (fp->f_type == DTYPE_SOCKET) {
			TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
			socket_unlock((struct socket *)fp->f_data, 1);
		} else {
			TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
			PIPE_UNLOCK((struct pipe *)fp->f_data);
		}
		fp_drop(p, erp->er_handle, fp, 0);
		FREE(evq, M_TEMP);
		KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
		return(0);
	}
	switch (uap->u_eventmask & EV_MASK) {

	case 0:
		flag = 0;
		break;

	case EV_RE:
	case EV_WR:
	case EV_RE|EV_WR:
		flag = EV_RWBYTES;
		break;

	case EV_EX:
		flag = EV_OOB;
		break;

	case EV_EX|EV_RE:
	case EV_EX|EV_WR:
	case EV_EX|EV_RE|EV_WR:
		flag = EV_OOB|EV_RWBYTES;
		break;

	default:
		if (fp->f_type == DTYPE_SOCKET)
			socket_unlock((struct socket *)fp->f_data, 1);
		else
			PIPE_UNLOCK((struct pipe *)fp->f_data);
		fp_drop(p, erp->er_handle, fp, 0);
		KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
		return(EINVAL);
	}
	/*
	 * since we're holding the socket/pipe lock, the event
	 * cannot go from the unqueued state to the queued state;
	 * however, it can go from the queued state to the unqueued
	 * state, since that direction is protected by the proc_lock.
	 * so do a quick check for EV_QUEUED w/o holding the proc lock;
	 * since by far the common case will be NOT EV_QUEUED, this
	 * saves us taking the proc_lock the majority of the time
	 */
	if (evq->ee_flags & EV_QUEUED) {
		/*
		 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
		 */
		EVPROCDEQUE(p, evq);
	}
	/*
	 * while the event is off the proc queue and
	 * we're holding the socket/pipe lock,
	 * it's safe to update these fields...
	 */
	evq->ee_req.er_eventbits = 0;
	evq->ee_eventmask = uap->u_eventmask & EV_MASK;

	if (fp->f_type == DTYPE_SOCKET) {
		postevent((struct socket *)fp->f_data, 0, flag);
		socket_unlock((struct socket *)fp->f_data, 1);
	} else {
		postpipeevent((struct pipe *)fp->f_data, flag);
		PIPE_UNLOCK((struct pipe *)fp->f_data);
	}
	fp_drop(p, erp->er_handle, fp, 0);
	KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,fp->f_data,flag,0);
	return(0);
}

/* this routine is called from the close of fd with proc_fdlock held */
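/*
 * note: proc_fdlock is dropped while the socket/pipe event list is
 * swept and retaken before returning, so fd table state may change
 * across this call (see the proc_fdunlock/proc_fdlock pairs below)
 */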
int
waitevent_close(struct proc *p, struct fileproc *fp)
{
	struct eventqelt *evq;

	fp->f_flags &= ~FP_WAITEVENT;

	if (fp->f_type == DTYPE_SOCKET) {
		socket_lock((struct socket *)fp->f_data, 1);
		evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
	} else if (fp->f_type == DTYPE_PIPE) {
		PIPE_LOCK((struct pipe *)fp->f_data);
		evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
	} else {
		return(EINVAL);
	}
	proc_fdunlock(p);

	// locate event if possible
	for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
		if (evq->ee_proc == p)
			break;
	}
	if (evq == NULL) {
		if (fp->f_type == DTYPE_SOCKET)
			socket_unlock((struct socket *)fp->f_data, 1);
		else
			PIPE_UNLOCK((struct pipe *)fp->f_data);

		proc_fdlock(p);

		return(EINVAL);
	}
	EVPROCDEQUE(p, evq);

	if (fp->f_type == DTYPE_SOCKET) {
		TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
		socket_unlock((struct socket *)fp->f_data, 1);
	} else {
		TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
		PIPE_UNLOCK((struct pipe *)fp->f_data);
	}
	FREE(evq, M_TEMP);

	proc_fdlock(p);

	return(0);
}