2 * Copyright (c) 1998-2007 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
65 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
66 * support for mandatory and extensible security protections. This notice
67 * is included in support of clause 2.2 (b) of the Apple Public License,
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/filedesc.h>
75 #include <sys/proc_internal.h>
76 #include <sys/kauth.h>
77 #include <sys/file_internal.h>
78 #include <sys/fcntl.h>
79 #include <sys/malloc.h>
81 #include <sys/domain.h>
82 #include <sys/kernel.h>
83 #include <sys/event.h>
85 #include <sys/protosw.h>
86 #include <sys/socket.h>
87 #include <sys/socketvar.h>
88 #include <sys/resourcevar.h>
89 #include <sys/signalvar.h>
90 #include <sys/sysctl.h>
93 #include <sys/kdebug.h>
95 #include <net/route.h>
96 #include <netinet/in.h>
97 #include <netinet/in_pcb.h>
98 #include <kern/zalloc.h>
99 #include <kern/locks.h>
100 #include <machine/limits.h>
101 #include <libkern/OSAtomic.h>
102 #include <pexpert/pexpert.h>
105 #include <security/mac.h>
106 #include <security/mac_framework.h>
109 /* how a timeval looks to a 64-bit process */
116 int so_cache_timeouts
= 0;
117 int so_cache_max_freed
= 0;
118 int cached_sock_count
= 0;
119 struct socket
*socket_cache_head
= 0;
120 struct socket
*socket_cache_tail
= 0;
121 u_long so_cache_time
= 0;
122 int so_cache_init_done
= 0;
123 struct zone
*so_cache_zone
;
125 static lck_grp_t
*so_cache_mtx_grp
;
126 static lck_attr_t
*so_cache_mtx_attr
;
127 static lck_grp_attr_t
*so_cache_mtx_grp_attr
;
128 lck_mtx_t
*so_cache_mtx
;
130 #include <machine/limits.h>
132 static void filt_sordetach(struct knote
*kn
);
133 static int filt_soread(struct knote
*kn
, long hint
);
134 static void filt_sowdetach(struct knote
*kn
);
135 static int filt_sowrite(struct knote
*kn
, long hint
);
136 static int filt_solisten(struct knote
*kn
, long hint
);
139 sooptcopyin_timeval(struct sockopt
*sopt
, struct timeval
* tv_p
);
142 sooptcopyout_timeval(struct sockopt
*sopt
, const struct timeval
* tv_p
);
144 static struct filterops solisten_filtops
=
145 { 1, NULL
, filt_sordetach
, filt_solisten
};
146 static struct filterops soread_filtops
=
147 { 1, NULL
, filt_sordetach
, filt_soread
};
148 static struct filterops sowrite_filtops
=
149 { 1, NULL
, filt_sowdetach
, filt_sowrite
};
151 #define EVEN_MORE_LOCKING_DEBUG 0
152 int socket_debug
= 0;
153 int socket_zone
= M_SOCKET
;
154 so_gen_t so_gencnt
; /* generation count for sockets */
156 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
157 MALLOC_DEFINE(M_PCB
, "pcb", "protocol control block");
159 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
160 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
161 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
162 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
163 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
164 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
165 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
167 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
170 SYSCTL_DECL(_kern_ipc
);
172 int somaxconn
= SOMAXCONN
;
173 SYSCTL_INT(_kern_ipc
, KIPC_SOMAXCONN
, somaxconn
, CTLFLAG_RW
, &somaxconn
, 0, "");
175 /* Should we get a maximum also ??? */
176 static int sosendmaxchain
= 65536;
177 static int sosendminchain
= 16384;
178 static int sorecvmincopy
= 16384;
179 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendminchain
, CTLFLAG_RW
, &sosendminchain
,
181 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorecvmincopy
, CTLFLAG_RW
, &sorecvmincopy
,
185 * Set to enable jumbo clusters (if available) for large writes when
186 * the socket is marked with SOF_MULTIPAGES; see below.
189 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl
, CTLFLAG_RW
, &sosendjcl
, 0, "");
192 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
193 * writes on the socket for all protocols on any network interfaces,
194 * depending upon sosendjcl above. Be extra careful when setting this
195 * to 1, because sending down packets that cross physical pages down to
196 * broken drivers (those that falsely assume that the physical pages
197 * are contiguous) might lead to system panics or silent data corruption.
198 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
199 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
200 * capable. Set this to 1 only for testing/debugging purposes.
202 int sosendjcl_ignore_capab
= 0;
203 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl_ignore_capab
, CTLFLAG_RW
,
204 &sosendjcl_ignore_capab
, 0, "");
207 * Socket operation routines.
208 * These routines are called by the routines in
209 * sys_socket.c or from a system process, and
210 * implement the semantics of socket operations by
211 * switching out to the protocol specific routines.
215 extern void postevent(struct socket
*, struct sockbuf
*, int);
216 extern void evsofree(struct socket
*);
218 /* TODO: these should be in header file */
219 extern int get_inpcb_str_size(void);
220 extern int get_tcp_str_size(void);
221 extern struct domain
*pffinddomain(int);
222 extern struct protosw
*pffindprotonotype(int, int);
223 extern int soclose_locked(struct socket
*);
224 extern int soo_kqfilter(struct fileproc
*, struct knote
*, struct proc
*);
228 vm_size_t so_cache_zone_element_size
;
230 static int sodelayed_copy(struct socket
*, struct uio
*, struct mbuf
**, int *);
231 static void cached_sock_alloc(struct socket
**, int);
232 static void cached_sock_free(struct socket
*);
233 static void so_cache_timer(void *);
235 void soclose_wait_locked(struct socket
*so
);
243 if (so_cache_init_done
) {
244 printf("socketinit: already called...\n");
248 PE_parse_boot_argn("socket_debug", &socket_debug
, sizeof (socket_debug
));
251 * allocate lock group attribute and group for socket cache mutex
253 so_cache_mtx_grp_attr
= lck_grp_attr_alloc_init();
255 so_cache_mtx_grp
= lck_grp_alloc_init("so_cache",
256 so_cache_mtx_grp_attr
);
259 * allocate the lock attribute for socket cache mutex
261 so_cache_mtx_attr
= lck_attr_alloc_init();
263 so_cache_init_done
= 1;
265 /* cached sockets mutex */
266 so_cache_mtx
= lck_mtx_alloc_init(so_cache_mtx_grp
, so_cache_mtx_attr
);
268 if (so_cache_mtx
== NULL
)
269 return; /* we're hosed... */
271 str_size
= (vm_size_t
)(sizeof (struct socket
) + 4 +
272 get_inpcb_str_size() + 4 + get_tcp_str_size());
274 so_cache_zone
= zinit(str_size
, 120000*str_size
, 8192, "socache zone");
276 printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size
);
278 timeout(so_cache_timer
, NULL
, (SO_CACHE_FLUSH_INTERVAL
* hz
));
280 so_cache_zone_element_size
= str_size
;
/*
 * cached_sock_alloc: produce a socket for the caller and store it in *so.
 *
 * Two paths are visible.  When cached_sock_count is non-zero the entry at
 * socket_cache_head is unlinked while so_cache_mtx is held, the structure
 * is zeroed, and its previous so_saved_pcb pointer is put back.  The
 * remaining code takes an element from so_cache_zone with zalloc() or
 * zalloc_noblock(), zeroes it, and points so_saved_pcb at the storage that
 * follows the struct socket inside the same element.  Both paths finish by
 * setting cached_in_sock_layer to 1.
 *
 * so:     out parameter that receives the socket pointer.
 * waitok: presumably chooses between zalloc() and zalloc_noblock(); the
 *         test itself is not visible in this copy -- TODO confirm.
 */
286 cached_sock_alloc(struct socket
**so
, int waitok
)
289 register u_long offset
;
/* The cache list and its counter are only touched with so_cache_mtx held. */
291 lck_mtx_lock(so_cache_mtx
);
293 if (cached_sock_count
) {
/* Reuse the entry at the head of the cache list. */
295 *so
= socket_cache_head
;
297 panic("cached_sock_alloc: cached sock is null");
/* Unlink it: the successor becomes the new head and loses its back link. */
299 socket_cache_head
= socket_cache_head
->cache_next
;
300 if (socket_cache_head
)
301 socket_cache_head
->cache_prev
= 0;
303 socket_cache_tail
= 0;
305 lck_mtx_unlock(so_cache_mtx
);
/* Preserve so_saved_pcb across the bzero() of the socket structure. */
307 temp
= (*so
)->so_saved_pcb
;
308 bzero((caddr_t
)*so
, sizeof (struct socket
));
310 kprintf("cached_sock_alloc - retreiving cached sock %p - "
311 "count == %d\n", *so
, cached_sock_count
);
313 (*so
)->so_saved_pcb
= temp
;
314 (*so
)->cached_in_sock_layer
= 1;
317 kprintf("Allocating cached sock %p from memory\n", *so
);
/* Fresh allocation: the mutex is released before calling into the zone allocator. */
320 lck_mtx_unlock(so_cache_mtx
);
323 *so
= (struct socket
*)zalloc(so_cache_zone
);
325 *so
= (struct socket
*)zalloc_noblock(so_cache_zone
);
330 bzero((caddr_t
)*so
, sizeof (struct socket
));
333 * Define offsets for extra structures into our single block of
334 * memory. Align extra structures on longword boundaries.
/* offset starts just past the struct socket at the front of the element. */
336 offset
= (u_long
) *so
;
337 offset
+= sizeof (struct socket
);
/*
 * NOTE(review): 0xfffffffc is a 32-bit constant; applied to a u_long that
 * is 64 bits wide it also clears the upper 32 bits of the address.
 * Confirm this code is only built where u_long is 32 bits, or mask with
 * ~(u_long)3 instead.  The same applies to the second mask below.
 */
340 offset
&= 0xfffffffc;
342 (*so
)->so_saved_pcb
= (caddr_t
)offset
;
/* Step over the area whose size get_inpcb_str_size() reports. */
343 offset
+= get_inpcb_str_size();
346 offset
&= 0xfffffffc;
/* The right-hand side of this assignment is not visible in this copy. */
349 ((struct inpcb
*)(*so
)->so_saved_pcb
)->inp_saved_ppcb
=
352 kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
353 *so
, (*so
)->so_saved_pcb
,
354 ((struct inpcb
*)(*so
)->so_saved_pcb
)->inp_saved_ppcb
);
/* Mark the socket so that sodealloc() hands it to cached_sock_free(). */
358 (*so
)->cached_in_sock_layer
= 1;
/*
 * cached_sock_free: give a socket obtained from cached_sock_alloc() back.
 *
 * cached_sock_count is incremented under so_cache_mtx.  If the new count
 * exceeds MAX_CACHED_SOCKETS the socket is returned to so_cache_zone with
 * zfree(); the other visible path pushes it onto the front of the cache
 * list and stamps it with the current so_cache_time.
 *
 * so: socket to release.
 */
362 cached_sock_free(struct socket
*so
)
365 lck_mtx_lock(so_cache_mtx
);
/*
 * NOTE(review): no matching decrement of cached_sock_count is visible on
 * the over-limit path in this copy -- confirm against the full source.
 */
367 if (++cached_sock_count
> MAX_CACHED_SOCKETS
) {
/* The mutex is dropped before the element goes back to the zone. */
369 lck_mtx_unlock(so_cache_mtx
);
371 kprintf("Freeing overflowed cached socket %p\n", so
);
373 zfree(so_cache_zone
, so
);
376 kprintf("Freeing socket %p into cache\n", so
);
/* so_cache_hw records the largest cached_sock_count seen so far. */
378 if (so_cache_hw
< cached_sock_count
)
379 so_cache_hw
= cached_sock_count
;
/* Link so in front of the current head of the cache list. */
381 so
->cache_next
= socket_cache_head
;
383 if (socket_cache_head
)
384 socket_cache_head
->cache_prev
= so
;
386 socket_cache_tail
= so
;
/* The timestamp is what so_cache_timer() compares against SO_CACHE_TIME_LIMIT. */
388 so
->cache_timestamp
= so_cache_time
;
389 socket_cache_head
= so
;
390 lck_mtx_unlock(so_cache_mtx
);
394 kprintf("Freed cached sock %p into cache - count is %d\n",
395 so
, cached_sock_count
);
/*
 * so_cache_timer: periodic trimming of the socket cache list.
 *
 * With so_cache_mtx held, entries are taken from socket_cache_tail and
 * released to so_cache_zone with zfree().  Each entry is first compared
 * against SO_CACHE_TIME_LIMIT using so_cache_time - cache_timestamp, and
 * the number freed in one run is compared against SO_CACHE_MAX_FREE_BATCH;
 * presumably both tests end the loop (the statements that follow them are
 * not visible in this copy -- TODO confirm).  Before returning, the routine
 * schedules itself again SO_CACHE_FLUSH_INTERVAL * hz ticks later.
 *
 * dummy: unused timeout argument.
 */
400 so_cache_timer(__unused
void *dummy
)
402 register struct socket
*p
;
403 register int n_freed
= 0;
405 lck_mtx_lock(so_cache_mtx
);
/* Work backwards from the tail of the list. */
409 while ((p
= socket_cache_tail
)) {
410 if ((so_cache_time
- p
->cache_timestamp
) < SO_CACHE_TIME_LIMIT
)
/* Detach p: its predecessor becomes the new tail. */
415 if ((socket_cache_tail
= p
->cache_prev
))
416 p
->cache_prev
->cache_next
= 0;
/* When the last entry is removed the head pointer is cleared as well. */
417 if (--cached_sock_count
== 0)
418 socket_cache_head
= 0;
420 zfree(so_cache_zone
, p
);
/* so_cache_max_freed counts the runs that reached the batch limit. */
422 if (++n_freed
>= SO_CACHE_MAX_FREE_BATCH
) {
423 so_cache_max_freed
++;
427 lck_mtx_unlock(so_cache_mtx
);
/* Re-arm the timer for the next pass. */
429 timeout(so_cache_timer
, NULL
, (SO_CACHE_FLUSH_INTERVAL
* hz
));
431 #endif /* __APPLE__ */
434 * Get a socket structure from our zone, and initialize it.
435 * We don't implement `waitok' yet (see comments in uipc_domain.c).
436 * Note that it would probably be better to allocate socket
437 * and PCB at the same time, but I'm not convinced that all
438 * the protocols can be easily modified to do this.
/*
 * soalloc: allocate and minimally initialize a struct socket.
 *
 * PF_INET/SOCK_STREAM requests are served by cached_sock_alloc().  The
 * MALLOC_ZONE/bzero lines are presumably the path for every other
 * domain/type combination (the else is not visible in this copy).  The
 * new socket receives the next so_gencnt value and records socket_zone in
 * so_zone; with CONFIG_MACF_SOCKET its MAC label is initialized as well.
 *
 * waitok: forwarded to cached_sock_alloc(); its negation is passed to
 *         mac_socket_label_init().
 * dom:    protocol domain, compared against PF_INET.
 * type:   socket type, compared against SOCK_STREAM.
 */
441 soalloc(int waitok
, int dom
, int type
)
445 if ((dom
== PF_INET
) && (type
== SOCK_STREAM
)) {
446 cached_sock_alloc(&so
, waitok
);
448 MALLOC_ZONE(so
, struct socket
*, sizeof (*so
), socket_zone
,
451 bzero(so
, sizeof (*so
));
453 /* XXX race condition for reentrant kernel: so_gencnt is bumped without atomics */
454 //###LD Atomic add for so_gencnt
456 so
->so_gencnt
= ++so_gencnt
;
/* Remember the zone so that sodealloc() can pass it to FREE_ZONE(). */
457 so
->so_zone
= socket_zone
;
458 #if CONFIG_MACF_SOCKET
459 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
460 if (mac_socket_label_init(so
, !waitok
) != 0) {
464 #endif /* CONFIG_MACF_SOCKET */
476 * <pru_attach>:ENOBUFS[AF_UNIX]
477 * <pru_attach>:ENOBUFS[TCP]
478 * <pru_attach>:ENOMEM[TCP]
479 * <pru_attach>:EISCONN[TCP]
480 * <pru_attach>:??? [other protocol families, IPSEC]
483 socreate(int dom
, struct socket
**aso
, int type
, int proto
)
485 struct proc
*p
= current_proc();
486 register struct protosw
*prp
;
487 register struct socket
*so
;
488 register int error
= 0;
490 extern int tcpconsdebug
;
493 prp
= pffindproto(dom
, proto
, type
);
495 prp
= pffindtype(dom
, type
);
497 if (prp
== 0 || prp
->pr_usrreqs
->pru_attach
== 0) {
498 if (pffinddomain(dom
) == NULL
) {
499 return (EAFNOSUPPORT
);
502 if (pffindprotonotype(dom
, proto
) != NULL
) {
506 return (EPROTONOSUPPORT
);
508 if (prp
->pr_type
!= type
)
510 so
= soalloc(p
!= 0, dom
, type
);
514 TAILQ_INIT(&so
->so_incomp
);
515 TAILQ_INIT(&so
->so_comp
);
519 so
->so_uid
= kauth_cred_getuid(kauth_cred_get());
520 if (!suser(kauth_cred_get(), NULL
))
521 so
->so_state
= SS_PRIV
;
525 so
->so_rcv
.sb_flags
|= SB_RECV
; /* XXX */
526 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
528 so
->next_lock_lr
= 0;
529 so
->next_unlock_lr
= 0;
531 #if CONFIG_MACF_SOCKET
532 mac_socket_label_associate(kauth_cred_get(), so
);
533 #endif /* MAC_SOCKET */
535 //### Attachment will create the per-pcb lock if necessary and increase the refcount
537 * for creation, make sure it's done before
538 * socket is inserted in lists
542 error
= (*prp
->pr_usrreqs
->pru_attach
)(so
, proto
, p
);
546 * If so_pcb is not zero, the socket will be leaked,
547 * so protocol attachment handler must be coded carefully
549 so
->so_state
|= SS_NOFDREF
;
551 sofreelastref(so
, 1); /* will deallocate the socket */
555 prp
->pr_domain
->dom_refs
++;
556 TAILQ_INIT(&so
->so_evlist
);
558 /* Attach socket filters for this protocol */
561 if (tcpconsdebug
== 2)
562 so
->so_options
|= SO_DEBUG
;
571 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
572 * <pru_bind>:EAFNOSUPPORT Address family not supported
573 * <pru_bind>:EADDRNOTAVAIL Address not available.
574 * <pru_bind>:EINVAL Invalid argument
575 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
576 * <pru_bind>:EACCES Permission denied
577 * <pru_bind>:EADDRINUSE Address in use
578 * <pru_bind>:EAGAIN Resource unavailable, try again
579 * <pru_bind>:EPERM Operation not permitted
583 * Notes: It's not possible to fully enumerate the return codes above,
584 * since socket filter authors and protocol family authors may
585 * not choose to limit their error returns to those listed, even
586 * though this may result in some software operating incorrectly.
588 * The error codes which are enumerated above are those known to
589 * be returned by the tcp_usr_bind function supplied.
/*
 * sobind: bind the address nam to the socket so.
 *
 * Visible flow: a socket flagged SOF_DEFUNCT is special-cased first; then
 * every entry on so->so_filt that supplies an sf_bind callback is invoked,
 * the walk ending as soon as error becomes non-zero; then the protocol
 * pru_bind handler is called on behalf of the current process.
 * EJUSTRETURN is tested on the way out -- presumably it is translated to
 * success, but the assignment is not visible in this copy (TODO confirm).
 *
 * so:  socket to bind.
 * nam: address handed to the filters and to pru_bind.
 */
592 sobind(struct socket
*so
, struct sockaddr
*nam
)
594 struct proc
*p
= current_proc();
596 struct socket_filter_entry
*filter
;
602 * If this is a bind request on a previously-accepted socket
603 * that has been marked as inactive, reject it now before
606 if (so
->so_flags
& SOF_DEFUNCT
) {
/* Walk the socket filter chain; the loop condition stops at the first error. */
613 for (filter
= so
->so_filt
; filter
&& (error
== 0);
614 filter
= filter
->sfe_next_onsocket
) {
615 if (filter
->sfe_filter
->sf_filter
.sf_bind
) {
/* socket_unlock(so, 0) is issued before the filter callback is invoked. */
619 socket_unlock(so
, 0);
621 error
= filter
->sfe_filter
->sf_filter
.
622 sf_bind(filter
->sfe_cookie
, so
, nam
);
629 /* End socket filter */
/* Hand the request to the protocol. */
632 error
= (*so
->so_proto
->pr_usrreqs
->pru_bind
)(so
, nam
, p
);
634 socket_unlock(so
, 1);
636 if (error
== EJUSTRETURN
)
/*
 * sodealloc: release the memory of a socket.
 *
 * The global so_gencnt is advanced and stored in the socket, the MAC
 * label is destroyed when CONFIG_MACF_SOCKET is enabled, and the
 * structure is then released according to cached_in_sock_layer:
 *    1  -> handed to cached_sock_free();
 *   -1  -> the socket was already released through FREE_ZONE, so panic;
 *   any other value is set to -1 before FREE_ZONE() is called with the
 *   zone recorded in so_zone.
 *
 * so: socket to release.
 */
643 sodealloc(struct socket
*so
)
645 so
->so_gencnt
= ++so_gencnt
;
647 #if CONFIG_MACF_SOCKET
648 mac_socket_label_destroy(so
);
649 #endif /* CONFIG_MACF_SOCKET */
650 if (so
->cached_in_sock_layer
== 1) {
651 cached_sock_free(so
);
/* -1 is the marker written just below; seeing it here means a second free. */
653 if (so
->cached_in_sock_layer
== -1)
654 panic("sodealloc: double dealloc: so=%p\n", so
);
655 so
->cached_in_sock_layer
= -1;
656 FREE_ZONE(so
, sizeof (*so
), so
->so_zone
);
664 * <pru_listen>:EINVAL[AF_UNIX]
665 * <pru_listen>:EINVAL[TCP]
666 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
667 * <pru_listen>:EINVAL[TCP] Invalid argument
668 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
669 * <pru_listen>:EACCES[TCP] Permission denied
670 * <pru_listen>:EADDRINUSE[TCP] Address in use
671 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
672 * <pru_listen>:EPERM[TCP] Operation not permitted
675 * Notes: Other <pru_listen> returns depend on the protocol family; all
676 * <sf_listen> returns depend on what the filter author causes
677 * their filter to return.
/*
 * solisten: put the socket so into the listening state.
 *
 * Checks visible before any work is done: so_proto must not be NULL, the
 * protocol is tested for PR_CONNREQUIRED, the socket state is tested for
 * SS_ISCONNECTED/SS_ISCONNECTING/SS_ISDISCONNECTING together with the
 * SOF_DEFUNCT flag, and so_restrictions is tested for SO_RESTRICT_DENYIN.
 * The error chosen for each check is not visible in this copy.
 * Afterwards each socket filter that supplies sf_listen is invoked, the
 * protocol pru_listen handler is called, SO_ACCEPTCONN is set when the
 * completed-connection queue is empty, and so_qlimit is set from backlog.
 *
 * so:      socket to listen on.
 * backlog: requested queue length; values <= 0 or above somaxconn are
 *          special-cased (presumably clamped to somaxconn -- the
 *          assignment is not visible here, TODO confirm).
 */
680 solisten(struct socket
*so
, int backlog
)
682 struct proc
*p
= current_proc();
684 struct socket_filter_entry
*filter
;
688 if (so
->so_proto
== NULL
) {
692 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) == 0) {
698 * If the listen request is made on a socket that is not fully
699 * disconnected, or on a previously-accepted socket that has
700 * been marked as inactive, reject the request now.
703 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) ||
704 (so
->so_flags
& SOF_DEFUNCT
)) {
709 if ((so
->so_restrictions
& SO_RESTRICT_DENYIN
) != 0) {
/* Walk the socket filter chain; the loop condition stops at the first error. */
715 for (filter
= so
->so_filt
; filter
&& (error
== 0);
716 filter
= filter
->sfe_next_onsocket
) {
717 if (filter
->sfe_filter
->sf_filter
.sf_listen
) {
/* socket_unlock(so, 0) is issued before the filter callback is invoked. */
721 socket_unlock(so
, 0);
723 error
= filter
->sfe_filter
->sf_filter
.
724 sf_listen(filter
->sfe_cookie
, so
);
/* Hand the request to the protocol. */
733 error
= (*so
->so_proto
->pr_usrreqs
->pru_listen
)(so
, p
);
737 if (error
== EJUSTRETURN
)
/* SO_ACCEPTCONN is only set while so_comp is empty. */
742 if (TAILQ_EMPTY(&so
->so_comp
))
743 so
->so_options
|= SO_ACCEPTCONN
;
745 * POSIX: The implementation may have an upper limit on the length of
746 * the listen queue-either global or per accepting socket. If backlog
747 * exceeds this limit, the length of the listen queue is set to the
750 * If listen() is called with a backlog argument value that is less
751 * than 0, the function behaves as if it had been called with a backlog
752 * argument value of 0.
754 * A backlog argument of 0 may allow the socket to accept connections,
755 * in which case the length of the listen queue may be set to an
756 * implementation-defined minimum value.
758 if (backlog
<= 0 || backlog
> somaxconn
)
761 so
->so_qlimit
= backlog
;
763 socket_unlock(so
, 1);
768 sofreelastref(struct socket
*so
, int dealloc
)
770 struct socket
*head
= so
->so_head
;
772 /* Assume socket is locked */
774 /* Remove any filters - may be called more than once */
777 if ((!(so
->so_flags
& SOF_PCBCLEARING
)) ||
778 ((so
->so_state
& SS_NOFDREF
) == 0)) {
780 selthreadclear(&so
->so_snd
.sb_sel
);
781 selthreadclear(&so
->so_rcv
.sb_sel
);
782 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
783 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
788 socket_lock(head
, 1);
789 if (so
->so_state
& SS_INCOMP
) {
790 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
792 } else if (so
->so_state
& SS_COMP
) {
794 * We must not decommission a socket that's
795 * on the accept(2) queue. If we do, then
796 * accept(2) may hang after select(2) indicated
797 * that the listening socket was ready.
800 selthreadclear(&so
->so_snd
.sb_sel
);
801 selthreadclear(&so
->so_rcv
.sb_sel
);
802 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
803 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
805 socket_unlock(head
, 1);
808 panic("sofree: not queued");
811 so
->so_state
&= ~SS_INCOMP
;
813 socket_unlock(head
, 1);
816 selthreadclear(&so
->so_snd
.sb_sel
);
817 sbrelease(&so
->so_snd
);
821 /* 3932268: disable upcall */
822 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
823 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
/*
 * soclose_wait_locked: wait for an outstanding upcall before closing.
 *
 * The mutex protecting the socket is looked up (through pr_getlock when
 * the protocol provides it, the domain mutex dom_mtx being the other
 * visible source) and asserted to be owned by the caller.  Both
 * SOF_UPCALLINUSE and SOF_UPCALLCLOSEWAIT are then tested; past that
 * test the routine sets SOF_CLOSEWAIT, sleeps on &so->so_upcall with
 * msleep() using that mutex, asserts ownership again on wake-up and
 * clears SOF_CLOSEWAIT.
 *
 * so: socket being closed; its mutex must be held on entry.
 */
830 soclose_wait_locked(struct socket
*so
)
832 lck_mtx_t
*mutex_held
;
/* Find the mutex that guards this socket so it can be passed to msleep(). */
834 if (so
->so_proto
->pr_getlock
!= NULL
)
835 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
837 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
838 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
841 * Double check here and return if there's no outstanding upcall;
842 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
844 if (!(so
->so_flags
& SOF_UPCALLINUSE
) ||
845 !(so
->so_flags
& SOF_UPCALLCLOSEWAIT
))
848 so
->so_flags
|= SOF_CLOSEWAIT
;
849 (void) msleep((caddr_t
)&so
->so_upcall
, mutex_held
, (PZERO
- 1),
850 "soclose_wait_locked", NULL
);
851 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
852 so
->so_flags
&= ~SOF_CLOSEWAIT
;
856 * Close a socket on last file table reference removal.
857 * Initiate disconnect if connected.
858 * Free socket when disconnect complete.
861 soclose_locked(struct socket
*so
)
864 lck_mtx_t
*mutex_held
;
867 if (so
->so_usecount
== 0) {
868 panic("soclose: so=%p refcount=0\n", so
);
871 sflt_notify(so
, sock_evt_closing
, NULL
);
873 if ((so
->so_options
& SO_ACCEPTCONN
)) {
874 struct socket
*sp
, *sonext
;
878 * We do not want new connection to be added
879 * to the connection queues
881 so
->so_options
&= ~SO_ACCEPTCONN
;
883 for (sp
= TAILQ_FIRST(&so
->so_incomp
); sp
!= NULL
; sp
= sonext
) {
884 sonext
= TAILQ_NEXT(sp
, so_list
);
887 * skip sockets thrown away by tcpdropdropblreq
888 * they will get cleaned up by the garbage collection.
889 * otherwise, remove the incomp socket from the queue
890 * and let soabort trigger the appropriate cleanup.
892 if (sp
->so_flags
& SOF_OVERFLOW
)
895 if (so
->so_proto
->pr_getlock
!= NULL
) {
896 /* lock ordering for consistency with the rest of the stack,
897 * we lock the socket first and then grab the head.
899 socket_unlock(so
, 0);
905 TAILQ_REMOVE(&so
->so_incomp
, sp
, so_list
);
908 if (sp
->so_state
& SS_INCOMP
) {
909 sp
->so_state
&= ~SS_INCOMP
;
916 socket_unlock(sp
, 1);
919 while ((sp
= TAILQ_FIRST(&so
->so_comp
)) != NULL
) {
920 /* Dequeue from so_comp since sofree() won't do it */
921 TAILQ_REMOVE(&so
->so_comp
, sp
, so_list
);
924 if (so
->so_proto
->pr_getlock
!= NULL
) {
925 socket_unlock(so
, 0);
929 if (sp
->so_state
& SS_COMP
) {
930 sp
->so_state
&= ~SS_COMP
;
936 if (so
->so_proto
->pr_getlock
!= NULL
) {
937 socket_unlock(sp
, 1);
942 if (so
->so_pcb
== 0) {
943 /* 3915887: mark the socket as ready for dealloc */
944 so
->so_flags
|= SOF_PCBCLEARING
;
947 if (so
->so_state
& SS_ISCONNECTED
) {
948 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
949 error
= sodisconnectlocked(so
);
953 if (so
->so_options
& SO_LINGER
) {
954 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
955 (so
->so_state
& SS_NBIO
))
957 if (so
->so_proto
->pr_getlock
!= NULL
)
958 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
960 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
961 while (so
->so_state
& SS_ISCONNECTED
) {
962 ts
.tv_sec
= (so
->so_linger
/100);
963 ts
.tv_nsec
= (so
->so_linger
% 100) *
964 NSEC_PER_USEC
* 1000 * 10;
965 error
= msleep((caddr_t
)&so
->so_timeo
,
966 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
969 * It's OK when the time fires,
970 * don't report an error
972 if (error
== EWOULDBLOCK
)
980 if (so
->so_usecount
== 0)
981 panic("soclose: usecount is zero so=%p\n", so
);
982 if (so
->so_pcb
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
983 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
987 if (so
->so_usecount
<= 0)
988 panic("soclose: usecount is zero so=%p\n", so
);
990 if (so
->so_pcb
&& so
->so_state
& SS_NOFDREF
)
991 panic("soclose: NOFDREF");
992 so
->so_state
|= SS_NOFDREF
;
994 so
->so_proto
->pr_domain
->dom_refs
--;
/*
 * soclose: close entry point that wraps soclose_locked().
 *
 * If an upcall is in progress (SOF_UPCALLINUSE) soclose_wait_locked() is
 * called first.  A socket whose so_retaincnt is zero is closed through
 * soclose_locked().  The remaining lines belong to the retained case:
 * the use count is required to be at least 2 (panic if it is lower);
 * the code that drops the reference is not visible in this copy.  The
 * socket is unlocked with socket_unlock(so, 1) before returning.
 *
 * so: socket to close.
 */
1003 soclose(struct socket
*so
)
/* Let a pending upcall drain before tearing the socket down. */
1008 if (so
->so_flags
& SOF_UPCALLINUSE
)
1009 soclose_wait_locked(so
);
1011 if (so
->so_retaincnt
== 0) {
1012 error
= soclose_locked(so
);
1015 * if the FD is going away, but socket is
1016 * retained in kernel remove its reference
/* A use count below 2 here is treated as a fatal inconsistency. */
1019 if (so
->so_usecount
< 2)
1020 panic("soclose: retaincnt non null and so=%p "
1021 "usecount=%d\n", so
, so
->so_usecount
);
1023 socket_unlock(so
, 1);
1028 * Must be called at splnet...
1030 /* Should already be locked */
/*
 * soabort: ask the protocol to abort the socket.
 *
 * pru_abort is invoked at most once per socket: SOF_ABORTED is checked on
 * entry and set immediately before the call.  When MORE_LOCKING_DEBUG is
 * defined the socket mutex (from pr_getlock, or the domain dom_mtx) is
 * asserted to be owned by the caller.
 *
 * so: socket to abort.
 */
1032 soabort(struct socket
*so
)
1036 #ifdef MORE_LOCKING_DEBUG
1037 lck_mtx_t
*mutex_held
;
/* Debug builds verify that the caller already holds the socket mutex. */
1039 if (so
->so_proto
->pr_getlock
!= NULL
)
1040 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1042 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1043 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
/* The flag is set before the call so that a second soabort() is a no-op. */
1046 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1047 so
->so_flags
|= SOF_ABORTED
;
1048 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
/*
 * soacceptlock: complete an accept on the socket so.
 *
 * The socket must carry SS_NOFDREF (panic otherwise); the flag is cleared
 * and the protocol pru_accept handler is called to fill in *nam.
 *
 * so:     socket being accepted.
 * nam:    receives the address produced by pru_accept.
 * dolock: presumably controls whether this routine takes and releases
 *         the socket lock itself; the tests on it are not visible in
 *         this copy -- TODO confirm.
 */
1058 soacceptlock(struct socket
*so
, struct sockaddr
**nam
, int dolock
)
/* SS_NOFDREF is expected to be set at this point; it is cleared here. */
1065 if ((so
->so_state
& SS_NOFDREF
) == 0)
1066 panic("soaccept: !NOFDREF");
1067 so
->so_state
&= ~SS_NOFDREF
;
1068 error
= (*so
->so_proto
->pr_usrreqs
->pru_accept
)(so
, nam
);
1071 socket_unlock(so
, 1);
/*
 * soaccept: convenience wrapper that calls soacceptlock() with dolock = 1
 * and returns its result.
 *
 * so:  socket being accepted.
 * nam: receives the address, as in soacceptlock().
 */
1076 soaccept(struct socket
*so
, struct sockaddr
**nam
)
1078 return (soacceptlock(so
, nam
, 1));
1082 soacceptfilter(struct socket
*so
)
1084 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1085 struct socket_filter_entry
*filter
;
1086 int error
= 0, filtered
= 0;
1087 struct socket
*head
= so
->so_head
;
1090 * There's no need to hold the lock; this socket
1091 * has not been made visible to the filter(s).
1093 if ((sock_getaddr(so
, &remote
, 1) != 0) ||
1094 sock_getaddr(so
, &local
, 0) != 0) {
1095 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1098 /* Out of resources; try it again next time */
1099 error
= ECONNABORTED
;
1104 * At this point, we have a reference on the listening socket
1105 * so we know it won't be going away. Do the same for the newly
1106 * accepted socket while we invoke the accept callback routine.
1109 for (filter
= so
->so_filt
; filter
!= NULL
&& error
== 0;
1110 filter
= filter
->sfe_next_onsocket
) {
1111 if (filter
->sfe_filter
->sf_filter
.sf_accept
!= NULL
) {
1115 socket_unlock(so
, 0);
1117 error
= filter
->sfe_filter
->sf_filter
.
1118 sf_accept(filter
->sfe_cookie
,
1119 head
, so
, local
, remote
);
1129 * If we get EJUSTRETURN from one of the filters, mark this socket
1130 * as inactive and return it anyway. This newly accepted socket
1131 * will be disconnected later before we hand it off to the caller.
1133 if (error
== EJUSTRETURN
) {
1135 so
->so_flags
|= SOF_DEFUNCT
;
1136 /* Prevent data from being appended to the socket buffers */
1137 so
->so_snd
.sb_flags
|= SB_DROP
;
1138 so
->so_rcv
.sb_flags
|= SB_DROP
;
1143 * This may seem like a duplication to the above error
1144 * handling part when we return ECONNABORTED, except
1145 * the following is done while holding the lock since
1146 * the socket has been exposed to the filter(s) earlier.
1148 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1150 socket_unlock(so
, 1);
1152 /* Propagate socket filter's error code to the caller */
1154 socket_unlock(so
, 1);
1157 /* Callee checks for NULL pointer */
1158 sock_freeaddr(remote
);
1159 sock_freeaddr(local
);
1164 * Returns: 0 Success
1165 * EOPNOTSUPP Operation not supported on socket
1166 * EISCONN Socket is connected
1167 * <pru_connect>:EADDRNOTAVAIL Address not available.
1168 * <pru_connect>:EINVAL Invalid argument
1169 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1170 * <pru_connect>:EACCES Permission denied
1171 * <pru_connect>:EADDRINUSE Address in use
1172 * <pru_connect>:EAGAIN Resource unavailable, try again
1173 * <pru_connect>:EPERM Operation not permitted
1174 * <sf_connect_out>:??? [anything a filter writer might set]
/*
 * soconnectlock: initiate a connection from so to the address nam.
 *
 * Visible flow:
 *  - a socket with SO_ACCEPTCONN or SOF_DEFUNCT set is rejected with
 *    EOPNOTSUPP;
 *  - so_restrictions is tested for SO_RESTRICT_DENYOUT;
 *  - when the socket is already connected or connecting, either the
 *    protocol has PR_CONNREQUIRED set or sodisconnectlocked() is called,
 *    and a non-zero result from either operand enters the error branch;
 *  - each socket filter that supplies sf_connect_out is invoked, the walk
 *    ending as soon as error becomes non-zero;
 *  - the protocol pru_connect handler is called on behalf of the current
 *    process.
 *
 * so:     socket to connect.
 * nam:    destination address.
 * dolock: presumably controls whether this routine takes and releases
 *         the socket lock itself; the tests on it are not visible in
 *         this copy -- TODO confirm.
 */
1177 soconnectlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
1180 struct proc
*p
= current_proc();
1186 * If this is a listening socket or if this is a previously-accepted
1187 * socket that has been marked as inactive, reject the connect request.
1189 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1191 socket_unlock(so
, 1);
1192 return (EOPNOTSUPP
);
/* Sockets restricted from outbound traffic are handled separately. */
1195 if ((so
->so_restrictions
& SO_RESTRICT_DENYOUT
) != 0) {
1197 socket_unlock(so
, 1);
1202 * If protocol is connection-based, can only connect once.
1203 * Otherwise, if connected, try to disconnect first.
1204 * This allows user to disconnect by connecting to, e.g.,
/* sodisconnectlocked() is only reached when PR_CONNREQUIRED is clear. */
1207 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
1208 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1209 (error
= sodisconnectlocked(so
)))) {
1213 * Run connect filter before calling protocol:
1214 * - non-blocking connect returns before completion;
1216 struct socket_filter_entry
*filter
;
/* Walk the socket filter chain; the loop condition stops at the first error. */
1220 for (filter
= so
->so_filt
; filter
&& (error
== 0);
1221 filter
= filter
->sfe_next_onsocket
) {
1222 if (filter
->sfe_filter
->sf_filter
.sf_connect_out
) {
/* The unlock sits under the filtered == 0 test, i.e. before the first callback. */
1223 if (filtered
== 0) {
1226 socket_unlock(so
, 0);
1228 error
= filter
->sfe_filter
->sf_filter
.
1229 sf_connect_out(filter
->sfe_cookie
, so
, nam
);
1232 if (filtered
!= 0) {
1238 if (error
== EJUSTRETURN
)
1241 socket_unlock(so
, 1);
/* Hand the request to the protocol. */
1245 error
= (*so
->so_proto
->pr_usrreqs
->pru_connect
)(so
, nam
, p
);
1248 socket_unlock(so
, 1);
/*
 * soconnect: convenience wrapper that calls soconnectlock() with
 * dolock = 1 and returns its result.
 *
 * so:  socket to connect.
 * nam: destination address.
 */
1253 soconnect(struct socket
*so
, struct sockaddr
*nam
)
1255 return (soconnectlock(so
, nam
, 1));
1259 * Returns: 0 Success
1260 * <pru_connect2>:EINVAL[AF_UNIX]
1261 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1262 * <pru_connect2>:??? [other protocol families]
1264 * Notes: <pru_connect2> is not supported by [TCP].
/*
 * soconnect2: connect two sockets to each other through the pru_connect2
 * handler of the protocol of so1.
 *
 * so1 is always locked around the call.  so2 is locked and unlocked only
 * when its protocol provides a pr_lock routine.
 *
 * so1: first socket; its protocol supplies pru_connect2.
 * so2: second socket.
 */
1267 soconnect2(struct socket
*so1
, struct socket
*so2
)
1271 socket_lock(so1
, 1);
/* so2 is locked separately only when its protocol has a pr_lock routine. */
1272 if (so2
->so_proto
->pr_lock
)
1273 socket_lock(so2
, 1);
1275 error
= (*so1
->so_proto
->pr_usrreqs
->pru_connect2
)(so1
, so2
);
/* Release in the same order the locks were taken: so1, then so2. */
1277 socket_unlock(so1
, 1);
1278 if (so2
->so_proto
->pr_lock
)
1279 socket_unlock(so2
, 1);
/*
 * sodisconnectlocked: start a disconnect on the socket so.
 *
 * Two states are special-cased before the protocol is involved: a socket
 * without SS_ISCONNECTED and a socket that already has
 * SS_ISDISCONNECTING set (the error chosen for each is not visible in
 * this copy).  Past those tests the protocol pru_disconnect handler is
 * called and the socket filters are sent sock_evt_disconnected via
 * sflt_notify(); whether the notification depends on the result of
 * pru_disconnect is not visible here.
 *
 * so: socket to disconnect.  No locking is performed in the visible code;
 *     sodisconnect() is the variant that unlocks the socket afterwards.
 */
1284 sodisconnectlocked(struct socket
*so
)
1288 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1292 if (so
->so_state
& SS_ISDISCONNECTING
) {
/* Hand the request to the protocol, then tell the socket filters. */
1297 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1300 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1306 /* Locking version */
/*
 * sodisconnect: run sodisconnectlocked() on so and then release the
 * socket with socket_unlock(so, 1).  The matching lock acquisition is
 * not visible in this copy -- presumably it precedes the call.
 *
 * so: socket to disconnect.
 */
1308 sodisconnect(struct socket
*so
)
1313 error
= sodisconnectlocked(so
);
1314 socket_unlock(so
, 1);
1318 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
1321 * sosendcheck will lock the socket buffer if it isn't locked and
1322 * verify that there is space for the data being inserted.
1324 * Returns: 0 Success
1326 * sblock:EWOULDBLOCK
1333 sosendcheck(struct socket
*so
, struct sockaddr
*addr
, long resid
, long clen
,
1334 long atomic
, int flags
, int *sblocked
)
1341 if (*sblocked
== 0) {
1342 if ((so
->so_snd
.sb_flags
& SB_LOCK
) != 0 &&
1343 so
->so_send_filt_thread
!= 0 &&
1344 so
->so_send_filt_thread
== current_thread()) {
1346 * We're being called recursively from a filter,
1347 * allow this to continue. Radar 4150520.
1348 * Don't set sblocked because we don't want
1349 * to perform an unlock later.
1353 error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
));
1362 * If a send attempt is made on a previously-accepted socket
1363 * that has been marked as inactive (disconnected), reject
1366 if (so
->so_flags
& SOF_DEFUNCT
)
1369 if (so
->so_state
& SS_CANTSENDMORE
)
1373 error
= so
->so_error
;
1378 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1379 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) != 0) {
1380 if ((so
->so_state
& SS_ISCONFIRMING
) == 0 &&
1381 !(resid
== 0 && clen
!= 0))
1383 } else if (addr
== 0 && !(flags
&MSG_HOLD
)) {
1384 return ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ?
1385 ENOTCONN
: EDESTADDRREQ
);
1388 space
= sbspace(&so
->so_snd
);
1389 if (flags
& MSG_OOB
)
1391 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
1392 clen
> so
->so_snd
.sb_hiwat
)
1394 if (space
< resid
+ clen
&&
1395 (atomic
|| space
< (long)so
->so_snd
.sb_lowat
|| space
< clen
)) {
1396 if ((so
->so_state
& SS_NBIO
) || (flags
& MSG_NBIO
) ||
1398 return (EWOULDBLOCK
);
1400 sbunlock(&so
->so_snd
, 1);
1401 error
= sbwait(&so
->so_snd
);
1413 * If send must go all at once and message is larger than
1414 * send buffering, then hard error.
1415 * Lock against other senders.
1416 * If must go all at once and not enough room now, then
1417 * inform user that this would block and do nothing.
1418 * Otherwise, if nonblocking, send as much as possible.
1419 * The data to be sent is described by "uio" if nonzero,
1420 * otherwise by the mbuf chain "top" (which must be null
1421 * if uio is not). Data provided in mbuf chain must be small
1422 * enough to send all at once.
1424 * Returns nonzero on error, timeout or signal; callers
1425 * must check for short counts if EINTR/ERESTART are returned.
1426 * Data and control buffers are freed on return.
1428 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1429 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1430 * point at the mbuf chain being constructed and go from there.
1432 * Returns: 0 Success
1438 * sosendcheck:EWOULDBLOCK
1442 * sosendcheck:??? [value from so_error]
1443 * <pru_send>:ECONNRESET[TCP]
1444 * <pru_send>:EINVAL[TCP]
1445 * <pru_send>:ENOBUFS[TCP]
1446 * <pru_send>:EADDRINUSE[TCP]
1447 * <pru_send>:EADDRNOTAVAIL[TCP]
1448 * <pru_send>:EAFNOSUPPORT[TCP]
1449 * <pru_send>:EACCES[TCP]
1450 * <pru_send>:EAGAIN[TCP]
1451 * <pru_send>:EPERM[TCP]
1452 * <pru_send>:EMSGSIZE[TCP]
1453 * <pru_send>:EHOSTUNREACH[TCP]
1454 * <pru_send>:ENETUNREACH[TCP]
1455 * <pru_send>:ENETDOWN[TCP]
1456 * <pru_send>:ENOMEM[TCP]
1457 * <pru_send>:ENOBUFS[TCP]
1458 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1459 * <pru_send>:EINVAL[AF_UNIX]
1460 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1461 * <pru_send>:EPIPE[AF_UNIX]
1462 * <pru_send>:ENOTCONN[AF_UNIX]
1463 * <pru_send>:EISCONN[AF_UNIX]
1464 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1465 * <sf_data_out>:??? [whatever a filter author chooses]
1467 * Notes: Other <pru_send> returns depend on the protocol family; all
1468 * <sf_data_out> returns depend on what the filter author causes
1469 * their filter to return.
1472 sosend(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
1473 struct mbuf
*top
, struct mbuf
*control
, int flags
)
1476 register struct mbuf
*m
, *freelist
= NULL
;
1477 register long space
, len
, resid
;
1478 int clen
= 0, error
, dontroute
, mlen
, sendflags
;
1479 int atomic
= sosendallatonce(so
) || top
;
1481 struct proc
*p
= current_proc();
1484 // LP64todo - fix this!
1485 resid
= uio_resid(uio
);
1487 resid
= top
->m_pkthdr
.len
;
1489 KERNEL_DEBUG((DBG_FNC_SOSEND
| DBG_FUNC_START
), so
, resid
,
1490 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
1493 if (so
->so_type
!= SOCK_STREAM
&& (flags
& MSG_OOB
) != 0) {
1495 socket_unlock(so
, 1);
1500 * In theory resid should be unsigned.
1501 * However, space must be signed, as it might be less than 0
1502 * if we over-committed, and we must use a signed comparison
1503 * of space and resid. On the other hand, a negative resid
1504 * causes us to loop sending 0-length segments to the protocol.
1506 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
1507 * type sockets since that's an error.
1509 if (resid
< 0 || (so
->so_type
== SOCK_STREAM
&& (flags
& MSG_EOR
))) {
1511 socket_unlock(so
, 1);
1516 (flags
& MSG_DONTROUTE
) && (so
->so_options
& SO_DONTROUTE
) == 0 &&
1517 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
1519 OSIncrementAtomic(&p
->p_stats
->p_ru
.ru_msgsnd
);
1521 clen
= control
->m_len
;
1524 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
1530 space
= sbspace(&so
->so_snd
) - clen
+ ((flags
& MSG_OOB
) ?
1534 struct socket_filter_entry
*filter
;
1536 boolean_t recursive
;
1540 * Data is prepackaged in "top".
1543 if (flags
& MSG_EOR
)
1544 top
->m_flags
|= M_EOR
;
1550 bytes_to_copy
= min(resid
, space
);
1552 if (sosendminchain
> 0) {
1555 chainlength
= sosendmaxchain
;
1559 * Attempt to use larger than system page-size
1560 * clusters for large writes only if there is
1561 * a jumbo cluster pool and if the socket is
1562 * marked accordingly.
1564 jumbocl
= sosendjcl
&& njcl
> 0 &&
1565 ((so
->so_flags
& SOF_MULTIPAGES
) ||
1566 sosendjcl_ignore_capab
);
1568 socket_unlock(so
, 0);
1572 int hdrs_needed
= (top
== 0) ? 1 : 0;
1575 * try to maintain a local cache of mbuf
1576 * clusters needed to complete this
1577 * write; the list is further limited to
1578 * the number that are currently needed
1579 * to fill the socket. this mechanism
1580 * allows a large number of mbufs/
1581 * clusters to be grabbed under a single
1582 * mbuf lock... if we can't get any
1583 * clusters, then fall back to trying
1584 * for mbufs; if we fail early (or
1585 * miscalculate the number needed) make
1586 * sure to release any clusters we
1587 * haven't yet consumed.
1589 if (freelist
== NULL
&&
1590 bytes_to_copy
> NBPG
&& jumbocl
) {
1592 bytes_to_copy
/ M16KCLBYTES
;
1594 if ((bytes_to_copy
-
1595 (num_needed
* M16KCLBYTES
))
1600 m_getpackets_internal(
1601 (unsigned int *)&num_needed
,
1602 hdrs_needed
, M_WAIT
, 0,
1605 * Fall back to 4K cluster size
1606 * if allocation failed
1610 if (freelist
== NULL
&&
1611 bytes_to_copy
> MCLBYTES
) {
1613 bytes_to_copy
/ NBPG
;
1615 if ((bytes_to_copy
-
1616 (num_needed
* NBPG
)) >=
1621 m_getpackets_internal(
1622 (unsigned int *)&num_needed
,
1623 hdrs_needed
, M_WAIT
, 0,
1626 * Fall back to cluster size
1627 * if allocation failed
1631 if (freelist
== NULL
&&
1632 bytes_to_copy
> MINCLSIZE
) {
1634 bytes_to_copy
/ MCLBYTES
;
1636 if ((bytes_to_copy
-
1637 (num_needed
* MCLBYTES
)) >=
1642 m_getpackets_internal(
1643 (unsigned int *)&num_needed
,
1644 hdrs_needed
, M_WAIT
, 0,
1647 * Fall back to a single mbuf
1648 * if allocation failed
1652 if (freelist
== NULL
) {
1660 if (freelist
== NULL
) {
1666 * For datagram protocols,
1667 * leave room for protocol
1668 * headers in first mbuf.
1670 if (atomic
&& top
== 0 &&
1671 bytes_to_copy
< MHLEN
) {
1677 freelist
= m
->m_next
;
1680 if ((m
->m_flags
& M_EXT
))
1681 mlen
= m
->m_ext
.ext_size
;
1682 else if ((m
->m_flags
& M_PKTHDR
))
1684 MHLEN
- m_leadingspace(m
);
1687 len
= min(mlen
, bytes_to_copy
);
1693 error
= uiomove(mtod(m
, caddr_t
),
1696 // LP64todo - fix this!
1697 resid
= uio_resid(uio
);
1701 top
->m_pkthdr
.len
+= len
;
1706 if (flags
& MSG_EOR
)
1707 top
->m_flags
|= M_EOR
;
1710 bytes_to_copy
= min(resid
, space
);
1712 } while (space
> 0 &&
1713 (chainlength
< sosendmaxchain
|| atomic
||
1714 resid
< MINCLSIZE
));
1722 if (flags
& (MSG_HOLD
|MSG_SEND
)) {
1723 /* Enqueue for later, go away if HOLD */
1724 register struct mbuf
*mb1
;
1725 if (so
->so_temp
&& (flags
& MSG_FLUSH
)) {
1726 m_freem(so
->so_temp
);
1730 so
->so_tail
->m_next
= top
;
1737 if (flags
& MSG_HOLD
) {
1744 so
->so_options
|= SO_DONTROUTE
;
1746 /* Compute flags here, for pru_send and NKEs */
1747 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
1749 * If the user set MSG_EOF, the protocol
1750 * understands this flag and nothing left to
1751 * send then use PRU_SEND_EOF instead of PRU_SEND.
1753 ((flags
& MSG_EOF
) &&
1754 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
1757 /* If there is more to send set PRUS_MORETOCOME */
1758 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
1761 * Socket filter processing
1763 recursive
= (so
->so_send_filt_thread
!= NULL
);
1766 for (filter
= so
->so_filt
; filter
&& (error
== 0);
1767 filter
= filter
->sfe_next_onsocket
) {
1768 if (filter
->sfe_filter
->sf_filter
.sf_data_out
) {
1770 if (filtered
== 0) {
1772 so
->so_send_filt_thread
=
1775 socket_unlock(so
, 0);
1777 (sendflags
& MSG_OOB
) ?
1778 sock_data_filt_flag_oob
: 0;
1780 error
= filter
->sfe_filter
->sf_filter
.
1781 sf_data_out(filter
->sfe_cookie
, so
,
1782 addr
, &top
, &control
, so_flags
);
1788 * At this point, we've run at least one
1789 * filter. The socket is unlocked as is
1790 * the socket buffer. Clear the recorded
1791 * filter thread only when we are outside
1792 * of a filter's context. This allows for
1793 * a filter to issue multiple inject calls
1794 * from its sf_data_out callback routine.
1799 so
->so_send_filt_thread
= 0;
1801 if (error
== EJUSTRETURN
) {
1812 * End Socket filter processing
1815 if (error
== EJUSTRETURN
) {
1816 /* A socket filter handled this data */
1819 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
1820 (so
, sendflags
, top
, addr
, control
, p
);
1823 if (flags
& MSG_SEND
)
1827 so
->so_options
&= ~SO_DONTROUTE
;
1835 } while (resid
&& space
> 0);
1840 sbunlock(&so
->so_snd
, 0); /* will unlock socket */
1842 socket_unlock(so
, 1);
1849 m_freem_list(freelist
);
1851 KERNEL_DEBUG(DBG_FNC_SOSEND
| DBG_FUNC_END
, so
, resid
, so
->so_snd
.sb_cc
,
1858 * Implement receive operations on a socket.
1859 * We depend on the way that records are added to the sockbuf
1860 * by sbappend*. In particular, each record (mbufs linked through m_next)
1861 * must begin with an address if the protocol so specifies,
1862 * followed by an optional mbuf or mbufs containing ancillary data,
1863 * and then zero or more mbufs of data.
1864 * In order to avoid blocking network interrupts for the entire time here,
1865 * we splx() while doing the actual copy to user space.
1866 * Although the sockbuf is locked, new data may still be appended,
1867 * and thus we must maintain consistency of the sockbuf during that time.
1869 * The caller may receive the data as a single mbuf chain by supplying
1870 * an mbuf **mp0 for use in returning the chain. The uio is then used
1871 * only for the count in uio_resid.
1873 * Returns: 0 Success
1878 * sblock:EWOULDBLOCK
1882 * sodelayed_copy:EFAULT
1883 * <pru_rcvoob>:EINVAL[TCP]
1884 * <pru_rcvoob>:EWOULDBLOCK[TCP]
1886 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
1887 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
1888 * <pr_domain->dom_externalize>:???
1890 * Notes: Additional return values from calls through <pru_rcvoob> and
1891 * <pr_domain->dom_externalize> depend on protocols other than
1892 * TCP or AF_UNIX, which are documented above.
1895 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
1896 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1898 register struct mbuf
*m
, **mp
, *ml
= NULL
;
1899 register int flags
, len
, error
, offset
;
1900 struct protosw
*pr
= so
->so_proto
;
1901 struct mbuf
*nextrecord
;
1903 // LP64todo - fix this!
1904 int orig_resid
= uio_resid(uio
);
1905 struct mbuf
*free_list
;
1906 int delayed_copy_len
;
1909 struct proc
*p
= current_proc();
1911 // LP64todo - fix this!
1912 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
, uio_resid(uio
),
1913 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
1917 #ifdef MORE_LOCKING_DEBUG
1918 if (so
->so_usecount
== 1)
1919 panic("soreceive: so=%x no other reference on socket\n", so
);
1927 flags
= *flagsp
&~ MSG_EOR
;
1932 * If a recv attempt is made on a previously-accepted socket
1933 * that has been marked as inactive (disconnected), reject
1936 if (so
->so_flags
& SOF_DEFUNCT
) {
1937 struct sockbuf
*sb
= &so
->so_rcv
;
1940 * This socket should have been disconnected and flushed
1941 * prior to being returned from accept; there should be
1942 * no data on its receive list, so panic otherwise.
1944 sb_empty_assert(sb
, __func__
);
1945 socket_unlock(so
, 1);
1950 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1951 * regardless of the flags argument. Here is the case where
1952 * out-of-band data is not inline.
1954 if ((flags
& MSG_OOB
) ||
1955 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
1956 (so
->so_options
& SO_OOBINLINE
) == 0 &&
1957 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
1958 m
= m_get(M_WAIT
, MT_DATA
);
1960 socket_unlock(so
, 1);
1961 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
1962 ENOBUFS
, 0, 0, 0, 0);
1965 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
1968 socket_unlock(so
, 0);
1970 // LP64todo - fix this!
1971 error
= uiomove(mtod(m
, caddr_t
),
1972 (int)min(uio_resid(uio
), m
->m_len
), uio
);
1974 } while (uio_resid(uio
) && error
== 0 && m
);
1980 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
1981 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
1983 * Let's try to get normal data:
1984 * EWOULDBLOCK: out-of-band data not
1985 * received yet. EINVAL: out-of-band data
1990 } else if (error
== 0 && flagsp
) {
1994 socket_unlock(so
, 1);
1995 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2002 *mp
= (struct mbuf
*)0;
2003 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
))
2004 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
2007 free_list
= (struct mbuf
*)0;
2008 delayed_copy_len
= 0;
2010 #ifdef MORE_LOCKING_DEBUG
2011 if (so
->so_usecount
<= 1)
2012 printf("soreceive: sblock so=%p ref=%d on socket\n",
2013 so
, so
->so_usecount
);
2016 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2017 * and if so just return to the caller. This could happen when
2018 * soreceive() is called by a socket upcall function during the
2019 * time the socket is freed. The socket buffer would have been
2020 * locked across the upcall, therefore we cannot put this thread
2021 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2022 * we may livelock), because the lock on the socket buffer will
2023 * only be released when the upcall routine returns to its caller.
2024 * Because the socket has been officially closed, there can be
2025 * no further read on it.
2027 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
2028 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
2029 socket_unlock(so
, 1);
2033 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
2035 socket_unlock(so
, 1);
2036 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2041 m
= so
->so_rcv
.sb_mb
;
2043 * If we have less data than requested, block awaiting more
2044 * (subject to any timeout) if:
2045 * 1. the current count is less than the low water mark, or
2046 * 2. MSG_WAITALL is set, and it is possible to do the entire
2047 * receive operation at once if we block (resid <= hiwat).
2048 * 3. MSG_DONTWAIT is not set
2049 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2050 * we have to do the receive in sections, and thus risk returning
2051 * a short count if a timeout or signal occurs after we start.
2053 if (m
== 0 || (((flags
& MSG_DONTWAIT
) == 0 &&
2054 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
2055 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
2056 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
2057 m
->m_nextpkt
== 0 && (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
2059 * Panic if we notice inconsistencies in the socket's
2060 * receive list; both sb_mb and sb_cc should correctly
2061 * reflect the contents of the list, otherwise we may
2062 * end up with false positives during select() or poll()
2063 * which could put the application in a bad state.
2065 if (m
== NULL
&& so
->so_rcv
.sb_cc
!= 0)
2066 panic("soreceive corrupted so_rcv: m %p cc %lu",
2067 m
, so
->so_rcv
.sb_cc
);
2072 error
= so
->so_error
;
2073 if ((flags
& MSG_PEEK
) == 0)
2077 if (so
->so_state
& SS_CANTRCVMORE
) {
2083 for (; m
; m
= m
->m_next
)
2084 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
2085 m
= so
->so_rcv
.sb_mb
;
2088 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
2089 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
2093 if (uio_resid(uio
) == 0)
2095 if ((so
->so_state
& SS_NBIO
) ||
2096 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
2097 error
= EWOULDBLOCK
;
2100 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
2101 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
2102 sbunlock(&so
->so_rcv
, 1);
2103 #if EVEN_MORE_LOCKING_DEBUG
2105 printf("Waiting for socket data\n");
2108 error
= sbwait(&so
->so_rcv
);
2109 #if EVEN_MORE_LOCKING_DEBUG
2111 printf("SORECEIVE - sbwait returned %d\n", error
);
2113 if (so
->so_usecount
< 1)
2114 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2115 "socket\n", so
, so
->so_usecount
);
2117 socket_unlock(so
, 1);
2118 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2127 uio
->uio_procp
->p_stats
->p_ru
.ru_msgrcv
++;
2128 #else /* __APPLE__ */
2131 * This should be uio->uio_procp; however, some callers of this
2132 * function use auto variables with stack garbage, and fail to
2133 * fill out the uio structure properly.
2136 OSIncrementAtomic(&p
->p_stats
->p_ru
.ru_msgrcv
);
2137 #endif /* __APPLE__ */
2138 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
2139 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
2140 nextrecord
= m
->m_nextpkt
;
2141 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
2142 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2143 #if CONFIG_MACF_SOCKET_SUBSET
2145 * Call the MAC framework for policy checking if we're in
2146 * the user process context and the socket isn't connected.
2148 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2149 struct mbuf
*m0
= m
;
2151 * Dequeue this record (temporarily) from the receive
2152 * list since we're about to drop the socket's lock
2153 * where a new record may arrive and be appended to
2154 * the list. Upon MAC policy failure, the record
2155 * will be freed. Otherwise, we'll add it back to
2156 * the head of the list. We cannot rely on SB_LOCK
2157 * because append operation uses the socket's lock.
2160 m
->m_nextpkt
= NULL
;
2161 sbfree(&so
->so_rcv
, m
);
2163 } while (m
!= NULL
);
2165 so
->so_rcv
.sb_mb
= nextrecord
;
2166 SB_EMPTY_FIXUP(&so
->so_rcv
);
2167 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2168 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2169 socket_unlock(so
, 0);
2170 if (mac_socket_check_received(proc_ucred(p
), so
,
2171 mtod(m
, struct sockaddr
*)) != 0) {
2173 * MAC policy failure; free this record and
2174 * process the next record (or block until
2175 * one is available). We have adjusted sb_cc
2176 * and sb_mbcnt above so there is no need to
2177 * call sbfree() again.
2181 } while (m
!= NULL
);
2183 * Clear SB_LOCK but don't unlock the socket.
2184 * Process the next record or wait for one.
2187 sbunlock(&so
->so_rcv
, 1);
2192 * Re-adjust the socket receive list and re-enqueue
2193 * the record in front of any packets which may have
2194 * been appended while we dropped the lock.
2196 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2197 sballoc(&so
->so_rcv
, m
);
2198 sballoc(&so
->so_rcv
, m
);
2199 if (so
->so_rcv
.sb_mb
== NULL
) {
2200 so
->so_rcv
.sb_lastrecord
= m0
;
2201 so
->so_rcv
.sb_mbtail
= m
;
2204 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2205 so
->so_rcv
.sb_mb
= m
;
2206 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2207 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2209 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2212 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*),
2214 if ((*psa
== 0) && (flags
& MSG_NEEDSA
)) {
2215 error
= EWOULDBLOCK
;
2219 if (flags
& MSG_PEEK
) {
2222 sbfree(&so
->so_rcv
, m
);
2223 if (m
->m_next
== 0 && so
->so_rcv
.sb_cc
!= 0)
2224 panic("soreceive: about to create invalid "
2226 MFREE(m
, so
->so_rcv
.sb_mb
);
2227 m
= so
->so_rcv
.sb_mb
;
2229 m
->m_nextpkt
= nextrecord
;
2231 so
->so_rcv
.sb_mb
= nextrecord
;
2232 SB_EMPTY_FIXUP(&so
->so_rcv
);
2238 * Process one or more MT_CONTROL mbufs present before any data mbufs
2239 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2240 * just copy the data; if !MSG_PEEK, we call into the protocol to
2241 * perform externalization.
2243 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
2244 struct mbuf
*cm
= NULL
, *cmn
;
2245 struct mbuf
**cme
= &cm
;
2246 struct sockbuf
*sb_rcv
= &so
->so_rcv
;
2249 * Externalizing the control messages would require us to
2250 * drop the socket's lock below. Once we re-acquire the
2251 * lock, the mbuf chain might change. In order to preserve
2252 * consistency, we unlink all control messages from the
2253 * first mbuf chain in one shot and link them separately
2254 * onto a different chain.
2257 if (flags
& MSG_PEEK
) {
2258 if (controlp
!= NULL
) {
2259 *controlp
= m_copy(m
, 0, m
->m_len
);
2260 controlp
= &(*controlp
)->m_next
;
2264 m
->m_nextpkt
= NULL
;
2266 sb_rcv
->sb_mb
= m
->m_next
;
2269 cme
= &(*cme
)->m_next
;
2272 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2274 if (!(flags
& MSG_PEEK
)) {
2275 if (sb_rcv
->sb_mb
!= NULL
) {
2276 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2278 sb_rcv
->sb_mb
= nextrecord
;
2279 SB_EMPTY_FIXUP(sb_rcv
);
2281 if (nextrecord
== NULL
)
2282 sb_rcv
->sb_lastrecord
= m
;
2285 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2286 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2288 while (cm
!= NULL
) {
2293 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2296 * Call the protocol to externalize SCM_RIGHTS message
2297 * and return the modified message to the caller upon
2298 * success. Otherwise, all other control messages are
2299 * returned unmodified to the caller. Note that we
2300 * only get into this loop if MSG_PEEK is not set.
2302 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2303 cmsg_type
== SCM_RIGHTS
) {
2305 * Release socket lock: see 3903171. This
2306 * would also allow more records to be appended
2307 * to the socket buffer. We still have SB_LOCK
2308 * set on it, so we can be sure that the head
2309 * of the mbuf chain won't change.
2311 socket_unlock(so
, 0);
2312 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2318 if (controlp
!= NULL
&& error
== 0) {
2320 controlp
= &(*controlp
)->m_next
;
2328 if (sb_rcv
->sb_mb
!= NULL
)
2329 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2335 if (!(flags
& MSG_PEEK
)) {
2337 * We get here because m points to an mbuf following
2338 * any MT_SONAME or MT_CONTROL mbufs which have been
2339 * processed above. In any case, m should be pointing
2340 * to the head of the mbuf chain, and the nextrecord
2341 * should be either NULL or equal to m->m_nextpkt.
2342 * See comments above about SB_LOCK.
2344 if (m
!= so
->so_rcv
.sb_mb
|| m
->m_nextpkt
!= nextrecord
)
2345 panic("soreceive: post-control !sync so=%p "
2346 "m=%p nextrecord=%p\n", so
, m
, nextrecord
);
2348 if (nextrecord
== NULL
)
2349 so
->so_rcv
.sb_lastrecord
= m
;
2352 if (type
== MT_OOBDATA
)
2355 if (!(flags
& MSG_PEEK
)) {
2356 so
->so_rcv
.sb_mb
= nextrecord
;
2357 SB_EMPTY_FIXUP(&so
->so_rcv
);
2360 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
2361 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
2366 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
2373 while (m
&& (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
2374 if (m
->m_type
== MT_OOBDATA
) {
2375 if (type
!= MT_OOBDATA
)
2377 } else if (type
== MT_OOBDATA
) {
2381 * Make sure to always set MSG_OOB event when getting
2382 * out of band data inline.
2384 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
2385 (so
->so_options
& SO_OOBINLINE
) != 0 &&
2386 (so
->so_state
& SS_RCVATMARK
) != 0) {
2389 so
->so_state
&= ~SS_RCVATMARK
;
2390 // LP64todo - fix this!
2391 len
= uio_resid(uio
) - delayed_copy_len
;
2392 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
2393 len
= so
->so_oobmark
- offset
;
2394 if (len
> m
->m_len
- moff
)
2395 len
= m
->m_len
- moff
;
2397 * If mp is set, just pass back the mbufs.
2398 * Otherwise copy them out via the uio, then free.
2399 * Sockbuf must be consistent here (points to current mbuf,
2400 * it points to next record) when we drop priority;
2401 * we must note any additions to the sockbuf when we
2402 * block interrupts again.
2405 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
2406 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
2407 if (can_delay
&& len
== m
->m_len
) {
2409 * only delay the copy if we're consuming the
2410 * mbuf and we're NOT in MSG_PEEK mode
2411 * and we have enough data to make it worthwhile
2412 * to drop and retake the lock... can_delay
2413 * reflects the state of the 2 latter
2414 * constraints; moff should always be zero
2417 delayed_copy_len
+= len
;
2419 if (delayed_copy_len
) {
2420 error
= sodelayed_copy(so
, uio
,
2421 &free_list
, &delayed_copy_len
);
2427 * can only get here if MSG_PEEK is not
2428 * set; therefore, m should point at the
2429 * head of the rcv queue; if it doesn't,
2430 * it means something drastically
2431 * changed while we were out from behind
2432 * the lock in sodelayed_copy. perhaps
2433 * a RST on the stream. in any event,
2434 * the stream has been interrupted. it's
2435 * probably best just to return whatever
2436 * data we've moved and let the caller
2439 if (m
!= so
->so_rcv
.sb_mb
) {
2443 socket_unlock(so
, 0);
2444 error
= uiomove(mtod(m
, caddr_t
) + moff
,
2452 uio_setresid(uio
, (uio_resid(uio
) - len
));
2454 if (len
== m
->m_len
- moff
) {
2455 if (m
->m_flags
& M_EOR
)
2457 if (flags
& MSG_PEEK
) {
2461 nextrecord
= m
->m_nextpkt
;
2462 sbfree(&so
->so_rcv
, m
);
2463 m
->m_nextpkt
= NULL
;
2468 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
2469 *mp
= (struct mbuf
*)0;
2471 if (free_list
== NULL
)
2476 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
2480 m
->m_nextpkt
= nextrecord
;
2481 if (nextrecord
== NULL
)
2482 so
->so_rcv
.sb_lastrecord
= m
;
2484 so
->so_rcv
.sb_mb
= nextrecord
;
2485 SB_EMPTY_FIXUP(&so
->so_rcv
);
2487 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
2488 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
2491 if (flags
& MSG_PEEK
) {
2495 *mp
= m_copym(m
, 0, len
, M_WAIT
);
2498 so
->so_rcv
.sb_cc
-= len
;
2501 if (so
->so_oobmark
) {
2502 if ((flags
& MSG_PEEK
) == 0) {
2503 so
->so_oobmark
-= len
;
2504 if (so
->so_oobmark
== 0) {
2505 so
->so_state
|= SS_RCVATMARK
;
2507 * delay posting the actual event until
2508 * after any delayed copy processing
2516 if (offset
== so
->so_oobmark
)
2520 if (flags
& MSG_EOR
)
2523 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2524 * (for non-atomic socket), we must not quit until
2525 * "uio->uio_resid == 0" or an error termination.
2526 * If a signal/timeout occurs, return with a short
2527 * count but without error. Keep sockbuf locked
2528 * against other readers.
2530 while (flags
& (MSG_WAITALL
|MSG_WAITSTREAM
) && m
== 0 &&
2531 (uio_resid(uio
) - delayed_copy_len
) > 0 &&
2532 !sosendallatonce(so
) && !nextrecord
) {
2533 if (so
->so_error
|| so
->so_state
& SS_CANTRCVMORE
)
2537 * Depending on the protocol (e.g. TCP), the following
2538 * might cause the socket lock to be dropped and later
2539 * be reacquired, and more data could have arrived and
2540 * have been appended to the receive socket buffer by
2541 * the time it returns. Therefore, we sleep in
2542 * sbwait() below if and only if the socket buffer is
2543 * empty, in order to avoid a false sleep.
2545 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
&&
2546 (((struct inpcb
*)so
->so_pcb
)->inp_state
!=
2548 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
2550 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
2551 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
2553 if (so
->so_rcv
.sb_mb
== NULL
&& sbwait(&so
->so_rcv
)) {
2558 * have to wait until after we get back from the sbwait
2559 * to do the copy because we will drop the lock if we
2560 * have enough data that has been delayed... by dropping
2561 * the lock we open up a window allowing the netisr
2562 * thread to process the incoming packets and to change
2563 * the state of this socket... we're issuing the sbwait
2564 * because the socket is empty and we're expecting the
2565 * netisr thread to wake us up when more packets arrive;
2566 * if we allow that processing to happen and then sbwait
2567 * we could stall forever with packets sitting in the
2568 * socket if no further packets arrive from the remote
2571 * we want to copy before we've collected all the data
2572 * to satisfy this request to allow the copy to overlap
2573 * the incoming packet processing on an MP system
2575 if (delayed_copy_len
> sorecvmincopy
&&
2576 (delayed_copy_len
> (so
->so_rcv
.sb_hiwat
/ 2))) {
2577 error
= sodelayed_copy(so
, uio
,
2578 &free_list
, &delayed_copy_len
);
2583 m
= so
->so_rcv
.sb_mb
;
2585 nextrecord
= m
->m_nextpkt
;
2589 #ifdef MORE_LOCKING_DEBUG
2590 if (so
->so_usecount
<= 1)
2591 panic("soreceive: after big while so=%p ref=%d on socket\n",
2592 so
, so
->so_usecount
);
2595 if (m
&& pr
->pr_flags
& PR_ATOMIC
) {
2597 if (so
->so_options
& SO_DONTTRUNC
) {
2598 flags
|= MSG_RCVMORE
;
2602 if ((flags
& MSG_PEEK
) == 0)
2603 (void) sbdroprecord(&so
->so_rcv
);
2610 * pru_rcvd below (for TCP) may cause more data to be received
2611 * if the socket lock is dropped prior to sending the ACK; some
2612 * legacy OpenTransport applications don't handle this well
2613 * (if it receives less data than requested while MSG_HAVEMORE
2614 * is set), and so we set the flag now based on what we know
2615 * prior to calling pru_rcvd.
2617 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
2618 flags
|= MSG_HAVEMORE
;
2620 if ((flags
& MSG_PEEK
) == 0) {
2622 so
->so_rcv
.sb_mb
= nextrecord
;
2624 * First part is an inline SB_EMPTY_FIXUP(). Second
2625 * part makes sure sb_lastrecord is up-to-date if
2626 * there is still data in the socket buffer.
2628 if (so
->so_rcv
.sb_mb
== NULL
) {
2629 so
->so_rcv
.sb_mbtail
= NULL
;
2630 so
->so_rcv
.sb_lastrecord
= NULL
;
2631 } else if (nextrecord
->m_nextpkt
== NULL
) {
2632 so
->so_rcv
.sb_lastrecord
= nextrecord
;
2635 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
2636 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
2637 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
2638 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
2641 if (delayed_copy_len
) {
2642 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
2648 m_freem_list((struct mbuf
*)free_list
);
2649 free_list
= (struct mbuf
*)0;
2652 postevent(so
, 0, EV_OOB
);
2654 if (orig_resid
== uio_resid(uio
) && orig_resid
&&
2655 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
2656 sbunlock(&so
->so_rcv
, 1);
2663 #ifdef MORE_LOCKING_DEBUG
2664 if (so
->so_usecount
<= 1)
2665 panic("soreceive: release so=%p ref=%d on socket\n",
2666 so
, so
->so_usecount
);
2668 if (delayed_copy_len
) {
2669 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
2672 m_freem_list((struct mbuf
*)free_list
);
2674 sbunlock(&so
->so_rcv
, 0); /* will unlock socket */
2676 // LP64todo - fix this!
2677 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, so
, uio_resid(uio
),
2678 so
->so_rcv
.sb_cc
, 0, error
);
2684 * Returns: 0 Success
2688 sodelayed_copy(struct socket
*so
, struct uio
*uio
, struct mbuf
**free_list
,
2696 socket_unlock(so
, 0);
2698 while (m
&& error
== 0) {
2700 error
= uiomove(mtod(m
, caddr_t
), (int)m
->m_len
, uio
);
2704 m_freem_list(*free_list
);
2706 *free_list
= (struct mbuf
*)NULL
;
2716 * Returns: 0 Success
2719 * <pru_shutdown>:EINVAL
2720 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
2721 * <pru_shutdown>:ENOBUFS[TCP]
2722 * <pru_shutdown>:EMSGSIZE[TCP]
2723 * <pru_shutdown>:EHOSTUNREACH[TCP]
2724 * <pru_shutdown>:ENETUNREACH[TCP]
2725 * <pru_shutdown>:ENETDOWN[TCP]
2726 * <pru_shutdown>:ENOMEM[TCP]
2727 * <pru_shutdown>:EACCES[TCP]
2728 * <pru_shutdown>:EMSGSIZE[TCP]
2729 * <pru_shutdown>:ENOBUFS[TCP]
2730 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2731 * <pru_shutdown>:??? [other protocol families]
/*
 * soshutdown: entry point taking the socket and the shutdown direction
 * (how).  The visible fragments test a value against the mask
 * SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING, hand the real
 * work to soshutdownlock(so, how), and finish with socket_unlock(so, 1).
 * NOTE(review): the expression being masked, the branch taken when no
 * connection bit is set, and the matching lock call are not visible in
 * this listing.
 */
2734 soshutdown(struct socket
*so
, int how
)
2744 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) == 0) {
2747 error
= soshutdownlock(so
, how
);
2749 socket_unlock(so
, 1);
/*
 * soshutdownlock: performs the shutdown for the direction given in how.
 * Socket filters are notified first through sflt_notify() with
 * sock_evt_shutdown and a pointer to how.
 *   - how != SHUT_WR: read side; SS_CANTRCVMORE is tested to detect a
 *     read side that is already shut down, and EV_RCLOSED is posted.
 *   - how != SHUT_RD: write side; SS_CANTSENDMORE is tested likewise,
 *     the protocol pru_shutdown handler is called and its result stored
 *     in error, and EV_WCLOSED is posted.
 * NOTE(review): the statements executed in the two already-shut-down
 * branches and the return path are not visible in this listing.
 */
2760 soshutdownlock(struct socket
*so
, int how
)
2762 struct protosw
*pr
= so
->so_proto
;
2765 sflt_notify(so
, sock_evt_shutdown
, &how
);
2767 if (how
!= SHUT_WR
) {
2768 if ((so
->so_state
& SS_CANTRCVMORE
) != 0) {
2769 /* read already shut down */
2774 postevent(so
, 0, EV_RCLOSED
);
2776 if (how
!= SHUT_RD
) {
2777 if ((so
->so_state
& SS_CANTSENDMORE
) != 0) {
2778 /* write already shut down */
2782 error
= (*pr
->pr_usrreqs
->pru_shutdown
)(so
);
2783 postevent(so
, 0, EV_WCLOSED
);
2786 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN
| DBG_FUNC_END
, 0, 0, 0, 0, 0);
/*
 * sorflush: flushes the receive buffer (so->so_rcv) of a socket.
 * Visible steps: under MORE_LOCKING_DEBUG assert that the socket mutex
 * (from pr_getlock, or else the domain mutex) is owned; notify filters
 * with sock_evt_flush_read; set SB_NOINTR and take the sockbuf lock with
 * sblock(sb, M_WAIT); clear select state; zero the sockbuf in place and
 * restore the pieces that must survive; finally hand the old mbuf chain
 * (asb.sb_mb) to the domain dom_dispose hook when the protocol has
 * PR_RIGHTS set and the hook is non-NULL.
 * NOTE(review): asb is only read in the visible fragments; where it is
 * filled in is not visible in this listing.
 */
2791 sorflush(struct socket
*so
)
2793 register struct sockbuf
*sb
= &so
->so_rcv
;
2794 register struct protosw
*pr
= so
->so_proto
;
2797 #ifdef MORE_LOCKING_DEBUG
2798 lck_mtx_t
*mutex_held
;
2800 if (so
->so_proto
->pr_getlock
!= NULL
)
2801 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
2803 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
2804 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
2807 sflt_notify(so
, sock_evt_flush_read
, NULL
);
2809 sb
->sb_flags
|= SB_NOINTR
;
/* The sblock() result is explicitly discarded by the (void) cast. */
2810 (void) sblock(sb
, M_WAIT
);
2814 selthreadclear(&sb
->sb_sel
);
/*
 * Wipe the live sockbuf, then restore the socket back-pointer and carry
 * over only the knote list and the SB_KNOTE, SB_DROP and SB_UNIX flags
 * from asb.
 */
2817 bzero((caddr_t
)sb
, sizeof (*sb
));
2818 sb
->sb_so
= so
; /* reestablish link to socket */
2819 if (asb
.sb_flags
& SB_KNOTE
) {
2820 sb
->sb_sel
.si_note
= asb
.sb_sel
.si_note
;
2821 sb
->sb_flags
= SB_KNOTE
;
2823 if (asb
.sb_flags
& SB_DROP
)
2824 sb
->sb_flags
|= SB_DROP
;
2825 if (asb
.sb_flags
& SB_UNIX
)
2826 sb
->sb_flags
|= SB_UNIX
;
2827 if ((pr
->pr_flags
& PR_RIGHTS
) && pr
->pr_domain
->dom_dispose
) {
/* unp records whether the dispose hook is unp_dispose. */
2828 boolean_t unp
= (pr
->pr_domain
->dom_dispose
== unp_dispose
);
2830 * Currently AF_UNIX domain uses a global domain mutex;
2831 * unp_dispose() may end up calling soclose() on another
2832 * AF_UNIX socket and therefore the lock must not be held
2836 socket_unlock(so
, 0);
2837 (*pr
->pr_domain
->dom_dispose
)(asb
.sb_mb
);
2845 * Perhaps this routine, and sooptcopyout(), below, ought to come in
2846 * an additional variant to handle the case where the option value needs
2847 * to be some kind of integer, but not a specific size.
2848 * In addition to their use here, these functions are also called by the
2849 * protocol-level pr_ctloutput() routines.
2851 * Returns: 0 Success
/*
 * sooptcopyin: copies a socket option value described by sopt into buf.
 *   sopt   - option descriptor; sopt_valsize is read and may be updated
 *   buf    - destination buffer
 *   len    - value assigned to both valsize and sopt->sopt_valsize by the
 *            statement at original line 2869
 *   minlen - smallest acceptable sopt_valsize
 * The data is fetched with copyin() when sopt->sopt_p is non-zero and
 * with bcopy() from the CAST_DOWN pointer otherwise.
 * NOTE(review): the condition guarding the assignment at original line
 * 2869 and the final return are not visible in this listing.
 */
2856 sooptcopyin(struct sockopt
*sopt
, void *buf
, size_t len
, size_t minlen
)
2861 * If the user gives us more than we wanted, we ignore it,
2862 * but if we don't get the minimum length the caller
2863 * wants, we return EINVAL. On success, sopt->sopt_valsize
2864 * is set to however much we actually retrieved.
2866 if ((valsize
= sopt
->sopt_valsize
) < minlen
)
2869 sopt
->sopt_valsize
= valsize
= len
;
2871 if (sopt
->sopt_p
!= 0)
2872 return (copyin(sopt
->sopt_val
, buf
, valsize
));
2874 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
), buf
, valsize
);
2879 * sooptcopyin_timeval
2880 * Copy in a timeval value into tv_p, and take into account whether the
2881 * calling process is 64-bit or 32-bit. Moved the sanity checking
2882 * code here so that we can verify the 64-bit tv_sec value before we lose
2883 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
/*
 * sooptcopyin_timeval: reads a timeval option value into *tv_p.
 * When proc_is64bit(sopt->sopt_p) is true a struct timeval64 is copied
 * in with copyin(), range-checked, and its fields are then assigned to
 * *tv_p.  Otherwise a struct timeval is read directly into *tv_p, with
 * copyin() when sopt->sopt_p is non-zero and bcopy() otherwise, and the
 * same range check is applied to *tv_p.
 * In both paths sopt_valsize is first compared against the structure
 * size and then set to that size.
 * NOTE(review): the statements inside the failing size and range checks
 * are not visible in this listing.
 */
2886 sooptcopyin_timeval(struct sockopt
*sopt
, struct timeval
* tv_p
)
2890 if (proc_is64bit(sopt
->sopt_p
)) {
2891 struct timeval64 tv64
;
2893 if (sopt
->sopt_valsize
< sizeof(tv64
)) {
2896 sopt
->sopt_valsize
= sizeof(tv64
);
2897 error
= copyin(sopt
->sopt_val
, &tv64
, sizeof(tv64
));
/* Validate the 64-bit fields before they are narrowed into *tv_p. */
2901 if (tv64
.tv_sec
< 0 || tv64
.tv_sec
> LONG_MAX
2902 || tv64
.tv_usec
< 0 || tv64
.tv_usec
>= 1000000) {
2905 tv_p
->tv_sec
= tv64
.tv_sec
;
2906 tv_p
->tv_usec
= tv64
.tv_usec
;
2908 if (sopt
->sopt_valsize
< sizeof(*tv_p
)) {
2911 sopt
->sopt_valsize
= sizeof(*tv_p
);
2912 if (sopt
->sopt_p
!= 0) {
2913 error
= copyin(sopt
->sopt_val
, tv_p
, sizeof(*tv_p
));
2918 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
), tv_p
,
/*
 * NOTE(review): if tv_p->tv_sec has type long (its declaration is not
 * visible here) the comparison against LONG_MAX below can never be
 * true; confirm whether it is intentional symmetry with the 64-bit
 * path.
 */
2921 if (tv_p
->tv_sec
< 0 || tv_p
->tv_sec
> LONG_MAX
2922 || tv_p
->tv_usec
< 0 || tv_p
->tv_usec
>= 1000000) {
2930 * Returns: 0 Success
2935 * sooptcopyin:EINVAL
2936 * sooptcopyin:EFAULT
2937 * sooptcopyin_timeval:EINVAL
2938 * sooptcopyin_timeval:EFAULT
2939 * sooptcopyin_timeval:EDOM
2940 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
2941 * <pr_ctloutput>:???
2942 * sflt_attach_private:??? [whatever a filter author chooses]
2943 * <sf_setoption>:??? [whatever a filter author chooses]
2945 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
2946 * <sf_setoption> returns depend on what the filter author causes
2947 * their filter to return.
/*
 * sosetopt: applies the socket option described by sopt to so.
 * Visible flow:
 *   1. Test whether both SS_CANTRCVMORE and SS_CANTSENDMORE are set
 *      (socket fully shut down).
 *   2. Force sopt->sopt_dir to SOPT_SET when it is anything else.
 *   3. Offer the request to every attached socket filter that provides
 *      sf_setoption, stopping once error becomes non-zero; the socket is
 *      unlocked with socket_unlock(so, 0) while filtered is still 0.
 *   4. For levels other than SOL_SOCKET, forward to the protocol
 *      pr_ctloutput handler when present; ENOPROTOOPT is assigned on
 *      the fallback path.
 *   5. For SOL_SOCKET, switch on sopt->sopt_name and update the socket.
 * NOTE(review): many case labels, goto targets, the re-lock after the
 * filter loop and the return statements are not visible in this
 * listing; comments below describe only the visible fragments.
 */
2950 sosetopt(struct socket
*so
, struct sockopt
*sopt
)
2955 struct socket_filter_entry
*filter
;
2957 #if CONFIG_MACF_SOCKET
2959 #endif /* MAC_SOCKET */
2962 if ((so
->so_state
& (SS_CANTRCVMORE
| SS_CANTSENDMORE
))
2963 == (SS_CANTRCVMORE
| SS_CANTSENDMORE
)) {
2964 /* the socket has been shutdown, no more sockopt's */
2969 if (sopt
->sopt_dir
!= SOPT_SET
) {
2970 sopt
->sopt_dir
= SOPT_SET
;
/* Walk the filter list attached to the socket. */
2974 for (filter
= so
->so_filt
; filter
&& (error
== 0);
2975 filter
= filter
->sfe_next_onsocket
) {
2976 if (filter
->sfe_filter
->sf_filter
.sf_setoption
) {
2977 if (filtered
== 0) {
2980 socket_unlock(so
, 0);
2982 error
= filter
->sfe_filter
->sf_filter
.
2983 sf_setoption(filter
->sfe_cookie
, so
, sopt
);
2987 if (filtered
!= 0) {
2992 if (error
== EJUSTRETURN
)
/* Options at a level other than SOL_SOCKET belong to the protocol. */
2999 if (sopt
->sopt_level
!= SOL_SOCKET
) {
3000 if (so
->so_proto
&& so
->so_proto
->pr_ctloutput
) {
3001 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
3002 socket_unlock(so
, 1);
3005 error
= ENOPROTOOPT
;
3007 switch (sopt
->sopt_name
) {
3010 error
= sooptcopyin(sopt
, &l
, sizeof (l
), sizeof (l
));
/*
 * l_linger is stored unchanged when the option name is SO_LINGER and
 * multiplied by hz for the other option name handled by this case.
 */
3014 so
->so_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
3015 l
.l_linger
: l
.l_linger
* hz
;
3017 so
->so_options
|= SO_LINGER
;
3019 so
->so_options
&= ~SO_LINGER
;
3025 case SO_USELOOPBACK
:
3034 case SO_WANTOOBFLAG
:
3036 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
/* The option name doubles as the bit that is set or cleared in so_options. */
3041 so
->so_options
|= sopt
->sopt_name
;
3043 so
->so_options
&= ~sopt
->sopt_name
;
3050 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3056 * Values < 1 make no sense for any of these
3057 * options, so disallow them.
3064 switch (sopt
->sopt_name
) {
/* Buffer size: sbreserve() on the send or receive buffer; 0 means failure here. */
3067 if (sbreserve(sopt
->sopt_name
== SO_SNDBUF
?
3068 &so
->so_snd
: &so
->so_rcv
,
3069 (u_long
) optval
) == 0) {
3073 if (sopt
->sopt_name
== SO_SNDBUF
)
3074 so
->so_snd
.sb_flags
|= SB_USRSIZE
;
3076 so
->so_rcv
.sb_flags
|= SB_USRSIZE
;
3080 * Make sure the low-water is never greater than
/* Low-water marks are clamped to the high-water mark of the same buffer. */
3084 so
->so_snd
.sb_lowat
=
3085 (optval
> so
->so_snd
.sb_hiwat
) ?
3086 so
->so_snd
.sb_hiwat
: optval
;
3089 so
->so_rcv
.sb_lowat
=
3090 (optval
> so
->so_rcv
.sb_hiwat
) ?
3091 so
->so_rcv
.sb_hiwat
: optval
;
3098 error
= sooptcopyin_timeval(sopt
, &tv
);
3102 switch (sopt
->sopt_name
) {
3104 so
->so_snd
.sb_timeo
= tv
;
3107 so
->so_rcv
.sb_timeo
= tv
;
3116 error
= sooptcopyin(sopt
, &nke
, sizeof (nke
),
3121 error
= sflt_attach_private(so
, NULL
,
3127 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3132 so
->so_flags
|= SOF_NOSIGPIPE
;
3134 so
->so_flags
&= ~SOF_NOSIGPIPE
;
3139 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3144 so
->so_flags
|= SOF_NOADDRAVAIL
;
3146 so
->so_flags
&= ~SOF_NOADDRAVAIL
;
3150 case SO_REUSESHAREUID
:
3151 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3156 so
->so_flags
|= SOF_REUSESHAREUID
;
3158 so
->so_flags
&= ~SOF_REUSESHAREUID
;
3160 #ifdef __APPLE_API_PRIVATE
3161 case SO_NOTIFYCONFLICT
:
/* Gated on the result of kauth_cred_issuser() for the current credential. */
3162 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3166 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3171 so
->so_flags
|= SOF_NOTIFYCONFLICT
;
3173 so
->so_flags
&= ~SOF_NOTIFYCONFLICT
;
3176 case SO_RESTRICTIONS
:
3177 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3181 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
/* Only the three SO_RESTRICT_DENY* bits are kept from the supplied value. */
3185 so
->so_restrictions
= (optval
& (SO_RESTRICT_DENYIN
|
3186 SO_RESTRICT_DENYOUT
| SO_RESTRICT_DENYSET
));
3190 #if CONFIG_MACF_SOCKET
3191 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
3192 sizeof (extmac
))) != 0)
3195 error
= mac_setsockopt_label(proc_ucred(sopt
->sopt_p
),
3199 #endif /* MAC_SOCKET */
3202 #ifdef __APPLE_API_PRIVATE
3203 case SO_UPCALLCLOSEWAIT
:
3204 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
3209 so
->so_flags
|= SOF_UPCALLCLOSEWAIT
;
3211 so
->so_flags
&= ~SOF_UPCALLCLOSEWAIT
;
3216 error
= ENOPROTOOPT
;
/*
 * After a successful SOL_SOCKET update the protocol is also shown the
 * option; its return value is explicitly discarded by the (void) cast.
 */
3219 if (error
== 0 && so
->so_proto
&& so
->so_proto
->pr_ctloutput
) {
3220 (void) ((*so
->so_proto
->pr_ctloutput
)(so
, sopt
));
3224 socket_unlock(so
, 1);
3228 /* Helper routines for getsockopt */
/*
 * sooptcopyout: copies up to len bytes from buf to the option buffer
 * described by sopt.  The amount copied is min(len, sopt->sopt_valsize)
 * and that amount is written back to sopt->sopt_valsize.  Nothing is
 * copied when sopt->sopt_val is USER_ADDR_NULL.  copyout() is used when
 * sopt->sopt_p is non-zero, bcopy() to the CAST_DOWN pointer otherwise.
 * NOTE(review): the return statement is not visible in this listing.
 */
3230 sooptcopyout(struct sockopt
*sopt
, void *buf
, size_t len
)
3238 * Documented get behavior is that we always return a value,
3239 * possibly truncated to fit in the user's buffer.
3240 * Traditional behavior is that we always tell the user
3241 * precisely how much we copied, rather than something useful
3242 * like the total amount we had available for her.
3243 * Note that this interface is not idempotent; the entire answer must
3244 * generated ahead of time.
3246 valsize
= min(len
, sopt
->sopt_valsize
);
3247 sopt
->sopt_valsize
= valsize
;
3248 if (sopt
->sopt_val
!= USER_ADDR_NULL
) {
3249 if (sopt
->sopt_p
!= 0)
3250 error
= copyout(buf
, sopt
->sopt_val
, valsize
);
3252 bcopy(buf
, CAST_DOWN(caddr_t
, sopt
->sopt_val
), valsize
);
/*
 * sooptcopyout_timeval: copies the timeval at tv_p out to the option
 * buffer described by sopt.  For a 64-bit process (proc_is64bit on
 * sopt->sopt_p) the fields are first widened into a struct timeval64 and
 * len is the size of that structure; otherwise len is the size of
 * struct timeval.  The copy length is min(len, sopt->sopt_valsize),
 * which is also written back to sopt->sopt_valsize.  Nothing is copied
 * when sopt->sopt_val is USER_ADDR_NULL.
 * NOTE(review): the assignments to val (the source pointer used by the
 * copy) and the return statement are not visible in this listing.
 */
3258 sooptcopyout_timeval(struct sockopt
*sopt
, const struct timeval
* tv_p
)
3262 struct timeval64 tv64
;
3267 if (proc_is64bit(sopt
->sopt_p
)) {
3268 len
= sizeof(struct timeval64
);
3269 tv64
.tv_sec
= tv_p
->tv_sec
;
3270 tv64
.tv_usec
= tv_p
->tv_usec
;
3273 len
= sizeof(struct timeval
);
3276 valsize
= min(len
, sopt
->sopt_valsize
);
3277 sopt
->sopt_valsize
= valsize
;
3278 if (sopt
->sopt_val
!= USER_ADDR_NULL
) {
3279 if (sopt
->sopt_p
!= 0)
3280 error
= copyout(val
, sopt
->sopt_val
, valsize
);
3282 bcopy(val
, CAST_DOWN(caddr_t
, sopt
->sopt_val
), valsize
);
3290 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3291 * <pr_ctloutput>:???
3292 * <sf_getoption>:???
/*
 * sogetopt: retrieves the socket option described by sopt from so.
 * Visible flow:
 *   1. Force sopt->sopt_dir to SOPT_GET when it is anything else.
 *   2. Offer the request to every attached socket filter that provides
 *      sf_getoption, stopping once error becomes non-zero; the socket is
 *      unlocked with socket_unlock(so, 0) while filtered is still 0.
 *   3. For levels other than SOL_SOCKET, forward to the protocol
 *      pr_ctloutput handler when present, otherwise unlock and return
 *      ENOPROTOOPT.
 *   4. For SOL_SOCKET, switch on sopt->sopt_name, load the answer into
 *      l, optval, tv or extmac and copy it out.
 * NOTE(review): many case labels, goto targets, the re-lock after the
 * filter loop and most return statements are not visible in this
 * listing; comments below describe only the visible fragments.
 */
3295 sogetopt(struct socket
*so
, struct sockopt
*sopt
)
3300 struct socket_filter_entry
*filter
;
3302 #if CONFIG_MACF_SOCKET
3304 #endif /* MAC_SOCKET */
3306 if (sopt
->sopt_dir
!= SOPT_GET
) {
3307 sopt
->sopt_dir
= SOPT_GET
;
/* Walk the filter list attached to the socket. */
3313 for (filter
= so
->so_filt
; filter
&& (error
== 0);
3314 filter
= filter
->sfe_next_onsocket
) {
3315 if (filter
->sfe_filter
->sf_filter
.sf_getoption
) {
3316 if (filtered
== 0) {
3319 socket_unlock(so
, 0);
3321 error
= filter
->sfe_filter
->sf_filter
.
3322 sf_getoption(filter
->sfe_cookie
, so
, sopt
);
3325 if (filtered
!= 0) {
3330 if (error
== EJUSTRETURN
)
3332 socket_unlock(so
, 1);
/* Options at a level other than SOL_SOCKET belong to the protocol. */
3338 if (sopt
->sopt_level
!= SOL_SOCKET
) {
3339 if (so
->so_proto
&& so
->so_proto
->pr_ctloutput
) {
3340 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
3341 socket_unlock(so
, 1);
3344 socket_unlock(so
, 1);
3345 return (ENOPROTOOPT
);
3348 switch (sopt
->sopt_name
) {
/*
 * Linger: l_onoff is the SO_LINGER bit of so_options; l_linger is
 * so_linger as stored for SO_LINGER and so_linger divided by hz for the
 * other option name handled by this case.
 */
3351 l
.l_onoff
= so
->so_options
& SO_LINGER
;
3352 l
.l_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
3353 so
->so_linger
: so
->so_linger
/ hz
;
3354 error
= sooptcopyout(sopt
, &l
, sizeof (l
));
3357 case SO_USELOOPBACK
:
3369 case SO_WANTOOBFLAG
:
/* The option name doubles as the bit tested in so_options. */
3371 optval
= so
->so_options
& sopt
->sopt_name
;
3373 error
= sooptcopyout(sopt
, &optval
, sizeof (optval
));
3377 optval
= so
->so_type
;
/*
 * For PR_ATOMIC protocols the m_len of mbufs of type MT_DATA, MT_HEADER
 * or MT_OOBDATA is accumulated into pkt_total, starting from
 * so_rcv.sb_mb; the other visible path reports sb_cc minus sb_ctl.
 */
3382 if (so
->so_proto
->pr_flags
& PR_ATOMIC
) {
3387 m1
= so
->so_rcv
.sb_mb
;
3389 if (m1
->m_type
== MT_DATA
|| m1
->m_type
== MT_HEADER
||
3390 m1
->m_type
== MT_OOBDATA
)
3391 pkt_total
+= m1
->m_len
;
3396 optval
= so
->so_rcv
.sb_cc
- so
->so_rcv
.sb_ctl
;
3401 optval
= so
->so_snd
.sb_cc
;
3405 optval
= so
->so_error
;
3410 optval
= so
->so_snd
.sb_hiwat
;
3414 optval
= so
->so_rcv
.sb_hiwat
;
3418 optval
= so
->so_snd
.sb_lowat
;
3422 optval
= so
->so_rcv
.sb_lowat
;
3427 tv
= (sopt
->sopt_name
== SO_SNDTIMEO
?
3428 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
3430 error
= sooptcopyout_timeval(sopt
, &tv
);
3434 optval
= (so
->so_flags
& SOF_NOSIGPIPE
);
3438 optval
= (so
->so_flags
& SOF_NOADDRAVAIL
);
3441 case SO_REUSESHAREUID
:
3442 optval
= (so
->so_flags
& SOF_REUSESHAREUID
);
3445 #ifdef __APPLE_API_PRIVATE
3446 case SO_NOTIFYCONFLICT
:
3447 optval
= (so
->so_flags
& SOF_NOTIFYCONFLICT
);
3450 case SO_RESTRICTIONS
:
3451 optval
= so
->so_restrictions
& (SO_RESTRICT_DENYIN
|
3452 SO_RESTRICT_DENYOUT
| SO_RESTRICT_DENYSET
);
/* MAC label queries copy extmac in from the caller, fill it, and copy it back out. */
3456 #if CONFIG_MACF_SOCKET
3457 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
3458 sizeof (extmac
))) != 0 ||
3459 (error
= mac_socket_label_get(proc_ucred(
3460 sopt
->sopt_p
), so
, &extmac
)) != 0)
3463 error
= sooptcopyout(sopt
, &extmac
, sizeof (extmac
));
3466 #endif /* MAC_SOCKET */
3470 #if CONFIG_MACF_SOCKET
3471 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
3472 sizeof (extmac
))) != 0 ||
3473 (error
= mac_socketpeer_label_get(proc_ucred(
3474 sopt
->sopt_p
), so
, &extmac
)) != 0)
3477 error
= sooptcopyout(sopt
, &extmac
, sizeof (extmac
));
3480 #endif /* MAC_SOCKET */
3483 #ifdef __APPLE_API_PRIVATE
3484 case SO_UPCALLCLOSEWAIT
:
3485 optval
= (so
->so_flags
& SOF_UPCALLCLOSEWAIT
);
3490 error
= ENOPROTOOPT
;
3493 socket_unlock(so
, 1);
3498 /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
/*
 * soopt_getm: allocates mbufs sized to hold sopt->sopt_valsize bytes.
 * The requested size is first compared against MAX_SOOPTGETM_SIZE.
 * Each mbuf is obtained with MGET (M_WAIT when sopt->sopt_p is set,
 * M_DONTWAIT otherwise); when the remaining size exceeds MLEN a cluster
 * is requested with MCLGET and M_EXT is checked to detect failure.
 * m_len is set to min(MCLBYTES, remaining) or min(MLEN, remaining) and
 * subtracted from the remaining size.  The same allocation pattern
 * appears twice in the visible fragments.
 * NOTE(review): the loop construct, the linking through m_prev, the
 * store through mp and all error returns are not visible in this
 * listing.
 */
3500 soopt_getm(struct sockopt
*sopt
, struct mbuf
**mp
)
3502 struct mbuf
*m
, *m_prev
;
3503 int sopt_size
= sopt
->sopt_valsize
;
3505 if (sopt_size
> MAX_SOOPTGETM_SIZE
)
3508 MGET(m
, sopt
->sopt_p
? M_WAIT
: M_DONTWAIT
, MT_DATA
);
3511 if (sopt_size
> MLEN
) {
3512 MCLGET(m
, sopt
->sopt_p
? M_WAIT
: M_DONTWAIT
);
3513 if ((m
->m_flags
& M_EXT
) == 0) {
3517 m
->m_len
= min(MCLBYTES
, sopt_size
);
3519 m
->m_len
= min(MLEN
, sopt_size
);
3521 sopt_size
-= m
->m_len
;
3526 MGET(m
, sopt
->sopt_p
? M_WAIT
: M_DONTWAIT
, MT_DATA
);
3531 if (sopt_size
> MLEN
) {
3532 MCLGET(m
, sopt
->sopt_p
? M_WAIT
: M_DONTWAIT
);
3533 if ((m
->m_flags
& M_EXT
) == 0) {
3537 m
->m_len
= min(MCLBYTES
, sopt_size
);
3539 m
->m_len
= min(MLEN
, sopt_size
);
3541 sopt_size
-= m
->m_len
;
3548 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
/*
 * soopt_mcopyin: fills the mbuf chain starting at m with option data
 * from sopt.  A USER_ADDR_NULL sopt_val is tested up front.  While an
 * mbuf remains and sopt_valsize is at least m_len, m_len bytes are
 * copied into the mbuf with copyin() (sopt_p non-NULL) or bcopy(), then
 * sopt_valsize is reduced and sopt_val advanced by m_len.  If an mbuf
 * is still left when the loop ends the function panics.
 * NOTE(review): the advance of m, the error handling after copyin()
 * (m0 keeps the chain head) and the return statements are not visible
 * in this listing.
 */
3550 soopt_mcopyin(struct sockopt
*sopt
, struct mbuf
*m
)
3552 struct mbuf
*m0
= m
;
3554 if (sopt
->sopt_val
== USER_ADDR_NULL
)
3556 while (m
!= NULL
&& sopt
->sopt_valsize
>= m
->m_len
) {
3557 if (sopt
->sopt_p
!= NULL
) {
3560 error
= copyin(sopt
->sopt_val
, mtod(m
, char *),
3567 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
),
3568 mtod(m
, char *), m
->m_len
);
3570 sopt
->sopt_valsize
-= m
->m_len
;
3571 sopt
->sopt_val
+= m
->m_len
;
3574 if (m
!= NULL
) /* enough mbufs should have been allocated at ip6_sooptmcopyin() */
3575 panic("soopt_mcopyin");
3579 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
/*
 * soopt_mcopyout: copies the contents of the mbuf chain starting at m
 * out to the option buffer described by sopt.  A USER_ADDR_NULL sopt_val
 * is tested up front.  While an mbuf remains and sopt_valsize is at
 * least m_len, m_len bytes are copied with copyout() (sopt_p non-NULL)
 * or bcopy(); sopt_valsize is reduced, sopt_val advanced, and the local
 * valsize total increased by m_len.  The total is finally stored in
 * sopt->sopt_valsize.
 * NOTE(review): the advance of m, the handling of a chain that does not
 * fit (m0 keeps the chain head) and the return statements are not
 * visible in this listing.
 */
3581 soopt_mcopyout(struct sockopt
*sopt
, struct mbuf
*m
)
3583 struct mbuf
*m0
= m
;
3586 if (sopt
->sopt_val
== USER_ADDR_NULL
)
3588 while (m
!= NULL
&& sopt
->sopt_valsize
>= m
->m_len
) {
3589 if (sopt
->sopt_p
!= NULL
) {
3592 error
= copyout(mtod(m
, char *), sopt
->sopt_val
,
3599 bcopy(mtod(m
, char *),
3600 CAST_DOWN(caddr_t
, sopt
->sopt_val
), m
->m_len
);
3602 sopt
->sopt_valsize
-= m
->m_len
;
3603 sopt
->sopt_val
+= m
->m_len
;
3604 valsize
+= m
->m_len
;
3608 /* enough soopt buffer should be given from user-land */
3612 sopt
->sopt_valsize
= valsize
;
/*
 * sohasoutofband: delivers SIGURG according to so->so_pgid and then
 * wakes up selectors on the receive buffer.
 *   so_pgid < 0: gsignal() is called with the negated value
 *   so_pgid > 0: proc_signal() is called with the value
 *   so_pgid == 0: no signal is sent
 */
3617 sohasoutofband(struct socket
*so
)
3620 if (so
->so_pgid
< 0)
3621 gsignal(-so
->so_pgid
, SIGURG
);
3622 else if (so
->so_pgid
> 0)
3623 proc_signal(so
->so_pgid
, SIGURG
);
3624 selwakeup(&so
->so_rcv
.sb_sel
);
/*
 * sopoll: computes the poll result bits (revents) for the requested
 * events on so.
 *   - POLLIN / POLLRDNORM are reported when the (not visible) readable
 *     test passes.
 *   - POLLOUT / POLLWRNORM are reported when sowriteable(so) is true.
 *   - POLLPRI / POLLRDBAND are reported when so_oobmark is non-zero or
 *     SS_RCVATMARK is set.
 * For the read-type and write-type events, SB_SEL is set on the matching
 * sockbuf and the current process is registered with selrecord().  The
 * socket is released with socket_unlock(so, 1) at the end.
 * NOTE(review): the readable test, the condition guarding the
 * registration section, the matching lock call and the return statement
 * are not visible in this listing.
 */
3628 sopoll(struct socket
*so
, int events
, __unused kauth_cred_t cred
, void * wql
)
3630 struct proc
*p
= current_proc();
3635 if (events
& (POLLIN
| POLLRDNORM
))
3637 revents
|= events
& (POLLIN
| POLLRDNORM
);
3639 if (events
& (POLLOUT
| POLLWRNORM
))
3640 if (sowriteable(so
))
3641 revents
|= events
& (POLLOUT
| POLLWRNORM
);
3643 if (events
& (POLLPRI
| POLLRDBAND
))
3644 if (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
))
3645 revents
|= events
& (POLLPRI
| POLLRDBAND
);
3648 if (events
& (POLLIN
| POLLPRI
| POLLRDNORM
| POLLRDBAND
)) {
3650 * Darwin sets the flag first,
3651 * BSD calls selrecord first
3653 so
->so_rcv
.sb_flags
|= SB_SEL
;
3654 selrecord(p
, &so
->so_rcv
.sb_sel
, wql
);
3657 if (events
& (POLLOUT
| POLLWRNORM
)) {
3659 * Darwin sets the flag first,
3660 * BSD calls selrecord first
3662 so
->so_snd
.sb_flags
|= SB_SEL
;
3663 selrecord(p
, &so
->so_snd
.sb_sel
, wql
);
3667 socket_unlock(so
, 1);
/*
 * soo_kqfilter: attaches the knote kn to the socket found through
 * kn->kn_fp->f_fglob->fg_data.  Under CONFIG_MACF_SOCKET the request is
 * first checked with mac_socket_check_kqfilter().  The filter operations
 * are chosen by kn->kn_filter: the visible assignments are
 * solisten_filtops when SO_ACCEPTCONN is set on the socket,
 * soread_filtops as the alternative, and sowrite_filtops.  The knote is
 * then attached to sb->sb_sel.si_note and SB_KNOTE is set on sb when
 * KNOTE_ATTACH() returns non-zero.  Every visible exit path releases the
 * socket with socket_unlock(so, 1).
 * NOTE(review): the case labels, the selection of sb, the matching lock
 * call and the return values are not visible in this listing.  The
 * parameter p is marked __unused yet is referenced by the MAC check.
 */
3672 soo_kqfilter(__unused
struct fileproc
*fp
, struct knote
*kn
,
3673 __unused
struct proc
*p
)
3675 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3680 #if CONFIG_MACF_SOCKET
3681 if (mac_socket_check_kqfilter(proc_ucred(p
), kn
, so
) != 0) {
3682 socket_unlock(so
, 1);
3685 #endif /* MAC_SOCKET */
3687 switch (kn
->kn_filter
) {
3689 if (so
->so_options
& SO_ACCEPTCONN
)
3690 kn
->kn_fop
= &solisten_filtops
;
3692 kn
->kn_fop
= &soread_filtops
;
3696 kn
->kn_fop
= &sowrite_filtops
;
3700 socket_unlock(so
, 1);
3704 if (KNOTE_ATTACH(&sb
->sb_sel
.si_note
, kn
))
3705 sb
->sb_flags
|= SB_KNOTE
;
3706 socket_unlock(so
, 1);
/*
 * filt_sordetach: detaches kn from the receive-buffer knote list of its
 * socket.  The detach is attempted only when SB_KNOTE is set on so_rcv;
 * SB_KNOTE is cleared when KNOTE_DETACH() returns non-zero.  The socket
 * is released with socket_unlock(so, 1).
 * NOTE(review): the matching lock call is not visible in this listing.
 */
3711 filt_sordetach(struct knote
*kn
)
3713 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3716 if (so
->so_rcv
.sb_flags
& SB_KNOTE
)
3717 if (KNOTE_DETACH(&so
->so_rcv
.sb_sel
.si_note
, kn
))
3718 so
->so_rcv
.sb_flags
&= ~SB_KNOTE
;
3719 socket_unlock(so
, 1);
/*
 * filt_soread: read-side kevent filter for a socket.
 * kn_data starts as so_rcv.sb_cc minus so_rcv.sb_ctl.  The out-of-band
 * mark, SS_CANTRCVMORE (which sets EV_EOF and copies so_error into
 * kn_fflags), SS_RCVATMARK and a pending so_error are then examined in
 * that order.  The final result is true when EV_OOBAND is set in
 * kn_flags or kn_data reaches the low-water threshold: kn_sdata when
 * NOTE_LOWAT is set in kn_sfflags, so_rcv.sb_lowat otherwise.
 * On every visible exit path the socket is unlocked with
 * socket_unlock(so, 1) only when SO_FILT_HINT_LOCKED is absent from
 * hint.
 * NOTE(review): the statement run by the initial hint test and the
 * early return values are not visible in this listing.
 */
3724 filt_soread(struct knote
*kn
, long hint
)
3726 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3728 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3731 kn
->kn_data
= so
->so_rcv
.sb_cc
- so
->so_rcv
.sb_ctl
;
3733 if (so
->so_oobmark
) {
3734 if (kn
->kn_flags
& EV_OOBAND
) {
3735 kn
->kn_data
-= so
->so_oobmark
;
3736 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3737 socket_unlock(so
, 1);
3740 kn
->kn_data
= so
->so_oobmark
;
3741 kn
->kn_flags
|= EV_OOBAND
;
3743 if (so
->so_state
& SS_CANTRCVMORE
) {
3744 kn
->kn_flags
|= EV_EOF
;
3745 kn
->kn_fflags
= so
->so_error
;
3746 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3747 socket_unlock(so
, 1);
3752 if (so
->so_state
& SS_RCVATMARK
) {
3753 if (kn
->kn_flags
& EV_OOBAND
) {
3754 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3755 socket_unlock(so
, 1);
3758 kn
->kn_flags
|= EV_OOBAND
;
3759 } else if (kn
->kn_flags
& EV_OOBAND
) {
3761 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3762 socket_unlock(so
, 1);
3766 if (so
->so_error
) { /* temporary udp error */
3767 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3768 socket_unlock(so
, 1);
3772 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3773 socket_unlock(so
, 1);
/* Ready when out-of-band data is flagged or the low-water threshold is met. */
3775 return ((kn
->kn_flags
& EV_OOBAND
) ||
3776 kn
->kn_data
>= ((kn
->kn_sfflags
& NOTE_LOWAT
) ?
3777 kn
->kn_sdata
: so
->so_rcv
.sb_lowat
));
/*
 * filt_sowdetach: detaches kn from the send-buffer knote list of its
 * socket.  The detach is attempted only when SB_KNOTE is set on so_snd;
 * SB_KNOTE is cleared when KNOTE_DETACH() returns non-zero.  The socket
 * is released with socket_unlock(so, 1).
 * NOTE(review): the matching lock call is not visible in this listing.
 */
3781 filt_sowdetach(struct knote
*kn
)
3783 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3786 if (so
->so_snd
.sb_flags
& SB_KNOTE
)
3787 if (KNOTE_DETACH(&so
->so_snd
.sb_sel
.si_note
, kn
))
3788 so
->so_snd
.sb_flags
&= ~SB_KNOTE
;
3789 socket_unlock(so
, 1);
/*
 * filt_sowrite: write-side kevent filter for a socket.
 * kn_data is set to sbspace(&so->so_snd).  SS_CANTSENDMORE (which sets
 * EV_EOF and copies so_error into kn_fflags), a pending so_error, and a
 * socket that is not SS_ISCONNECTED on a PR_CONNREQUIRED protocol are
 * then examined in that order.  Otherwise the result is whether kn_data
 * reaches kn_sdata (NOTE_LOWAT set in kn_sfflags) or so_snd.sb_lowat.
 * On every visible exit path the socket is unlocked with
 * socket_unlock(so, 1) only when SO_FILT_HINT_LOCKED is absent from
 * hint.
 * NOTE(review): the statement run by the initial hint test and the
 * early return values are not visible in this listing.  The final
 * comparison against so_snd.sb_lowat reads the socket after the
 * conditional unlock above it.
 */
3794 filt_sowrite(struct knote
*kn
, long hint
)
3796 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3798 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3801 kn
->kn_data
= sbspace(&so
->so_snd
);
3802 if (so
->so_state
& SS_CANTSENDMORE
) {
3803 kn
->kn_flags
|= EV_EOF
;
3804 kn
->kn_fflags
= so
->so_error
;
3805 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3806 socket_unlock(so
, 1);
3809 if (so
->so_error
) { /* temporary udp error */
3810 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3811 socket_unlock(so
, 1);
3814 if (((so
->so_state
& SS_ISCONNECTED
) == 0) &&
3815 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3816 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3817 socket_unlock(so
, 1);
3820 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3821 socket_unlock(so
, 1);
3822 if (kn
->kn_sfflags
& NOTE_LOWAT
)
3823 return (kn
->kn_data
>= kn
->kn_sdata
);
3824 return (kn
->kn_data
>= so
->so_snd
.sb_lowat
);
/*
 * filt_solisten: kevent filter for a listening socket.  kn_data is set
 * to so->so_qlen and the state of the so_comp queue is sampled; the
 * socket is unlocked with socket_unlock(so, 1) only when
 * SO_FILT_HINT_LOCKED is absent from hint.
 * NOTE(review): the statement run by the initial hint test and the
 * return statement are not visible in this listing.
 */
3829 filt_solisten(struct knote
*kn
, long hint
)
3831 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
3834 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3836 kn
->kn_data
= so
->so_qlen
;
/*
 * NOTE(review): despite its name, isempty receives the negation of
 * TAILQ_EMPTY(), i.e. it is true when so_comp is NOT empty.
 */
3837 isempty
= ! TAILQ_EMPTY(&so
->so_comp
);
3838 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
3839 socket_unlock(so
, 1);
/*
 * socket_lock: locks so, passing refcount through to the protocol.
 * When the protocol supplies pr_lock, that handler is called with the
 * socket, refcount and the saved return address.  The other visible
 * path locks the domain mutex (dom_mtx) with lck_mtx_lock(), asserting
 * under MORE_LOCKING_DEBUG that it is not already owned, and records
 * the return address in the lock_lr ring, advancing next_lock_lr modulo
 * SO_LCKDBG_MAX.
 * NOTE(review): the use of refcount on the domain-mutex path and the
 * return statement are not visible in this listing.
 */
3845 socket_lock(struct socket
*so
, int refcount
)
3847 int error
= 0, lr_saved
;
/*
 * NOTE(review): the return address is narrowed to unsigned int and kept
 * in an int; on targets where pointers are wider than unsigned int the
 * upper bits are lost.
 */
3849 lr_saved
= (unsigned int) __builtin_return_address(0);
3851 if (so
->so_proto
->pr_lock
) {
3852 error
= (*so
->so_proto
->pr_lock
)(so
, refcount
, lr_saved
);
3854 #ifdef MORE_LOCKING_DEBUG
3855 lck_mtx_assert(so
->so_proto
->pr_domain
->dom_mtx
,
3856 LCK_MTX_ASSERT_NOTOWNED
);
3858 lck_mtx_lock(so
->so_proto
->pr_domain
->dom_mtx
);
3861 so
->lock_lr
[so
->next_lock_lr
] = (u_int32_t
)lr_saved
;
3862 so
->next_lock_lr
= (so
->next_lock_lr
+1) % SO_LCKDBG_MAX
;
/*
 * socket_unlock: unlocks so, passing refcount through to the protocol.
 * Panics when so->so_proto is NULL.  When the protocol supplies
 * pr_unlock, that handler is called with the socket, refcount and the
 * saved return address.  The other visible path works on the domain
 * mutex (dom_mtx): it asserts ownership under MORE_LOCKING_DEBUG,
 * records the return address in the unlock_lr ring, panics on a
 * so_usecount that is zero or negative, calls sofreelastref(so, 1) when
 * so_usecount is 0, and finally releases the mutex with
 * lck_mtx_unlock().
 * NOTE(review): the statement that changes so_usecount between the two
 * visible tests, the role of refcount on this path and the return
 * statement are not visible in this listing.
 */
3869 socket_unlock(struct socket
*so
, int refcount
)
3871 int error
= 0, lr_saved
;
3872 lck_mtx_t
*mutex_held
;
/*
 * NOTE(review): the return address is narrowed to unsigned int and kept
 * in an int; on targets where pointers are wider than unsigned int the
 * upper bits are lost.
 */
3874 lr_saved
= (unsigned int) __builtin_return_address(0);
/*
 * NOTE(review): so is dereferenced here before the test of so in the
 * next condition, so that later test cannot protect against a NULL so.
 */
3876 if (so
->so_proto
== NULL
)
3877 panic("socket_unlock null so_proto so=%p\n", so
);
3879 if (so
&& so
->so_proto
->pr_unlock
) {
3880 error
= (*so
->so_proto
->pr_unlock
)(so
, refcount
, lr_saved
);
3882 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
3883 #ifdef MORE_LOCKING_DEBUG
3884 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
3886 so
->unlock_lr
[so
->next_unlock_lr
] = (u_int32_t
)lr_saved
;
3887 so
->next_unlock_lr
= (so
->next_unlock_lr
+1) % SO_LCKDBG_MAX
;
3890 if (so
->so_usecount
<= 0)
3891 panic("socket_unlock: bad refcount so=%p "
3892 "value=%d\n", so
, so
->so_usecount
);
3894 if (so
->so_usecount
== 0) {
3895 sofreelastref(so
, 1);
3898 lck_mtx_unlock(mutex_held
);
3904 /* Called with socket locked, will unlock socket */
/*
 * sofree: looks up the mutex protecting so (through pr_getlock when the
 * protocol provides it, otherwise the domain mutex dom_mtx), asserts
 * that it is owned, and then calls sofreelastref(so, 0).
 * NOTE(review): the else keyword separating the two mutex_held
 * assignments is not visible in this listing.
 */
3906 sofree(struct socket
*so
)
3909 lck_mtx_t
*mutex_held
;
3910 if (so
->so_proto
->pr_getlock
!= NULL
)
3911 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
3913 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
3914 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
3916 sofreelastref(so
, 0);
/*
 * soreference: takes one reference on so.  The socket is locked with
 * the refcount argument set to 1 and then unlocked with it set to 0, so
 * the reference taken by the lock call is kept after the lock is
 * dropped.
 */
3920 soreference(struct socket
*so
)
3922 socket_lock(so
, 1); /* locks & take one reference on socket */
3923 socket_unlock(so
, 0); /* unlock only */
/*
 * sodereference: counterpart of soreference; the visible fragment
 * releases so with socket_unlock(so, 1).
 * NOTE(review): any lock acquisition preceding the unlock is not
 * visible in this listing.
 */
3927 sodereference(struct socket
*so
)
3930 socket_unlock(so
, 1);
3934 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
3935 * possibility of using jumbo clusters. Caller must ensure to hold
/*
 * somultipages: sets or clears the SOF_MULTIPAGES bit in so->so_flags.
 * NOTE(review): the test of the set argument that selects between the
 * two assignments is not visible in this listing.
 */
3939 somultipages(struct socket
*so
, boolean_t set
)
3942 so
->so_flags
|= SOF_MULTIPAGES
;
3944 so
->so_flags
&= ~SOF_MULTIPAGES
;