/*
 * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/kdebug.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>

#include <security/mac.h>
#include <security/mac_framework.h>
int			so_cache_timeouts = 0;
int			so_cache_max_freed = 0;
int			cached_sock_count = 0;
__private_extern__ int	max_cached_sock_count = MAX_CACHED_SOCKETS;
struct socket		*socket_cache_head = 0;
struct socket		*socket_cache_tail = 0;
u_int32_t		so_cache_time = 0;
int			so_cache_init_done = 0;
struct zone		*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
lck_mtx_t		*so_cache_mtx;
#include <machine/limits.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);

static int	sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p);
static int	sooptcopyout_timeval(struct sockopt *sopt,
		    const struct timeval *tv_p);
static struct filterops soread_filtops = {
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

static struct filterops sowrite_filtops = {
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
#define	EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
    0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
    &sosendjcl_ignore_capab, 0, "");
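/*
 * Illustrative userspace sketch (not part of this file and not compiled
 * into the kernel): reading and adjusting the tunables exported above via
 * sysctlbyname(3).  The names "kern.ipc.somaxconn" and "kern.ipc.sosendjcl"
 * are assumed to follow from the SYSCTL_INT declarations above.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int somaxconn_val = 0, sosendjcl_val = 0;
	size_t len = sizeof (int);

	/* Read the listen-queue limit (KIPC_SOMAXCONN above). */
	if (sysctlbyname("kern.ipc.somaxconn", &somaxconn_val, &len,
	    NULL, 0) == 0)
		printf("kern.ipc.somaxconn = %d\n", somaxconn_val);

	/* Read the jumbo-cluster toggle (OID_AUTO "sosendjcl" above). */
	len = sizeof (int);
	if (sysctlbyname("kern.ipc.sosendjcl", &sosendjcl_val, &len,
	    NULL, 0) == 0)
		printf("kern.ipc.sosendjcl = %d\n", sosendjcl_val);

	/* Writing requires root; e.g. raise somaxconn to 256. */
	somaxconn_val = 256;
	if (sysctlbyname("kern.ipc.somaxconn", NULL, NULL,
	    &somaxconn_val, sizeof (somaxconn_val)) != 0)
		perror("sysctlbyname(kern.ipc.somaxconn)");
	return (0);
}
#endif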
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);
extern struct domain *pffinddomain(int);
extern struct protosw *pffindprotonotype(int, int);
extern int soclose_locked(struct socket *);
extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);

extern int uthread_get_background_state(uthread_t);
vm_size_t	so_cache_zone_element_size;

static int	sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
		    int *);
static void	cached_sock_alloc(struct socket **, int);
static void	cached_sock_free(struct socket *);
static void	so_cache_timer(void *);

void soclose_wait_locked(struct socket *so);
int so_isdstlocal(struct socket *so);
void
socketinit(void)
{
	vm_size_t str_size;

	if (so_cache_init_done) {
		printf("socketinit: already called...\n");
		return;
	}

	PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();

	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	so_cache_init_done = 1;

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);

	if (so_cache_mtx == NULL)
		return;		/* we're hosed... */

	str_size = (vm_size_t)(sizeof (struct socket) + 4 +
	    get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);

	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));

	so_cache_zone_element_size = str_size;
}
static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	register uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (cached_sock_count) {
		cached_sock_count--;
		*so = socket_cache_head;
		if (*so == 0)
			panic("cached_sock_alloc: cached sock is null");

		socket_cache_head = socket_cache_head->cache_next;
		if (socket_cache_head)
			socket_cache_head->cache_prev = 0;
		else
			socket_cache_tail = 0;

		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		kprintf("cached_sock_alloc - retreiving cached sock %p - "
		    "count == %d\n", *so, cached_sock_count);

		(*so)->so_saved_pcb = temp;
		(*so)->cached_in_sock_layer = 1;
	} else {
		kprintf("Allocating cached sock %p from memory\n", *so);

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == 0)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our single block of
		 * memory.  Align extra structures on longword boundaries.
		 */

		offset = (uintptr_t) *so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;

		kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
		    *so, (*so)->so_saved_pcb,
		    ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);

		(*so)->cached_in_sock_layer = 1;
	}
}
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);

		kprintf("Freeing overflowed cached socket %p\n", so);

		zfree(so_cache_zone, so);
	} else {
		kprintf("Freeing socket %p into cache\n", so);

		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		so->cache_next = socket_cache_head;
		so->cache_prev = 0;
		if (socket_cache_head)
			socket_cache_head->cache_prev = so;
		else
			socket_cache_tail = so;

		so->cache_timestamp = so_cache_time;
		socket_cache_head = so;
		lck_mtx_unlock(so_cache_mtx);
	}

	kprintf("Freed cached sock %p into cache - count is %d\n",
	    so, cached_sock_count);
}
static void
so_cache_timer(__unused void *dummy)
{
	register struct socket *p;
	register int n_freed = 0;

	lck_mtx_lock(so_cache_mtx);

	++so_cache_time;

	while ((p = socket_cache_tail)) {
		if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
			break;

		so_cache_timeouts++;

		if ((socket_cache_tail = p->cache_prev))
			p->cache_prev->cache_next = 0;
		if (--cached_sock_count == 0)
			socket_cache_head = 0;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}
	lck_mtx_unlock(so_cache_mtx);

	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
}
#endif /* __APPLE__ */
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	/* XXX race condition for reentrant kernel */
//###LD Atomic add for so_gencnt
	if (so != NULL) {
		so->so_gencnt = ++so_gencnt;
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}
/*
 * Returns:	0			Success
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:EISCONN[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = current_proc();
	register struct protosw *prp;
	register struct socket *so;
	register int error = 0;
	thread_t thread;
	struct uthread *ut;

	extern int tcpconsdebug;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
		if (pffinddomain(dom) == NULL) {
			return (EAFNOSUPPORT);
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return (EPROTOTYPE);
			}
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;

	so->so_uid = kauth_cred_getuid(kauth_cred_get());
	if (!suser(kauth_cred_get(), NULL))
		so->so_state = SS_PRIV;

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;

	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

//### Attachement will create the per pcb lock if necessary and increase refcount
	/*
	 * for creation, make sure it's done before
	 * socket is inserted in lists
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		so->so_state |= SS_NOFDREF;
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	prp->pr_domain->dom_refs++;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;

	/*
	 * If this is a background thread/task, mark the socket as such.
	 */
	thread = current_thread();
	ut = get_bsdthread_info(thread);
	if (uthread_get_background_state(ut)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = thread;
		/*
		 * In case setpriority(PRIO_DARWIN_THREAD) was called
		 * on this thread, regulate network (TCP) traffics.
		 */
		if (ut->uu_flag & UT_BACKGROUND_TRAFFIC_MGT) {
			socket_set_traffic_mgt_flags(so,
			    TRAFFIC_MGT_SO_BG_REGULATE);
		}
	}

	*aso = so;
	return (0);
}
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobind(struct socket *so, struct sockaddr *nam)
{
	struct proc *p = current_proc();
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	socket_lock(so, 1);

	/*
	 * If this is a bind request on a previously-accepted socket
	 * that has been marked as inactive, reject it now before
	 * we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		goto out;
	}

	/* Socket filter */
	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_bind) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_bind(filter->sfe_cookie, so, nam);
		}
	}
	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);
	}
	/* End socket filter */

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}
void
sodealloc(struct socket *so)
{
	so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */
	if (so->cached_in_sock_layer == 1) {
		cached_sock_free(so);
	} else {
		if (so->cached_in_sock_layer == -1)
			panic("sodealloc: double dealloc: so=%p\n", so);
		so->cached_in_sock_layer = -1;
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}
/*
 * Returns:	0			Success
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	socket_lock(so, 1);
	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a previously-accepted socket that has
	 * been marked as inactive, reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
		error = EPERM;
		goto out;
	}

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_listen) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_listen(filter->sfe_cookie, so);
		}
	}
	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);
	}

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}
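/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * backlog semantics implemented above: solisten() replaces a backlog that
 * is <= 0 or larger than kern.ipc.somaxconn with somaxconn before setting
 * so_qlimit, so both calls below succeed.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;

	if (s < 0) {
		perror("socket");
		return (1);
	}
	memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = 0;	/* any port */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (bind(s, (struct sockaddr *)&sin, sizeof (sin)) < 0) {
		perror("bind");
		return (1);
	}
	/* Both out-of-range backlogs are silently clamped to somaxconn. */
	if (listen(s, -5) < 0)
		perror("listen(-5)");
	if (listen(s, 100000) < 0)
		perror("listen(100000)");
	close(s);
	return (0);
}
#endif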
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	/* Remove any filters - may be called more than once */
	sflt_termsock(so);

	if ((!(so->so_flags & SOF_PCBCLEARING)) ||
	    ((so->so_state & SS_NOFDREF) == 0)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~SB_UPCALL;
		so->so_snd.sb_flags &= ~SB_UPCALL;
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~SB_UPCALL;
			so->so_snd.sb_flags &= ~SB_UPCALL;
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
		socket_unlock(head, 1);
	}
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);

	sorflush(so);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;

	if (dealloc)
		sodealloc(so);
}
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!(so->so_flags & SOF_UPCALLINUSE) ||
	    !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;

	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int socklock = 0;

		/*
		 * We do not want new connection to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/*
			 * skip sockets thrown away by tcpdropdropblreq
			 * they will get cleanup by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/* lock ordering for consistency with the rest of the stack,
				 * we lock the socket first and then grabb the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
				socklock = 1;
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (socklock)
				socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
				socket_lock(so, 0);
			}
		}
	}
	if (so->so_pcb == 0) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0)
		panic("soclose: usecount is zero so=%p\n", so);
	if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0)
		panic("soclose: usecount is zero so=%p\n", so);
discard:
	if (so->so_pcb && so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;

	so->so_proto->pr_domain->dom_refs--;
	so->so_usecount--;
	sofree(so);
	return (error);
}
int
soclose(struct socket *so)
{
	int error = 0;

	socket_lock(so, 1);

	if (so->so_flags & SOF_UPCALLINUSE)
		soclose_wait_locked(so);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}
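/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * SO_LINGER path handled in soclose_locked() above: with l_onoff set on a
 * blocking socket, close() sleeps on so_timeo until the disconnect
 * completes or the linger interval expires.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

static int
set_linger(int s, int seconds)
{
	struct linger l;

	l.l_onoff = 1;		/* enable lingering close */
	l.l_linger = seconds;	/* wait at most this many seconds */
	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof (l)) < 0) {
		perror("setsockopt(SO_LINGER)");
		return (-1);
	}
	return (0);
}

int
main(void)
{
	int s = socket(AF_INET, SOCK_STREAM, 0);

	if (s < 0 || set_linger(s, 5) < 0)
		return (1);
	/* ... connect, send ... */
	close(s);	/* may block up to ~5s while unsent data drains */
	return (0);
}
#endif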
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error)
			return (error);
	}
	return (0);
}
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}
*so
)
1106 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1107 struct socket_filter_entry
*filter
;
1108 int error
= 0, filtered
= 0;
1109 struct socket
*head
= so
->so_head
;
1112 * Hold the lock even if this socket
1113 * has not been made visible to the filter(s).
1114 * For sockets with global locks, this protect against the
1115 * head or peer going away
1118 if (sogetaddr_locked(so
, &remote
, 1) != 0 ||
1119 sogetaddr_locked(so
, &local
, 0) != 0) {
1120 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1122 socket_unlock(so
, 1);
1124 /* Out of resources; try it again next time */
1125 error
= ECONNABORTED
;
1130 * At this point, we have a reference on the listening socket
1131 * so we know it won't be going away. Do the same for the newly
1132 * accepted socket while we invoke the accept callback routine.
1134 for (filter
= so
->so_filt
; filter
!= NULL
&& error
== 0;
1135 filter
= filter
->sfe_next_onsocket
) {
1136 if (filter
->sfe_filter
->sf_filter
.sf_accept
!= NULL
) {
1140 socket_unlock(so
, 0);
1142 error
= filter
->sfe_filter
->sf_filter
.
1143 sf_accept(filter
->sfe_cookie
,
1144 head
, so
, local
, remote
);
1154 * If we get EJUSTRETURN from one of the filters, mark this socket
1155 * as inactive and return it anyway. This newly accepted socket
1156 * will be disconnected later before we hand it off to the caller.
1158 if (error
== EJUSTRETURN
) {
1160 so
->so_flags
|= SOF_DEFUNCT
;
1161 /* Prevent data from being appended to the socket buffers */
1162 so
->so_snd
.sb_flags
|= SB_DROP
;
1163 so
->so_rcv
.sb_flags
|= SB_DROP
;
1168 * This may seem like a duplication to the above error
1169 * handling part when we return ECONNABORTED, except
1170 * the following is done while holding the lock since
1171 * the socket has been exposed to the filter(s) earlier.
1173 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1175 socket_unlock(so
, 1);
1177 /* Propagate socket filter's error code to the caller */
1179 socket_unlock(so
, 1);
1182 /* Callee checks for NULL pointer */
1183 sock_freeaddr(remote
);
1184 sock_freeaddr(local
);
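/*
 * Illustrative userspace sketch (not part of the kernel build): because
 * soacceptfilter() above can return ECONNABORTED when a filter rejects a
 * connection or the addresses cannot be obtained, a robust accept loop
 * simply retries on that error.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static void
accept_loop(int listen_fd)
{
	for (;;) {
		int fd = accept(listen_fd, NULL, NULL);

		if (fd < 0) {
			if (errno == ECONNABORTED || errno == EINTR)
				continue;	/* transient; try again */
			perror("accept");
			break;
		}
		/* ... hand fd off to a worker ... */
		close(fd);
	}
}
#endif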
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		if (dolock)
			socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		struct socket_filter_entry *filter;
		int filtered = 0;

		error = 0;
		for (filter = so->so_filt; filter && (error == 0);
		    filter = filter->sfe_next_onsocket) {
			if (filter->sfe_filter->sf_filter.sf_connect_out) {
				if (filtered == 0) {
					filtered = 1;
					sflt_use(so);
					socket_unlock(so, 0);
				}
				error = filter->sfe_filter->sf_filter.
				    sf_connect_out(filter->sfe_cookie, so, nam);
			}
		}
		if (filtered != 0) {
			socket_lock(so, 0);
			sflt_unuse(so);
		}

		if (error) {
			if (error == EJUSTRETURN)
				error = 0;
			if (dolock)
				socket_unlock(so, 1);
			return (error);
		}

		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}
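/*
 * Illustrative userspace sketch (not part of the kernel build) of driving
 * the connect path above in non-blocking mode: connect(2) returns
 * EINPROGRESS immediately and completion is observed by waiting for
 * writability and then reading SO_ERROR.
 */
#if 0
#include <sys/select.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int
connect_nonblocking(const char *ip, unsigned short port)
{
	int s = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;
	fd_set wfds;
	int soerr = 0;
	socklen_t len = sizeof (soerr);

	if (s < 0)
		return (-1);
	fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK);

	memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	inet_pton(AF_INET, ip, &sin.sin_addr);

	if (connect(s, (struct sockaddr *)&sin, sizeof (sin)) < 0 &&
	    errno != EINPROGRESS) {
		close(s);
		return (-1);
	}
	FD_ZERO(&wfds);
	FD_SET(s, &wfds);
	if (select(s + 1, NULL, &wfds, NULL, NULL) < 0 ||
	    getsockopt(s, SOL_SOCKET, SO_ERROR, &soerr, &len) < 0 ||
	    soerr != 0) {
		close(s);
		return (-1);
	}
	return (s);	/* connected */
}
#endif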
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}
int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);

	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);
bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 */
static int
sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error)
				return (error);
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT)
		return (EPIPE);

	if (so->so_state & SS_CANTSENDMORE)
		return (EPIPE);

	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0))
				return (ENOTCONN);
		} else if (addr == 0 && !(flags&MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);
	if (space < resid + clen &&
	    (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock)
			return (EWOULDBLOCK);
		sbunlock(&so->so_snd, 1);
		error = sbwait(&so->so_snd);
		if (error)
			return (error);
		goto restart;
	}

	return (0);
}
/*
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 *
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
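/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * caller-side contract documented above: sosend() may return EINTR or
 * ERESTART after moving only part of the data, so callers loop on short
 * counts.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stddef.h>

static ssize_t
send_all(int s, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, 0);

		if (n < 0) {
			if (errno == EINTR)
				continue;	/* retry the short send */
			return (-1);
		}
		off += (size_t)n;
	}
	return ((ssize_t)off);
}
#endif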
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	register struct mbuf **mp;
	register struct mbuf *m, *freelist = NULL;
	register int32_t space, len, resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();

	if (uio != NULL)
		// LP64todo - fix this!
		resid = uio_resid(uio);
	else
		resid = top->m_pkthdr.len;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		socket_unlock(so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		socket_unlock(so, 1);
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
	if (control)
		clen = control->m_len;

	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error)
			goto release;
		mp = &top;
		space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
		    1024 : 0);

		do {
			struct socket_filter_entry *filter;
			int filtered;
			boolean_t recursive;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;

				bytes_to_copy = imin(resid, space);

				if (sosendminchain > 0)
					chainlength = 0;
				else
					chainlength = sosendmaxchain;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab);

				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == 0) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_copy > NBPG && jumbocl) {
						num_needed =
						    bytes_to_copy / M16KCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MCLBYTES) {
						num_needed =
						    bytes_to_copy / NBPG;

						if ((bytes_to_copy -
						    (num_needed * NBPG)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    NBPG);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_copy > MINCLSIZE) {
						num_needed =
						    bytes_to_copy / MCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}

					if (freelist == NULL) {
						if (top == 0)
							MGETHDR(freelist,
							    M_WAIT, MT_DATA);
						else
							MGET(freelist,
							    M_WAIT, MT_DATA);

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto release;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == 0 &&
						    bytes_to_copy < MHLEN)
							MH_ALIGN(freelist,
							    bytes_to_copy);
					}
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT))
						mlen = m->m_ext.ext_size;
					else if ((m->m_flags & M_PKTHDR))
						mlen =
						    MHLEN - m_leadingspace(m);
					else
						mlen = MLEN;
					len = imin(mlen, bytes_to_copy);

					chainlength += len;
					space -= len;

					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error)
						break;
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR)
							top->m_flags |= M_EOR;
						break;
					}
					bytes_to_copy = min(resid, space);

				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error)
					goto release;
			}

			if (flags & (MSG_HOLD|MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				register struct mbuf *mb1;
				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp)
					so->so_tail->m_next = top;
				else
					so->so_temp = top;
				mb1 = top;
				while (mb1->m_next)
					mb1 = mb1->m_next;
				so->so_tail = mb1;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto release;
				}
				top = so->so_temp;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;

			/* Compute flags here, for pru_send and NKEs */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag and nothing left to
			     * send then use PRU_SEND_EOF instead of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			/*
			 * Socket filter processing
			 */
			recursive = (so->so_send_filt_thread != NULL);
			filtered = 0;
			error = 0;
			for (filter = so->so_filt; filter && (error == 0);
			    filter = filter->sfe_next_onsocket) {
				if (filter->sfe_filter->sf_filter.sf_data_out) {
					int so_flags = 0;
					if (filtered == 0) {
						filtered = 1;
						so->so_send_filt_thread =
						    current_thread();
						sflt_use(so);
						socket_unlock(so, 0);
						so_flags =
						    (sendflags & MSG_OOB) ?
						    sock_data_filt_flag_oob : 0;
					}
					error = filter->sfe_filter->sf_filter.
					    sf_data_out(filter->sfe_cookie, so,
					    addr, &top, &control, so_flags);
				}
			}

			if (filtered) {
				/*
				 * At this point, we've run at least one
				 * filter.  The socket is unlocked as is
				 * the socket buffer.  Clear the recorded
				 * filter thread only when we are outside
				 * of a filter's context.  This allows for
				 * a filter to issue multiple inject calls
				 * from its sf_data_out callback routine.
				 */
				socket_lock(so, 0);
				sflt_unuse(so);
				if (!recursive)
					so->so_send_filt_thread = 0;
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = 0;
						top = 0;
					}
					goto release;
				}
			}
			/*
			 * End Socket filter processing
			 */

			if (error == EJUSTRETURN) {
				/* A socket filter handled this data */
				error = 0;
			} else {
				error = (*so->so_proto->pr_usrreqs->pru_send)
				    (so, sendflags, top, addr, control, p);
			}

			if (flags & MSG_SEND)
				so->so_temp = NULL;

			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	if (sblocked)
		sbunlock(&so->so_snd, 0);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	if (freelist)
		m_freem_list(freelist);

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
	    space, error);

	return (error);
}
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
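/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * MSG_WAITALL semantics implemented below: soreceive() tries to satisfy
 * the full request but may still return a short count on a signal,
 * timeout or connection close, so the caller checks the return value.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

static ssize_t
recv_exact(int s, void *buf, size_t len)
{
	ssize_t n;

	do {
		n = recv(s, buf, len, MSG_WAITALL);
	} while (n < 0 && errno == EINTR);

	/* n < (ssize_t)len means EOF, timeout or a caught signal. */
	return (n);
}
#endif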
1918 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
1919 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1921 register struct mbuf
*m
, **mp
, *ml
= NULL
;
1922 register int flags
, len
, error
, offset
;
1923 struct protosw
*pr
= so
->so_proto
;
1924 struct mbuf
*nextrecord
;
1926 int orig_resid
= uio_resid(uio
);
1927 struct mbuf
*free_list
;
1928 int delayed_copy_len
;
1931 struct proc
*p
= current_proc();
1933 // LP64todo - fix this!
1934 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
, uio_resid(uio
),
1935 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
1939 #ifdef MORE_LOCKING_DEBUG
1940 if (so
->so_usecount
== 1)
1941 panic("soreceive: so=%x no other reference on socket\n", so
);
1949 flags
= *flagsp
&~ MSG_EOR
;
1954 * If a recv attempt is made on a previously-accepted socket
1955 * that has been marked as inactive (disconnected), reject
1958 if (so
->so_flags
& SOF_DEFUNCT
) {
1959 struct sockbuf
*sb
= &so
->so_rcv
;
1962 * This socket should have been disconnected and flushed
1963 * prior to being returned from accept; there should be
1964 * no data on its receive list, so panic otherwise.
1966 sb_empty_assert(sb
, __func__
);
1967 socket_unlock(so
, 1);
1972 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1973 * regardless of the flags argument. Here is the case were
1974 * out-of-band data is not inline.
1976 if ((flags
& MSG_OOB
) ||
1977 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
1978 (so
->so_options
& SO_OOBINLINE
) == 0 &&
1979 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
1980 m
= m_get(M_WAIT
, MT_DATA
);
1982 socket_unlock(so
, 1);
1983 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
1984 ENOBUFS
, 0, 0, 0, 0);
1987 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
1990 socket_unlock(so
, 0);
1992 error
= uiomove(mtod(m
, caddr_t
),
1993 imin(uio_resid(uio
), m
->m_len
), uio
);
1995 } while (uio_resid(uio
) && error
== 0 && m
);
2001 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
2002 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
2004 * Let's try to get normal data:
2005 * EWOULDBLOCK: out-of-band data not
2006 * receive yet. EINVAL: out-of-band data
2011 } else if (error
== 0 && flagsp
) {
2015 socket_unlock(so
, 1);
2016 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2023 *mp
= (struct mbuf
*)0;
2024 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
))
2025 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
2028 free_list
= (struct mbuf
*)0;
2029 delayed_copy_len
= 0;
2031 #ifdef MORE_LOCKING_DEBUG
2032 if (so
->so_usecount
<= 1)
2033 printf("soreceive: sblock so=%p ref=%d on socket\n",
2034 so
, so
->so_usecount
);
2037 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2038 * and if so just return to the caller. This could happen when
2039 * soreceive() is called by a socket upcall function during the
2040 * time the socket is freed. The socket buffer would have been
2041 * locked across the upcall, therefore we cannot put this thread
2042 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2043 * we may livelock), because the lock on the socket buffer will
2044 * only be released when the upcall routine returns to its caller.
2045 * Because the socket has been officially closed, there can be
2046 * no further read on it.
2048 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
2049 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
2050 socket_unlock(so
, 1);
2054 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
2056 socket_unlock(so
, 1);
2057 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2062 m
= so
->so_rcv
.sb_mb
;
2064 * If we have less data than requested, block awaiting more
2065 * (subject to any timeout) if:
2066 * 1. the current count is less than the low water mark, or
2067 * 2. MSG_WAITALL is set, and it is possible to do the entire
2068 * receive operation at once if we block (resid <= hiwat).
2069 * 3. MSG_DONTWAIT is not set
2070 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2071 * we have to do the receive in sections, and thus risk returning
2072 * a short count if a timeout or signal occurs after we start.
2074 if (m
== 0 || (((flags
& MSG_DONTWAIT
) == 0 &&
2075 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
2076 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
2077 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
2078 m
->m_nextpkt
== 0 && (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
2080 * Panic if we notice inconsistencies in the socket's
2081 * receive list; both sb_mb and sb_cc should correctly
2082 * reflect the contents of the list, otherwise we may
2083 * end up with false positives during select() or poll()
2084 * which could put the application in a bad state.
2086 if (m
== NULL
&& so
->so_rcv
.sb_cc
!= 0)
2087 panic("soreceive corrupted so_rcv: m %p cc %u",
2088 m
, so
->so_rcv
.sb_cc
);
2093 error
= so
->so_error
;
2094 if ((flags
& MSG_PEEK
) == 0)
2098 if (so
->so_state
& SS_CANTRCVMORE
) {
2104 for (; m
; m
= m
->m_next
)
2105 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
2106 m
= so
->so_rcv
.sb_mb
;
2109 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
2110 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
2114 if (uio_resid(uio
) == 0)
2116 if ((so
->so_state
& SS_NBIO
) ||
2117 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
2118 error
= EWOULDBLOCK
;
2121 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
2122 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
2123 sbunlock(&so
->so_rcv
, 1);
2124 #if EVEN_MORE_LOCKING_DEBUG
2126 printf("Waiting for socket data\n");
2129 error
= sbwait(&so
->so_rcv
);
2130 #if EVEN_MORE_LOCKING_DEBUG
2132 printf("SORECEIVE - sbwait returned %d\n", error
);
2134 if (so
->so_usecount
< 1)
2135 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2136 "socket\n", so
, so
->so_usecount
);
2138 socket_unlock(so
, 1);
2139 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2146 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
2147 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
2148 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
2149 nextrecord
= m
->m_nextpkt
;
2150 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
2151 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2152 #if CONFIG_MACF_SOCKET_SUBSET
2154 * Call the MAC framework for policy checking if we're in
2155 * the user process context and the socket isn't connected.
2157 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2158 struct mbuf
*m0
= m
;
2160 * Dequeue this record (temporarily) from the receive
2161 * list since we're about to drop the socket's lock
2162 * where a new record may arrive and be appended to
2163 * the list. Upon MAC policy failure, the record
2164 * will be freed. Otherwise, we'll add it back to
2165 * the head of the list. We cannot rely on SB_LOCK
2166 * because append operation uses the socket's lock.
2169 m
->m_nextpkt
= NULL
;
2170 sbfree(&so
->so_rcv
, m
);
2172 } while (m
!= NULL
);
2174 so
->so_rcv
.sb_mb
= nextrecord
;
2175 SB_EMPTY_FIXUP(&so
->so_rcv
);
2176 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2177 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2178 socket_unlock(so
, 0);
2179 if (mac_socket_check_received(proc_ucred(p
), so
,
2180 mtod(m
, struct sockaddr
*)) != 0) {
2182 * MAC policy failure; free this record and
2183 * process the next record (or block until
2184 * one is available). We have adjusted sb_cc
2185 * and sb_mbcnt above so there is no need to
2186 * call sbfree() again.
2190 } while (m
!= NULL
);
2192 * Clear SB_LOCK but don't unlock the socket.
2193 * Process the next record or wait for one.
2196 sbunlock(&so
->so_rcv
, 1);
2201 * Re-adjust the socket receive list and re-enqueue
2202 * the record in front of any packets which may have
2203 * been appended while we dropped the lock.
2205 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2206 sballoc(&so
->so_rcv
, m
);
2207 sballoc(&so
->so_rcv
, m
);
2208 if (so
->so_rcv
.sb_mb
== NULL
) {
2209 so
->so_rcv
.sb_lastrecord
= m0
;
2210 so
->so_rcv
.sb_mbtail
= m
;
2213 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2214 so
->so_rcv
.sb_mb
= m
;
2215 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2216 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2218 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2221 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*),
2223 if ((*psa
== 0) && (flags
& MSG_NEEDSA
)) {
2224 error
= EWOULDBLOCK
;
2228 if (flags
& MSG_PEEK
) {
2231 sbfree(&so
->so_rcv
, m
);
2232 if (m
->m_next
== 0 && so
->so_rcv
.sb_cc
!= 0)
2233 panic("soreceive: about to create invalid "
2235 MFREE(m
, so
->so_rcv
.sb_mb
);
2236 m
= so
->so_rcv
.sb_mb
;
2238 m
->m_nextpkt
= nextrecord
;
2240 so
->so_rcv
.sb_mb
= nextrecord
;
2241 SB_EMPTY_FIXUP(&so
->so_rcv
);
2247 * Process one or more MT_CONTROL mbufs present before any data mbufs
2248 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2249 * just copy the data; if !MSG_PEEK, we call into the protocol to
2250 * perform externalization.
2252 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
2253 struct mbuf
*cm
= NULL
, *cmn
;
2254 struct mbuf
**cme
= &cm
;
2255 struct sockbuf
*sb_rcv
= &so
->so_rcv
;
2258 * Externalizing the control messages would require us to
2259 * drop the socket's lock below. Once we re-acquire the
2260 * lock, the mbuf chain might change. In order to preserve
2261 * consistency, we unlink all control messages from the
2262 * first mbuf chain in one shot and link them separately
2263 * onto a different chain.
2266 if (flags
& MSG_PEEK
) {
2267 if (controlp
!= NULL
) {
2268 *controlp
= m_copy(m
, 0, m
->m_len
);
2269 controlp
= &(*controlp
)->m_next
;
2273 m
->m_nextpkt
= NULL
;
2275 sb_rcv
->sb_mb
= m
->m_next
;
2278 cme
= &(*cme
)->m_next
;
2281 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2283 if (!(flags
& MSG_PEEK
)) {
2284 if (sb_rcv
->sb_mb
!= NULL
) {
2285 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2287 sb_rcv
->sb_mb
= nextrecord
;
2288 SB_EMPTY_FIXUP(sb_rcv
);
2290 if (nextrecord
== NULL
)
2291 sb_rcv
->sb_lastrecord
= m
;
2294 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2295 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2297 while (cm
!= NULL
) {
2302 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2305 * Call the protocol to externalize SCM_RIGHTS message
2306 * and return the modified message to the caller upon
2307 * success. Otherwise, all other control messages are
2308 * returned unmodified to the caller. Note that we
2309 * only get into this loop if MSG_PEEK is not set.
2311 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2312 cmsg_type
== SCM_RIGHTS
) {
2314 * Release socket lock: see 3903171. This
2315 * would also allow more records to be appended
2316 * to the socket buffer. We still have SB_LOCK
2317 * set on it, so we can be sure that the head
2318 * of the mbuf chain won't change.
2320 socket_unlock(so
, 0);
2321 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2327 if (controlp
!= NULL
&& error
== 0) {
2329 controlp
= &(*controlp
)->m_next
;
2337 if (sb_rcv
->sb_mb
!= NULL
)
2338 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2344 if (!(flags
& MSG_PEEK
)) {
2346 * We get here because m points to an mbuf following
2347 * any MT_SONAME or MT_CONTROL mbufs which have been
2348 * processed above. In any case, m should be pointing
2349 * to the head of the mbuf chain, and the nextrecord
2350 * should be either NULL or equal to m->m_nextpkt.
2351 * See comments above about SB_LOCK.
2353 if (m
!= so
->so_rcv
.sb_mb
|| m
->m_nextpkt
!= nextrecord
)
2354 panic("soreceive: post-control !sync so=%p "
2355 "m=%p nextrecord=%p\n", so
, m
, nextrecord
);
2357 if (nextrecord
== NULL
)
2358 so
->so_rcv
.sb_lastrecord
= m
;
2361 if (type
== MT_OOBDATA
)
2364 if (!(flags
& MSG_PEEK
)) {
2365 so
->so_rcv
.sb_mb
= nextrecord
;
2366 SB_EMPTY_FIXUP(&so
->so_rcv
);
2369 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
2370 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
2375 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
2382 while (m
&& (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
2383 if (m
->m_type
== MT_OOBDATA
) {
2384 if (type
!= MT_OOBDATA
)
2386 } else if (type
== MT_OOBDATA
) {
2390 * Make sure to allways set MSG_OOB event when getting
2391 * out of band data inline.
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
		if (can_delay && len == m->m_len) {
			/*
			 * only delay the copy if we're consuming the
			 * mbuf and we're NOT in MSG_PEEK mode
			 * and we have enough data to make it worthwhile
			 * to drop and retake the lock... can_delay
			 * reflects the state of the 2 latter
			 * constraints; moff should always be zero
			 * in these cases
			 */
			delayed_copy_len += len;
		} else {
			if (delayed_copy_len) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);
				/*
				 * We can only get here if MSG_PEEK is not
				 * set; therefore, m should point at the
				 * head of the rcv queue.  If it doesn't,
				 * something drastically changed while we
				 * were out from behind the lock in
				 * sodelayed_copy, perhaps a RST on the
				 * stream.  In any event, the stream has been
				 * interrupted.  It's probably best just to
				 * return whatever data we've moved and let
				 * the caller sort it out.
				 */
				if (m != so->so_rcv.sb_mb) {
					break;
				}
			}
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t) + moff,
			    (int)len, uio);
			socket_lock(so, 0);

			uio_setresid(uio, (uio_resid(uio) - len));

		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					if (free_list == NULL)
						free_list = m;
					so->so_rcv.sb_mb = m = m->m_next;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}

		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					/*
					 * delay posting the actual event until
					 * after any delayed copy processing
					 */
					need_event = 1;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
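		/*
		 * Illustrative user-space sketch (not part of this file):
		 * with MSG_WAITALL the loop below keeps a stream-socket
		 * reader blocked until the full request is satisfied, so a
		 * short count normally means EOF, a signal, a timeout, or an
		 * error.  Names are hypothetical.
		 *
		 *	#include <sys/socket.h>
		 *
		 *	// Returns bytes read; may be short on EOF/signal/timeout.
		 *	static ssize_t
		 *	read_exact(int sock, void *buf, size_t len)
		 *	{
		 *		return (recv(sock, buf, len, MSG_WAITALL));
		 *	}
		 */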
		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				goto release;
			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if the socket buffer is empty,
			 * in order to avoid a false sleep.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
			    (((struct inpcb *)so->so_pcb)->inp_state !=
			    INPCB_STATE_DEAD))
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * We have to wait until after we get back from the
			 * sbwait to do the copy because we will drop the
			 * lock if we have enough data that has been
			 * delayed... by dropping the lock we open up a
			 * window allowing the netisr thread to process the
			 * incoming packets and to change the state of this
			 * socket... we're issuing the sbwait because the
			 * socket is empty and we're expecting the netisr
			 * thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then
			 * sbwait we could stall forever with packets sitting
			 * in the socket if no further packets arrive from
			 * the remote side.
			 *
			 * We want to copy before we've collected all the
			 * data to satisfy this request to allow the copy to
			 * overlap the incoming packet processing on an MP
			 * system.
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error)
					goto release;
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		panic("soreceive: after big while so=%p ref=%d on socket\n",
		    so, so->so_usecount);
#endif
	if (m && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0)
				(void) sbdroprecord(&so->so_rcv);
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if they receive less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;
	if ((flags & MSG_PEEK) == 0) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP().  Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}
	if (free_list != NULL) {
		m_freem_list((struct mbuf *)free_list);
		free_list = (struct mbuf *)0;
	}
	if (need_event)
		postevent(so, 0, EV_OOB);

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, 1);
		goto restart;
	}

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		panic("soreceive: release so=%p ref=%d on socket\n",
		    so, so->so_usecount);
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}
	if (free_list != NULL)
		m_freem_list((struct mbuf *)free_list);

	sbunlock(&so->so_rcv, 0);	/* will unlock socket */

	// LP64todo - fix this!
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return (error);
}
/*
 * Returns:	0			Success
 */
static int
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    int *resid)
{
	socket_unlock(so, 0);

	while (m && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
		m = m->m_next;
	}
	m_freem_list(*free_list);
	*free_list = (struct mbuf *)NULL;
/*
 * Returns:	0			Success
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???	[other protocol families]
 */
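/*
 * Illustrative user-space sketch (not part of this file): shutdown(2) maps
 * onto soshutdown()/soshutdownlock() below.  SHUT_WR half-closes the send
 * side (the peer sees EOF) while the read side stays usable, which is the
 * common way to signal "no more requests" on a stream connection.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void
 *	finish_sending(int sock)
 *	{
 *		char buf[512];
 *		ssize_t n;
 *
 *		(void) shutdown(sock, SHUT_WR);	// close the send side only
 *		while ((n = read(sock, buf, sizeof (buf))) > 0)
 *			;			// drain the peer's reply
 *	}
 */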
int
soshutdown(struct socket *so, int how)
{
	int error;

	socket_lock(so, 1);

	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
		error = ENOTCONN;
	} else {
		error = soshutdownlock(so, how);
	}

	socket_unlock(so, 1);

	return (error);
}

int
soshutdownlock(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error = 0;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
			error = ENOTCONN;
			goto done;
		}
		sorflush(so);
		postevent(so, 0, EV_RCLOSED);
	}
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
			error = ENOTCONN;
			goto done;
		}
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);
	}
done:
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
	return (error);
}
void
sorflush(struct socket *so)
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	struct sockbuf asb;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	sflt_notify(so, sock_evt_flush_read, NULL);

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAIT);

	socantrcvmore(so);
	sbunlock(sb, 1);

	selthreadclear(&sb->sb_sel);

	asb = *sb;

	bzero((caddr_t)sb, sizeof (*sb));
	sb->sb_so = so;	/* reestablish link to socket */
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	}
	if (asb.sb_flags & SB_DROP)
		sb->sb_flags |= SB_DROP;
	if (asb.sb_flags & SB_UNIX)
		sb->sb_flags |= SB_UNIX;
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}
	sbrelease(&asb);
}
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
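	/*
	 * Illustrative sketch (assumed, not part of this file): a
	 * protocol-level pr_ctloutput handler typically relies on the
	 * semantics described above, e.g. for a hypothetical int-sized
	 * option:
	 *
	 *	int optval;
	 *	error = sooptcopyin(sopt, &optval, sizeof (optval),
	 *	    sizeof (optval));
	 *	// EINVAL if the caller supplied fewer than sizeof (int)
	 *	// bytes; extra bytes beyond sizeof (int) are ignored.
	 *	if (error)
	 *		return (error);
	 */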
	if ((valsize = sopt->sopt_valsize) < minlen)
		return (EINVAL);
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
	return (0);
}
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
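/*
 * Illustrative user-space sketch (not part of this file): the timeval
 * copied in here normally comes from a setsockopt(2) call such as:
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	// A negative tv_sec or a tv_usec outside [0, 1000000) is rejected
 *	// with EDOM by the checks below.
 *	(void) setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 */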
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval	tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return (EINVAL);
		}
		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return (EDOM);
		}
		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval	tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return (EINVAL);
		}
		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
		}
#ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return (EDOM);
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return (0);
}
/*
 * Returns:	0			Success
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family;
 *		all <sf_setoption> returns depend on what the filter author
 *		causes their filter to return.
 */
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
	int	error = 0, optval;
	struct	linger l;
	struct	timeval tv;
	struct socket_filter_entry *filter;
	int filtered = 0;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	socket_lock(so, 1);

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
	    == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto bad;
	}

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}
	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_setoption) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_setoption(filter->sfe_cookie, so, sopt);
		}
	}

	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);

		if (error) {
			if (error == EJUSTRETURN)
				error = 0;
			goto bad;
		}
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
			if (error)
				goto bad;

			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;
		case SO_USELOOPBACK:
		case SO_WANTOOBFLAG:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;
		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv,
				    (u_int32_t) optval) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				if (sopt->sopt_name == SO_SNDBUF)
					so->so_snd.sb_flags |= SB_USRSIZE;
				else
					so->so_rcv.sb_flags |= SB_USRSIZE;
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;
		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error)
				goto bad;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;
		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof (nke),
			    sizeof (nke));
			if (error)
				goto bad;

			error = sflt_attach_private(so, NULL,
			    nke.nke_handle, 1);
			break;
		}
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_NOSIGPIPE;
			else
				so->so_flags &= ~SOF_NOSIGPIPE;
			break;
		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_NOADDRAVAIL;
			else
				so->so_flags &= ~SOF_NOADDRAVAIL;
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_REUSESHAREUID;
			else
				so->so_flags &= ~SOF_REUSESHAREUID;
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_NOTIFYCONFLICT:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto bad;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_NOTIFYCONFLICT;
			else
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			break;
#endif /* __APPLE_API_PRIVATE */

		case SO_RESTRICTIONS:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto bad;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
			    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));
			break;
		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0)
				goto bad;

			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
			    so, &extmac);
#endif /* MAC_SOCKET */
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			else
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			break;
#endif /* __APPLE_API_PRIVATE */

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval)
				so->so_flags |= SOF_BINDRANDOMPORT;
			else
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			break;
		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
			    sizeof(sonpx));
			if (error)
				goto bad;
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto bad;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				else
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
			}
			break;
		}

#if PKT_PRIORITY
		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error)
				goto bad;
			if (optval < SO_TC_BE || optval > SO_TC_VO) {
				error = EINVAL;
				goto bad;
			}
			so->so_traffic_class = optval;
			break;
		}
#endif /* PKT_PRIORITY */
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)(so, sopt));
		}
	}
bad:
	socket_unlock(so, 1);
	return (error);
}

/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;
	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
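	/*
	 * Illustrative user-space sketch (not part of this file): because
	 * the value may be truncated to the caller's buffer, getsockopt(2)
	 * callers should check the returned length.
	 *
	 *	#include <sys/socket.h>
	 *
	 *	int rcvbuf = 0;
	 *	socklen_t len = sizeof (rcvbuf);
	 *
	 *	if (getsockopt(sock, SOL_SOCKET, SO_RCVBUF,
	 *	    &rcvbuf, &len) == 0) {
	 *		// len now holds how many bytes were actually copied
	 *		// out, which may be smaller than the full value.
	 *	}
	 */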
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
{
	int			error;
	size_t			len;
	struct user64_timeval	tv64;
	struct user32_timeval	tv32;
	const void *		val;
	size_t			valsize;

	error = 0;
	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof(tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof(tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
		else
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
/*
 * Returns:	0			Success
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error = 0, optval;
	struct	linger l;
	struct	timeval tv;
	struct socket_filter_entry *filter;
	int filtered = 0;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}
	socket_lock(so, 1);

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_getoption) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(so);
				socket_unlock(so, 0);
			}
			error = filter->sfe_filter->sf_filter.
			    sf_getoption(filter->sfe_cookie, so, sopt);
		}
	}

	if (filtered != 0) {
		socket_lock(so, 0);
		sflt_unuse(so);

		if (error) {
			if (error == EJUSTRETURN)
				error = 0;
			socket_unlock(so, 1);
			return (error);
		}
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);
			return (error);
		} else {
			socket_unlock(so, 1);
			return (ENOPROTOOPT);
		}
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof (l));
			break;

		case SO_USELOOPBACK:
		case SO_WANTOOBFLAG:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof (optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;
		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						pkt_total += m1->m_len;
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;
		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;
#ifdef __APPLE_API_PRIVATE
		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;
#endif /* __APPLE_API_PRIVATE */

		case SO_RESTRICTIONS:
			optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
			    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);
			goto integer;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socket_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */
			break;

		case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socketpeer_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */
			break;
#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif /* __APPLE_API_PRIVATE */

		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof (struct so_np_extensions));
			break;
		}

#if PKT_PRIORITY
		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;
#endif /* PKT_PRIORITY */

		default:
			error = ENOPROTOOPT;
			break;
		}
		socket_unlock(so, 1);
		return (error);
	}
}
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size > MAX_SOOPTGETM_SIZE)
		return (EMSGSIZE);

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == 0)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == 0) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	int error;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* should have been allocated with enough space by ip6_sooptmcopyin() */
	if (m != NULL)
		panic("soopt_mcopyin");
	return (0);
}
/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;
	int error;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
}
int
sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
{
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return (revents);
}
int
soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
    __unused struct proc *p)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	struct sockbuf *sb;

	socket_lock(so, 1);

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
		socket_unlock(so, 1);
		return (1);
	}
#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		socket_unlock(so, 1);
		return (1);
	}

	if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
		sb->sb_flags |= SB_KNOTE;
	socket_unlock(so, 1);
	return (0);
}
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	if (so->so_options & SO_ACCEPTCONN) {
		int isempty;

		/*
		 * Radar 6615193: handle the listen case dynamically for the
		 * kqueue read filter.  This allows listen() to be called
		 * after registering the kqueue EVFILT_READ filter.
		 */
		kn->kn_data = so->so_qlen;
		isempty = ! TAILQ_EMPTY(&so->so_comp);
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

		return (isempty);
	}

	/* socket isn't a listener */

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (so->so_oobmark) {
		if (kn->kn_flags & EV_OOBAND) {
			kn->kn_data -= so->so_oobmark;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
		kn->kn_data = so->so_oobmark;
		kn->kn_flags |= EV_OOBAND;
	} else {
		if (so->so_state & SS_CANTRCVMORE) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
	}
	if (so->so_state & SS_RCVATMARK) {
		if (kn->kn_flags & EV_OOBAND) {
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
			return (1);
		}
		kn->kn_flags |= EV_OOBAND;
	} else if (kn->kn_flags & EV_OOBAND) {
		kn->kn_data = 0;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (0);
	}

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (1);
	}

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ((kn->kn_flags & EV_OOBAND) ||
	    kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
	    kn->kn_sdata : so->so_rcv.sb_lowat));
}
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (1);
	}
	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (1);
	}
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
		return (0);
	}
	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
}
#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
		    (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return lock_history_str;
}
int
socket_lock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount)
			so->so_usecount++;
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	}

	return (error);
}
int
socket_unlock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL)
		panic("socket_unlock null so_proto so=%p\n", so);

	if (so && so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0)
				panic("socket_unlock: bad refcount=%d so=%p "
				    "(%d, %d, %d) lrh=%s", so->so_usecount, so,
				    so->so_proto->pr_domain->dom_family,
				    so->so_type, so->so_proto->pr_protocol,
				    solockhistory_nr(so));

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}

	return (error);
}
/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set)
		so->so_flags |= SOF_MULTIPAGES;
	else
		so->so_flags &= ~SOF_MULTIPAGES;
}
int
so_isdstlocal(struct socket *so) {

	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (so->so_proto->pr_domain->dom_family == AF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
		return in6addr_local(&inp->in6p_faddr);